Index: head/sys/dev/cxgbe/crypto/t4_kern_tls.c
===================================================================
--- head/sys/dev/cxgbe/crypto/t4_kern_tls.c	(revision 360578)
+++ head/sys/dev/cxgbe/crypto/t4_kern_tls.c	(revision 360579)
@@ -1,2398 +1,2398 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018-2019 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/ktls.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp_var.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "t4_l2t.h"
 #include "t4_clip.h"
 #include "t4_mp_ring.h"
 #include "crypto/t4_crypto.h"
 
 #if defined(INET) || defined(INET6)
 
 #define SALT_SIZE		4
 
 #define GCM_TAG_SIZE			16
 #define TLS_HEADER_LENGTH		5
 
 #define	TLS_KEY_CONTEXT_SZ	roundup2(sizeof(struct tls_keyctx), 32)
 
 /*
  * Pair of 32-bit big-endian command words precomputed per connection.
  * cxgbe_tls_tag_alloc() builds them from the V_SCMD_* field macros.
  */
 struct tls_scmd {
 	/* Sequence-number control, protocol, cipher/auth modes, IV size/count. */
 	__be32 seqno_numivs;
 	/* IV generation control, TLS fragment enable, inline-key flag. */
 	__be32 ivgen_hdrlen;
 };
 
 /*
  * Work request used by ktls_setup_keys() to write a key context into
  * adapter memory.  The key context itself follows this header as
  * immediate data.
  */
 struct tls_key_req {
 	/* FW_ULPTX_WR */
 	__be32 wr_hi;
 	__be32 wr_mid;
 	__be32 ftid;
 	__u8   reneg_to_write_rx;	/* KEY_* operation type */
 	__u8   protocol;
 	__be16 mfs;

 	/* master command */
 	__be32 cmd;
 	__be32 len16;		/* command length */
 	__be32 dlen;		/* data length in 32-byte units */
 	__be32 kaddr;

 	/* sub-command */
 	__be32 sc_more;
 	__be32 sc_len;
 } __packed;
 
 /*
  * Transmit key context: a fixed header followed by key material.
  * ktls_setup_keys() stores the cipher key at the start of keys.edkey and
  * places the derived hash state directly after the cipher key.
  */
 struct tls_keyctx {
 	struct tx_keyctx_hdr {
 		__u8   ctxlen;		/* tx_key_info_size in 16-byte units */
 		__u8   r2;
 		__be16 dualck_to_txvalid;	/* TLS_KEYCTX_TX_WR_* fields */
 		__u8   txsalt[4];
 		__be64 r5;
 	} txhdr;
 	struct keys {
 		__u8   edkey[32];
 		__u8   ipad[64];
 		__u8   opad[64];
 	} keys;
 };
 
 /*
  * Bit fields of tx_keyctx_hdr.dualck_to_txvalid (composed in
  * ktls_setup_keys() and converted to big-endian there).  For each field:
  * S_ is the bit offset, M_ the unshifted mask, V_() places a value in the
  * field, G_() extracts it, and F_ (single-bit fields only) is the field
  * set to 1.
  */
 #define S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 11
 #define M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 0x1
 #define V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT)
 #define G_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) & \
      M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT)
 #define F_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT \
     V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1U)
 
 #define S_TLS_KEYCTX_TX_WR_SALT_PRESENT 10
 #define M_TLS_KEYCTX_TX_WR_SALT_PRESENT 0x1
 #define V_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_SALT_PRESENT)
 #define G_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_SALT_PRESENT) & \
      M_TLS_KEYCTX_TX_WR_SALT_PRESENT)
 #define F_TLS_KEYCTX_TX_WR_SALT_PRESENT \
     V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1U)
 
 /* Cipher key size code (CHCR_KEYCTX_CIPHER_KEY_SIZE_*). */
 #define S_TLS_KEYCTX_TX_WR_TXCK_SIZE 6
 #define M_TLS_KEYCTX_TX_WR_TXCK_SIZE 0xf
 #define V_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXCK_SIZE)
 #define G_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXCK_SIZE) & \
      M_TLS_KEYCTX_TX_WR_TXCK_SIZE)
 
 /* MAC key size code (CHCR_KEYCTX_MAC_KEY_SIZE_*). */
 #define S_TLS_KEYCTX_TX_WR_TXMK_SIZE 2
 #define M_TLS_KEYCTX_TX_WR_TXMK_SIZE 0xf
 #define V_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXMK_SIZE)
 #define G_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXMK_SIZE) & \
      M_TLS_KEYCTX_TX_WR_TXMK_SIZE)
 
 #define S_TLS_KEYCTX_TX_WR_TXVALID   0
 #define M_TLS_KEYCTX_TX_WR_TXVALID   0x1
 #define V_TLS_KEYCTX_TX_WR_TXVALID(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXVALID)
 #define G_TLS_KEYCTX_TX_WR_TXVALID(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXVALID) & M_TLS_KEYCTX_TX_WR_TXVALID)
 #define F_TLS_KEYCTX_TX_WR_TXVALID   V_TLS_KEYCTX_TX_WR_TXVALID(1U)
 
 /*
  * Key Context Programming Operation type.  The value is stored in
  * tls_key_req.reneg_to_write_rx; ktls_setup_keys() uses KEY_WRITE_TX.
  */
 #define KEY_WRITE_RX			0x1
 #define KEY_WRITE_TX			0x2
 #define KEY_DELETE_RX			0x4
 #define KEY_DELETE_TX			0x8
 
 /*
  * Per-connection state for a kernel TLS transmit session.
  *
  * The send tag must remain the first member: mst_to_tls() converts a
  * send tag pointer to a tlspcb pointer with a plain cast.
  */
 struct tlspcb {
 	struct cxgbe_snd_tag com;
 	struct vi_info *vi;	/* virtual interface */
 	struct adapter *sc;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	int tid;		/* Connection identifier */
 
 	int tx_key_addr;	/* key context address, -1 if none allocated */
 	bool inline_key;	/* key context sent as immediate data */
 	bool using_timestamps;	/* connection requested TCP timestamps */
 	unsigned char enc_mode;	/* SCMD_CIPH_MODE_* */
 
 	struct tls_scmd scmd0;		/* for full TLS records */
 	struct tls_scmd scmd0_short;	/* for partial TLS records */
 
 	/* Key context header plus cipher key plus hash/MAC material. */
 	unsigned int tx_key_info_size;
 
 	uint32_t prev_seq;
 	uint32_t prev_ack;
 	uint32_t prev_tsecr;
 	uint16_t prev_win;
 	uint16_t prev_mss;
 
 	/* Only used outside of setup and teardown when using inline keys. */
 	struct tls_keyctx keyctx;
 
 	/* Fields only used during setup and teardown. */
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 
 	unsigned char auth_mode;	/* SCMD_AUTH_MODE_* */
 	unsigned char hmac_ctrl;	/* SCMD_HMAC_CTRL_* */
 	unsigned char mac_first;	/* 1 for AES-CBC, 0 for AES-GCM */
 	unsigned char iv_size;		/* in units of 2 bytes */
 
 	unsigned int frag_size;		/* TLS max_frame_len */
 	unsigned int cipher_secret_size;	/* cipher key length in bytes */
 	int proto_ver;			/* SCMD_PROTO_VERSION_* */
 
 	/* Set while waiting for the reply to the active open request. */
 	bool open_pending;
 };
 
 static int ktls_setup_keys(struct tlspcb *tlsp,
     const struct ktls_session *tls, struct sge_txq *txq);
 
 /*
  * Convert a generic send tag into the TLS connection state embedding it.
  * Relies on the cxgbe send tag being the first member of struct tlspcb.
  */
 static inline struct tlspcb *
 mst_to_tls(struct m_snd_tag *t)
 {
 	struct cxgbe_snd_tag *cst;

 	cst = mst_to_cst(t);
 	return ((struct tlspcb *)cst);
 }
 
 /* XXX: There are similar versions of these two in tom/t4_tls.c. */
 /*
  * Reserve TLS_KEY_CONTEXT_SZ bytes from the adapter's key map without
  * sleeping.  Returns the allocated address, or -1 if the allocation
  * failed.
  */
 static int
 get_new_keyid(struct tlspcb *tlsp)
 {
 	vmem_addr_t keyaddr;
 	int rc;

 	rc = vmem_alloc(tlsp->sc->key_map, TLS_KEY_CONTEXT_SZ,
 	    M_NOWAIT | M_FIRSTFIT, &keyaddr);
 	if (rc == 0)
 		return (keyaddr);
 	return (-1);
 }
 
 static void
 free_keyid(struct tlspcb *tlsp, int keyid)
 {
 
 	CTR3(KTR_CXGBE, "%s: tid %d key addr %#x", __func__, tlsp->tid, keyid);
 	vmem_free(tlsp->sc->key_map, keyid, TLS_KEY_CONTEXT_SZ);
 }
 
 /*
  * Allocate a zeroed tlspcb bound to the given interface and VI.
  *
  * 'flags' is passed through to malloc(9) in addition to M_ZERO.  The
  * connection id and key address start out as -1 (unassigned) and the
  * control queue is the one belonging to the VI's port.  Returns NULL if
  * the allocation fails.
  */
 static struct tlspcb *
 alloc_tlspcb(struct ifnet *ifp, struct vi_info *vi, int flags)
 {
 	struct tlspcb *tlsp;
 	struct port_info *pi;
 	struct adapter *sc;

 	tlsp = malloc(sizeof(*tlsp), M_CXGBE, M_ZERO | flags);
 	if (tlsp == NULL)
 		return (NULL);

 	pi = vi->pi;
 	sc = pi->adapter;

 	cxgbe_snd_tag_init(&tlsp->com, ifp, IF_SND_TAG_TYPE_TLS);
 	tlsp->sc = sc;
 	tlsp->vi = vi;
 	tlsp->ctrlq = &sc->sge.ctrlq[pi->port_id];
 	tlsp->tx_key_addr = -1;
 	tlsp->tid = -1;

 	return (tlsp);
 }
 
 /*
  * Derive the per-connection crypto parameters (protocol version, cipher
  * and authentication modes, IV size, key context size and fragment size)
  * from the KTLS session parameters.
  *
  * The caller is expected to have validated the cipher and authentication
  * algorithms already (cxgbe_tls_tag_alloc() does so); an unsupported
  * authentication algorithm for AES-CBC is treated as a fatal programming
  * error rather than silently sizing the key context from an
  * uninitialized value.
  */
 static void
 init_ktls_key_params(struct tlspcb *tlsp, const struct ktls_session *tls)
 {
 	int mac_key_size;
 
 	if (tls->params.tls_vminor == TLS_MINOR_VER_ONE)
 		tlsp->proto_ver = SCMD_PROTO_VERSION_TLS_1_1;
 	else
 		tlsp->proto_ver = SCMD_PROTO_VERSION_TLS_1_2;
 	tlsp->cipher_secret_size = tls->params.cipher_key_len;
 	tlsp->tx_key_info_size = sizeof(struct tx_keyctx_hdr) +
 	    tlsp->cipher_secret_size;
 	if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) {
 		tlsp->auth_mode = SCMD_AUTH_MODE_GHASH;
 		tlsp->enc_mode = SCMD_CIPH_MODE_AES_GCM;
 		tlsp->iv_size = 4;
 		tlsp->mac_first = 0;
 		tlsp->hmac_ctrl = SCMD_HMAC_CTRL_NOP;
 		/* The GMAC hash state follows the cipher key. */
 		tlsp->tx_key_info_size += GMAC_BLOCK_LEN;
 	} else {
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			mac_key_size = roundup2(SHA1_HASH_LEN, 16);
 			tlsp->auth_mode = SCMD_AUTH_MODE_SHA1;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			mac_key_size = SHA2_256_HASH_LEN;
 			tlsp->auth_mode = SCMD_AUTH_MODE_SHA256;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			/* SHA-384 reserves a SHA-512 sized digest slot. */
 			mac_key_size = SHA2_512_HASH_LEN;
 			tlsp->auth_mode = SCMD_AUTH_MODE_SHA512_384;
 			break;
 		default:
 			/*
 			 * Without this, mac_key_size would be read
 			 * uninitialized below.
 			 */
 			panic("bad auth algorithm");
 		}
 		tlsp->enc_mode = SCMD_CIPH_MODE_AES_CBC;
 		tlsp->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		tlsp->mac_first = 1;
 		tlsp->hmac_ctrl = SCMD_HMAC_CTRL_NO_TRUNC;
 		/* Room for two MAC-key-sized digests (ipad and opad). */
 		tlsp->tx_key_info_size += mac_key_size * 2;
 	}
 
 	tlsp->frag_size = tls->params.max_frame_len;
 }
 
 /* Size of the active open request CPL for the given address family. */
 static int
 ktls_act_open_cpl_size(bool isipv6)
 {

 	return (isipv6 ? sizeof(struct cpl_t6_act_open_req6) :
 	    sizeof(struct cpl_t6_act_open_req));
 }
 
 static void
 mk_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp,
     struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct cpl_t6_act_open_req *cpl6;
 	struct cpl_act_open_req *cpl;
 	uint64_t options;
 	int qid_atid;
 
 	cpl6 = dst;
 	cpl = (struct cpl_act_open_req *)cpl6;
 	INIT_TP_WR(cpl6, 0);
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
 		qid_atid));
 	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
 	    &cpl->peer_ip, &cpl->peer_port);
 
 	options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	options |= F_NON_OFFLOAD;
 	cpl->opt0 = htobe64(options);
 
 	options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		options |= F_TSTAMPS_EN;
 	cpl->opt2 = htobe32(options);
 }
 
 static void
 mk_ktls_act_open_req6(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct cpl_t6_act_open_req6 *cpl6;
 	struct cpl_act_open_req6 *cpl;
 	uint64_t options;
 	int qid_atid;
 
 	cpl6 = dst;
 	cpl = (struct cpl_act_open_req6 *)cpl6;
 	INIT_TP_WR(cpl6, 0);
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
 		qid_atid));
 	cpl->local_port = inp->inp_lport;
 	cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 	cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 	cpl->peer_port = inp->inp_fport;
 	cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 	cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
 
 	options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	options |= F_NON_OFFLOAD;
 	cpl->opt0 = htobe64(options);
 
 	options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		options |= F_TSTAMPS_EN;
 	cpl->opt2 = htobe32(options);
 }
 
 /*
  * Build and transmit the active open request for a new TLS connection on
  * the control queue.
  *
  * For IPv6 a reference on the local address is taken first and stored in
  * tlsp->ce.  On success open_pending is set before the request is handed
  * to the hardware and 0 is returned.  Returns ENOENT if the local IPv6
  * address could not be held and ENOMEM if no work request could be
  * allocated.
  */
 static int
 send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid)
 {
 	const bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	struct wrqe *wr;
 	void *cpl;

 	if (isipv6) {
 		tlsp->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
 		if (tlsp->ce == NULL)
 			return (ENOENT);
 	}

 	wr = alloc_wrqe(ktls_act_open_cpl_size(isipv6), tlsp->ctrlq);
 	if (wr == NULL) {
 		CTR2(KTR_CXGBE, "%s: atid %d failed to alloc WR", __func__,
 		    atid);
 		return (ENOMEM);
 	}

 	cpl = wrtod(wr);
 	if (!isipv6)
 		mk_ktls_act_open_req(sc, vi, inp, tlsp, atid, cpl);
 	else
 		mk_ktls_act_open_req6(sc, vi, inp, tlsp, atid, cpl);

 	tlsp->open_pending = true;
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 static int
 ktls_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 	u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
 	struct tlspcb *tlsp = lookup_atid(sc, atid);
 	struct inpcb *inp = tlsp->inp;
 
 	CTR3(KTR_CXGBE, "%s: atid %d status %d", __func__, atid, status);
 	free_atid(sc, atid);
 	if (status == 0)
 		tlsp->tid = GET_TID(cpl);
 
 	INP_WLOCK(inp);
 	tlsp->open_pending = false;
 	wakeup(tlsp);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 _Static_assert((LEN__SET_TCB_FIELD_ULP + sizeof(struct ulptx_idata)) % 16 == 0,
     "CPL_SET_TCB_FIELD ULP command not 16-byte aligned");
 
 /*
  * Write one CPL_SET_TCB_FIELD wrapped as a ULP_TXPKT command at 'dst'.
  *
  * Layout: ULP_TXPKT header, an immediate-data sub-command header, the
  * CPL itself (no reply requested), and a trailing NOOP sub-command.  The
  * CPL updates TCB word 'word' of this connection's tid with 'val' under
  * 'mask'.
  */
 static void
 write_set_tcb_field_ulp(struct tlspcb *tlsp, void *dst, struct sge_txq *txq,
     uint16_t word, uint64_t mask, uint64_t val)
 {
 	struct ulp_txpkt *ulpmc = dst;
 	struct ulptx_idata *sc_imm = (struct ulptx_idata *)(ulpmc + 1);
 	struct cpl_set_tcb_field_core *req =
 	    (struct cpl_set_tcb_field_core *)(sc_imm + 1);
 	struct ulptx_idata *sc_noop = (struct ulptx_idata *)(req + 1);

 	/* ULP_TXPKT */
 	ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

 	/* ULPTX_IDATA sub-command */
 	sc_imm->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	sc_imm->len = htobe32(sizeof(*req));

 	/* CPL_SET_TCB_FIELD */
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tlsp->tid));
 	req->reply_ctrl = htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);

 	/* ULPTX_NOOP */
 	sc_noop->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	sc_noop->len = htobe32(0);
 }
 
 static int
 ktls_set_tcb_fields(struct tlspcb *tlsp, struct tcpcb *tp, struct sge_txq *txq)
 {
 	struct fw_ulptx_wr *wr;
 	struct mbuf *m;
 	char *dst;
 	void *items[1];
 	int error, len;
 
 	len = sizeof(*wr) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	m = alloc_wr_mbuf(len, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com.com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 
 	/* FW_ULPTX_WR */
 	wr = mtod(m, void *);
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(len / 16));
 	wr->cookie = 0;
 	dst = (char *)(wr + 1);
 
         /* Clear TF_NON_OFFLOAD and set TF_CORE_BYPASS */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_T_FLAGS,
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1) | V_TF_NON_OFFLOAD(1)),
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1)));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/* Clear the SND_UNA_RAW, SND_NXT_RAW, and SND_MAX_RAW offsets. */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_UNA_RAW,
 	    V_TCB_SND_NXT_RAW(M_TCB_SND_NXT_RAW) |
 	    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 	    V_TCB_SND_NXT_RAW(0) | V_TCB_SND_UNA_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_MAX_RAW,
 	    V_TCB_SND_MAX_RAW(M_TCB_SND_MAX_RAW), V_TCB_SND_MAX_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_TIMESTAMP_OFFSET,
 		    V_TCB_TIMESTAMP_OFFSET(M_TCB_TIMESTAMP_OFFSET),
 		    V_TCB_TIMESTAMP_OFFSET(tp->ts_offset >> 28));
 		dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	}
 
 	KASSERT(dst - (char *)wr == len, ("%s: length mismatch", __func__));
 
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	return (error);
 }
 
 /*
  * Allocate a TLS send tag for the session described by 'params' and
  * return it in '*pt'.
  *
  * Steps, in order:
  *  - validate the TLS version and the cipher/authentication parameters;
  *  - allocate the tlspcb, an atid, and (unless inline keys are in use or
  *    the allocation fails) a key context address;
  *  - with the inpcb read-locked, send the active open request and sleep
  *    until ktls_act_open_rpl() clears open_pending;
  *  - pick a transmit queue, program the TCB fields, and set up the keys;
  *  - precompute the SCMD words used for full and partial TLS records.
  *
  * Returns 0 on success.  On failure an errno value is returned, the atid
  * is freed if it is still owned here, and the reference on the tag is
  * dropped.
  */
 int
 cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	const struct ktls_session *tls;
 	struct tlspcb *tlsp;
 	struct adapter *sc;
 	struct vi_info *vi;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct sge_txq *txq;
 	int atid, error, keyid;
 
 	tls = params->tls.tls;
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO)
 		return (EPROTONOSUPPORT);
 
 	/* Sanity check values in *tls. */
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
 			return (EPROTONOSUPPORT);
 		}
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		/* For GCM the supplied IV is the salt. */
 		if (tls->params.iv_len != SALT_SIZE)
 			return (EINVAL);
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	vi = ifp->if_softc;
 	sc = vi->pi->adapter;
 
 	tlsp = alloc_tlspcb(ifp, vi, M_WAITOK);
 
 	atid = alloc_atid(sc, tlsp);
 	if (atid < 0) {
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/*
 	 * Use an inline key either because the adapter is configured
 	 * for inline keys or because no key address could be allocated.
 	 */
 	if (sc->tlst.inline_keys)
 		keyid = -1;
 	else
 		keyid = get_new_keyid(tlsp);
 	if (keyid < 0) {
 		CTR2(KTR_CXGBE, "%s: atid %d using immediate key ctx", __func__,
 		    atid);
 		tlsp->inline_key = true;
 	} else {
 		tlsp->tx_key_addr = keyid;
 		CTR3(KTR_CXGBE, "%s: atid %d allocated TX key addr %#x",
 		    __func__,
 		    atid, tlsp->tx_key_addr);
 	}
 
 	inp = params->tls.inp;
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 	tlsp->inp = inp;
 
 	tp = inp->inp_ppcb;
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		tlsp->using_timestamps = true;
 		/*
 		 * ktls_set_tcb_fields() only programs ts_offset >> 28,
 		 * so an offset with any of the low 28 bits set is
 		 * rejected.
 		 */
 		if ((tp->ts_offset & 0xfffffff) != 0) {
 			INP_RUNLOCK(inp);
 			error = EINVAL;
 			goto failed;
 		}
 	} else
 		tlsp->using_timestamps = false;
 
 	error = send_ktls_act_open_req(sc, vi, inp, tlsp, atid);
 	if (error) {
 		INP_RUNLOCK(inp);
 		goto failed;
 	}
 
 	/* Wait for reply to active open. */
 	CTR2(KTR_CXGBE, "%s: atid %d sent CPL_ACT_OPEN_REQ", __func__,
 	    atid);
 	while (tlsp->open_pending) {
 		/*
 		 * XXX: PCATCH?  We would then have to discard the PCB
 		 * when the completion CPL arrived.
 		 */
 		error = rw_sleep(tlsp, &inp->inp_lock, 0, "t6tlsop", 0);
 	}
 
 	/* The reply handler has already freed the atid. */
 	atid = -1;
 	if (tlsp->tid < 0) {
 		INP_RUNLOCK(inp);
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/* The connection may have been dropped while sleeping. */
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 
 	/*
 	 * Connections with a flow id are spread over the VI's queues
 	 * past the rsrv_noflowq reserved ones; others use the first
 	 * queue.
 	 */
 	txq = &sc->sge.txq[vi->first_txq];
 	if (inp->inp_flowtype != M_HASHTYPE_NONE)
 		txq += ((inp->inp_flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 	tlsp->txq = txq;
 
 	error = ktls_set_tcb_fields(tlsp, tp, txq);
 	INP_RUNLOCK(inp);
 	if (error)
 		goto failed;
 
 	init_ktls_key_params(tlsp, tls);
 
 	error = ktls_setup_keys(tlsp, tls, txq);
 	if (error)
 		goto failed;
 
 	/* The SCMD fields used when encrypting a full TLS record. */
 	tlsp->scmd0.seqno_numivs = htobe32(V_SCMD_SEQ_NO_CTRL(3) |
 	    V_SCMD_PROTO_VERSION(tlsp->proto_ver) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((tlsp->mac_first == 0)) |
 	    V_SCMD_CIPH_MODE(tlsp->enc_mode) |
 	    V_SCMD_AUTH_MODE(tlsp->auth_mode) |
 	    V_SCMD_HMAC_CTRL(tlsp->hmac_ctrl) |
 	    V_SCMD_IV_SIZE(tlsp->iv_size) | V_SCMD_NUM_IVS(1));
 
 	tlsp->scmd0.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0);
 	if (tlsp->inline_key)
 		tlsp->scmd0.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 	tlsp->scmd0.ivgen_hdrlen = htobe32(tlsp->scmd0.ivgen_hdrlen);
 
 	/*
 	 * The SCMD fields used when encrypting a partial TLS record
 	 * (no trailer and possibly a truncated payload).
 	 */
 	tlsp->scmd0_short.seqno_numivs = V_SCMD_SEQ_NO_CTRL(0) |
 	    V_SCMD_PROTO_VERSION(SCMD_PROTO_VERSION_GENERIC) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((tlsp->mac_first == 0)) |
 	    V_SCMD_AUTH_MODE(SCMD_AUTH_MODE_NOP) |
 	    V_SCMD_HMAC_CTRL(SCMD_HMAC_CTRL_NOP) |
 	    V_SCMD_IV_SIZE(AES_BLOCK_LEN / 2) | V_SCMD_NUM_IVS(0);
 	/* Partial GCM records are encrypted in CTR mode without auth. */
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(SCMD_CIPH_MODE_AES_CTR);
 	else
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(tlsp->enc_mode);
 	tlsp->scmd0_short.seqno_numivs =
 	    htobe32(tlsp->scmd0_short.seqno_numivs);
 
 	/*
 	 * NOTE(review): unlike scmd0.ivgen_hdrlen, this word is left in
 	 * host byte order here; presumably the consumer converts it
 	 * after merging in more fields -- confirm against the user of
 	 * scmd0_short.
 	 */
 	tlsp->scmd0_short.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0) |
 	    V_SCMD_AADIVDROP(1);
 	if (tlsp->inline_key)
 		tlsp->scmd0_short.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 
 	/* Account for the new session in the per-queue statistics. */
 	TXQ_LOCK(txq);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		txq->kern_tls_gcm++;
 	else
 		txq->kern_tls_cbc++;
 	TXQ_UNLOCK(txq);
 	*pt = &tlsp->com.com;
 	return (0);
 
 failed:
 	if (atid >= 0)
 		free_atid(sc, atid);
 	m_snd_tag_rele(&tlsp->com.com);
 	return (error);
 }
 
 /*
  * Build the transmit key context in tlsp->keyctx and, for connections
  * that do not use an inline key, queue a work request on 'txq' that
  * writes a copy of the key context to adapter memory at
  * tlsp->tx_key_addr.
  *
  * Must be called after init_ktls_key_params() since it relies on
  * cipher_secret_size, enc_mode, auth_mode and tx_key_info_size.
  *
  * Returns 0 on success, ENOMEM if the work request mbuf cannot be
  * allocated, or the error from mp_ring_enqueue().  Panics on a key size
  * or authentication mode it does not recognize.
  */
 static int
 ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls,
     struct sge_txq *txq)
 {
 	struct auth_hash *axf;
 	int error, keyid, kwrlen, kctxlen, len;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 	void *items[1], *key;
 	struct tx_keyctx_hdr *khdr;
 	unsigned int ck_size, mk_size, partial_digest_len;
 	struct mbuf *m;
 
 	/*
 	 * Store the salt and keys in the key context.  For
 	 * connections with an inline key, this key context is passed
 	 * as immediate data in each work request.  For connections
 	 * storing the key in DDR, a work request is used to store a
 	 * copy of the key context in DDR.
 	 */
 	kctx = &tlsp->keyctx;
 	khdr = &kctx->txhdr;
 
 	/* Translate the cipher key length into the key context size code. */
 	switch (tlsp->cipher_secret_size) {
 	case 128 / 8:
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_128;
 		break;
 	case 192 / 8:
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_192;
 		break;
 	case 256 / 8:
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256;
 		break;
 	default:
 		panic("bad key size");
 	}
 	/* axf and partial_digest_len are only used for the HMAC (CBC) case. */
 	axf = NULL;
 	partial_digest_len = 0;
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_512;
 	else {
 		switch (tlsp->auth_mode) {
 		case SCMD_AUTH_MODE_SHA1:
 			axf = &auth_hash_hmac_sha1;
 			mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_160;
 			partial_digest_len = SHA1_HASH_LEN;
 			break;
 		case SCMD_AUTH_MODE_SHA256:
 			axf = &auth_hash_hmac_sha2_256;
 			mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_256;
 			partial_digest_len = SHA2_256_HASH_LEN;
 			break;
 		case SCMD_AUTH_MODE_SHA512_384:
 			axf = &auth_hash_hmac_sha2_384;
 			mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_512;
 			partial_digest_len = SHA2_512_HASH_LEN;
 			break;
 		default:
 			panic("bad auth mode");
 		}
 	}
 
 	/* The context length is expressed in 16-byte units. */
 	khdr->ctxlen = (tlsp->tx_key_info_size >> 4);
 	khdr->dualck_to_txvalid = V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
 	    V_TLS_KEYCTX_TX_WR_TXCK_SIZE(ck_size) |
 	    V_TLS_KEYCTX_TX_WR_TXMK_SIZE(mk_size) |
 	    V_TLS_KEYCTX_TX_WR_TXVALID(1);
 	if (tlsp->enc_mode != SCMD_CIPH_MODE_AES_GCM)
 		khdr->dualck_to_txvalid |= V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1);
 	khdr->dualck_to_txvalid = htobe16(khdr->dualck_to_txvalid);
 	/* Cipher key first, then the hash state right after it. */
 	key = kctx->keys.edkey;
 	memcpy(key, tls->params.cipher_key, tls->params.cipher_key_len);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 		memcpy(khdr->txsalt, tls->params.iv, SALT_SIZE);
 		t4_init_gmac_hash(tls->params.cipher_key,
 		    tls->params.cipher_key_len,
 		    (char *)key + tls->params.cipher_key_len);
 	} else {
 		t4_init_hmac_digest(axf, partial_digest_len,
 		    tls->params.auth_key, tls->params.auth_key_len,
 		    (char *)key + tls->params.cipher_key_len);
 	}
 
 	/* Inline keys travel with each work request; nothing to write. */
 	if (tlsp->inline_key)
 		return (0);
 
 	keyid = tlsp->tx_key_addr;
 
 	/* Populate key work request. */
 	kwrlen = sizeof(*kwr);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = kwrlen + kctxlen;
 
         m = alloc_wr_mbuf(len, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	/* The work request holds a reference on the send tag. */
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com.com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	kwr = mtod(m, void *);
 	memset(kwr, 0, len);
 
 	kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) |
 	    F_FW_WR_ATOMIC);
 	kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)));
 	kwr->protocol = tlsp->proto_ver;
 	kwr->mfs = htons(tlsp->frag_size);
 	kwr->reneg_to_write_rx = KEY_WRITE_TX;
 
 	/* master command */
 	kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 	    V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1));
 	/* Data length and destination address are in 32-byte units. */
 	kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5));
 	kwr->len16 = htobe32((tlsp->tid << 8) |
 	    DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16));
 	kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5));
 
 	/* sub command */
 	kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	kwr->sc_len = htobe32(kctxlen);
 
 	/* The key context is the immediate data following the header. */
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memcpy(kctx, &tlsp->keyctx, sizeof(*kctx));
 
 	/*
 	 * Place the key work request in the transmit queue.  It
 	 * should be sent to the NIC before any TLS packets using this
 	 * session.
 	 */
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	else
 		CTR2(KTR_CXGBE, "%s: tid %d sent key WR", __func__, tlsp->tid);
 	return (error);
 }
 
 /*
  * Size of the fixed portion of a TLS work request: the headers common to
  * every request plus either the inline key context or the sub-commands
  * used to fetch the key from adapter memory.
  */
 static u_int
 ktls_base_wr_size(struct tlspcb *tlsp)
 {
 	u_int len;

 	/* Headers present regardless of where the key lives. */
 	len = sizeof(struct fw_ulptx_wr);
 	len += sizeof(struct ulp_txpkt);
 	len += sizeof(struct ulptx_idata);
 	len += sizeof(struct cpl_tx_sec_pdu);
 	len += sizeof(struct cpl_tx_data);

 	if (!tlsp->inline_key) {
 		/* Memory-read sub-command plus a following idata header. */
 		len += sizeof(struct ulptx_sc_memrd);
 		len += sizeof(struct ulptx_idata);
 	} else
 		len += tlsp->tx_key_info_size;

 	return (len);
 }
 
 /*
  * How many bytes of TCP payload to send for a given TLS record.
  *
  * The mbuf's data offset plus its length gives the end of the requested
  * range within the record.  Requests ending exactly at the end of the
  * record are sent whole.  Otherwise the range is trimmed so that none
  * of the trailer is sent and, for AES-CBC, so that the ciphertext after
  * the TLS header is a whole number of AES blocks.
  */
 static u_int
 ktls_tcp_payload_length(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct tls_record_layer *hdr;
 	u_int plen, mlen;
 
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	hdr = (void *)m_tls->m_epg_hdr;
 	/* tls_length excludes the TLS_HEADER_LENGTH byte record header. */
 	plen = ntohs(hdr->tls_length);
 
 	/*
 	 * What range of the TLS record is the mbuf requesting to be
 	 * sent.
 	 */
 	mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 
 	/* Always send complete records. */
 	if (mlen == TLS_HEADER_LENGTH + plen)
 		return (mlen);
 
 	/*
 	 * If the host stack has asked to send part of the trailer,
 	 * trim the length to avoid sending any of the trailer.  There
 	 * is no way to send a partial trailer currently.
 	 */
-	if (mlen > TLS_HEADER_LENGTH + plen - m_tls->m_ext_pgs.trail_len)
-		mlen = TLS_HEADER_LENGTH + plen - m_tls->m_ext_pgs.trail_len;
+	if (mlen > TLS_HEADER_LENGTH + plen - m_tls->m_epg_trllen)
+		mlen = TLS_HEADER_LENGTH + plen - m_tls->m_epg_trllen;
 
 
 	/*
 	 * For AES-CBC adjust the ciphertext length for the block
 	 * size.
 	 */
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    mlen > TLS_HEADER_LENGTH) {
 		mlen = TLS_HEADER_LENGTH + rounddown(mlen - TLS_HEADER_LENGTH,
 		    AES_BLOCK_LEN);
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d short TLS record (%u vs %u)",
 	    __func__, tlsp->tid, mlen, TLS_HEADER_LENGTH + plen);
 #endif
 	return (mlen);
 }
 
 /*
  * For a "short" TLS record, determine the offset into the TLS record
  * payload to send.  This offset does not include the TLS header, but
  * a non-zero offset implies that a header will not be sent.
  *
  * Only AES-GCM ever yields a non-zero offset, and that offset is rounded
  * down to a multiple of the AES block size.  All other cases, including
  * requests that start within the record header, return 0.
  */
 static u_int
 ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct tls_record_layer *hdr;
 	u_int offset, plen;
 #ifdef INVARIANTS
 	u_int mlen;
 #endif
 
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = ntohs(hdr->tls_length);
 #ifdef INVARIANTS
 	/* Callers must only ask about records that are not sent in full. */
 	mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 	MPASS(mlen < TLS_HEADER_LENGTH + plen);
 #endif
-	if (mtod(m_tls, vm_offset_t) <= m_tls->m_ext_pgs.hdr_len)
+	if (mtod(m_tls, vm_offset_t) <= m_tls->m_epg_hdrlen)
 		return (0);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 		/*
 		 * Always send something.  This function is only called
 		 * if we aren't sending the tag at all, but if the
 		 * request starts in the tag then we are in an odd
 		 * state where we would effectively send nothing.  Cap
 		 * the offset at the last byte of the record payload
 		 * to send the last cipher block.
 		 */
-		offset = min(mtod(m_tls, vm_offset_t) - m_tls->m_ext_pgs.hdr_len,
-		    (plen - TLS_HEADER_LENGTH - m_tls->m_ext_pgs.trail_len) - 1);
+		offset = min(mtod(m_tls, vm_offset_t) - m_tls->m_epg_hdrlen,
+		    (plen - TLS_HEADER_LENGTH - m_tls->m_epg_trllen) - 1);
 		return (rounddown(offset, AES_BLOCK_LEN));
 	}
 	return (0);
 }
 
 /*
  * Bytes needed for a ULPTX scatter/gather list with 'nsegs' segments.
  * The first segment lives in struct ulptx_sgl itself; the remaining
  * ones take 24 bytes per pair, with a lone trailing segment taking 16.
  *
  * NOTE(review): 'nsegs' is presumably always at least 1; a value of 0
  * would wrap in the subtraction below -- confirm against callers.
  */
 static u_int
 ktls_sgl_size(u_int nsegs)
 {
 	u_int extra;

 	/* Segments beyond the one embedded in ulptx_sgl. */
 	extra = nsegs - 1;

 	return (sizeof(struct ulptx_sgl) +
 	    8 * ((3 * extra) / 2 + (extra & 1)));
 }
 
 /*
  * Compute the length in bytes of the work request needed to send the
  * TLS record in 'm_tls', and store the number of DSGL segments for its
  * payload in '*nsegsp'.  'm' is the mbuf holding the packet headers.
  *
  * Requests that cover no more than the TLS header are sent as a
  * tunnelled packet with immediate data; '*nsegsp' is set to 1 in that
  * case.
  */
 static int
 ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struct mbuf *m_tls,
     int *nsegsp)
 {
 	struct tls_record_layer *hdr;
 	u_int imm_len, offset, plen, wr_len, tlen;
 
 	MBUF_EXT_PGS_ASSERT(m_tls);
 
 	/*
 	 * Determine the size of the TLS record payload to send
 	 * excluding header and trailer.
 	 */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
-	if (tlen <= m_tls->m_ext_pgs.hdr_len) {
+	if (tlen <= m_tls->m_epg_hdrlen) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) +
 		    roundup2(m->m_len + m_tls->m_len, 16);
 		/* An oversized request is only traced here, not rejected. */
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 		    "%s: tid %d TLS header-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len + m_tls->m_len);
 		}
 
 		/* This should always be the last TLS record in a chain. */
 		MPASS(m_tls->m_next == NULL);
 
 		/*
 		 * XXX: Set a bogus 'nsegs' value to avoid tripping an
 		 * assertion in mbuf_nsegs() in t4_sge.c.
 		 */
 		*nsegsp = 1;
 		return (wr_len);
 	}
 
 	/* plen: record length without the trailer, capped at tlen. */
 	hdr = (void *)m_tls->m_epg_hdr;
-	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_ext_pgs.trail_len;
+	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen;
 	if (tlen < plen) {
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 	} else
 		offset = 0;
 
 	/* Calculate the size of the work request. */
 	wr_len = ktls_base_wr_size(tlsp);
 
 	/*
 	 * Full records and short records with an offset of 0 include
 	 * the TLS header as immediate data.  Short records include a
 	 * raw AES IV as immediate data.
 	 */
 	imm_len = 0;
 	if (offset == 0)
-		imm_len += m_tls->m_ext_pgs.hdr_len;
+		imm_len += m_tls->m_epg_hdrlen;
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	wr_len += roundup2(imm_len, 16);
 
 	/* TLS record payload via DSGL. */
-	*nsegsp = sglist_count_mbuf_epg(m_tls, m_tls->m_ext_pgs.hdr_len + offset,
-	    plen - (m_tls->m_ext_pgs.hdr_len + offset));
+	*nsegsp = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen + offset,
+	    plen - (m_tls->m_epg_hdrlen + offset));
 	wr_len += ktls_sgl_size(*nsegsp);
 
 	wr_len = roundup2(wr_len, 16);
 	return (wr_len);
 }
 
 /*
  * Report whether the TCP header carries any option other than NOP or
  * the timestamp option; such options require a dedicated options-only
  * packet.  Returns 1 if one is found, otherwise 0.  Parsing stops at
  * an end-of-list option or at the first malformed option.
  */
 static int
 ktls_has_tcp_options(struct tcphdr *tcp)
 {
 	u_char *optp;
 	int kind, olen, remaining;

 	optp = (u_char *)(tcp + 1);
 	remaining = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (remaining > 0) {
 		kind = optp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 		if (kind == TCPOPT_NOP)
 			olen = 1;
 		else {
 			/* Need a length byte, and it must be sane. */
 			if (remaining < 2)
 				break;
 			olen = optp[1];
 			if (olen < 2 || olen > remaining)
 				break;
 		}
 		if (kind != TCPOPT_NOP && kind != TCPOPT_TIMESTAMP)
 			return (1);
 		remaining -= olen;
 		optp += olen;
 	}
 	return (0);
 }
 
 /*
  * Find the TCP timestamp option.
  */
 static void *
 ktls_find_tcp_timestamps(struct tcphdr *tcp)
 {
 	u_char *cp;
 	int cnt, opt, optlen;
 
 	cp = (u_char *)(tcp + 1);
 	cnt = tcp->th_off * 4 - sizeof(struct tcphdr);
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < 2)
 				break;
 			optlen = cp[1];
 			if (optlen < 2 || optlen > cnt)
 				break;
 		}
 		if (opt == TCPOPT_TIMESTAMP && optlen == TCPOLEN_TIMESTAMP)
 			return (cp + 2);
 	}
 	return (NULL);
 }
 
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	struct tlspcb *tlsp;
 	struct ether_header *eh;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	int nsegs;
 	u_int wr_len, tot_len;
 
 	/*
 	 * Locate headers in initial mbuf.
 	 *
 	 * XXX: This assumes all of the headers are in the initial mbuf.
 	 * Could perhaps use m_advance() like parse_pkt() if that turns
 	 * out to not be true.
 	 */
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	if (m->m_len <= sizeof(*eh) + sizeof(*ip)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short", __func__,
 		    tlsp->tid);
 		return (EINVAL);
 	}
 	eh = mtod(m, struct ether_header *);
 	if (ntohs(eh->ether_type) != ETHERTYPE_IP &&
 	    ntohs(eh->ether_type) != ETHERTYPE_IPV6) {
 		CTR2(KTR_CXGBE, "%s: tid %d mbuf not ETHERTYPE_IP{,V6}",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	m->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* XXX: Reject unsupported IP options? */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (struct ip *)(eh + 1);
 		if (ip->ip_p != IPPROTO_TCP) {
 			CTR2(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP",
 			    __func__, tlsp->tid);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = ip->ip_hl * 4;
 	} else {
 		ip6 = (struct ip6_hdr *)(eh + 1);
 		if (ip6->ip6_nxt != IPPROTO_TCP) {
 			CTR3(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP (%u)",
 			    __func__, tlsp->tid, ip6->ip6_nxt);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
 	}
 	if (m->m_len < m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    sizeof(*tcp)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short (2)",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	tcp = (struct tcphdr *)((char *)(eh + 1) + m->m_pkthdr.l3hlen);
 	m->m_pkthdr.l4hlen = tcp->th_off * 4;
 
 	/* Bail if there is TCP payload before the TLS record. */
 	if (m->m_len != m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    m->m_pkthdr.l4hlen) {
 		CTR6(KTR_CXGBE,
 		    "%s: tid %d header mbuf bad length (%d + %d + %d != %d)",
 		    __func__, tlsp->tid, m->m_pkthdr.l2hlen,
 		    m->m_pkthdr.l3hlen, m->m_pkthdr.l4hlen, m->m_len);
 		return (EINVAL);
 	}
 
 	/* Assume all headers are in 'm' for now. */
 	MPASS(m->m_next != NULL);
 	MPASS(m->m_next->m_flags & M_NOMAP);
 
 	tot_len = 0;
 
 	/*
 	 * Each of the remaining mbufs in the chain should reference a
 	 * TLS record.
 	 */
 	*nsegsp = 0;
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_NOMAP);
 
 		wr_len = ktls_wr_len(tlsp, m, m_tls, &nsegs);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d wr_len %d nsegs %d", __func__,
 		    tlsp->tid, wr_len, nsegs);
 #endif
 		if (wr_len > SGE_MAX_WR_LEN || nsegs > TX_SGL_SEGS)
 			return (EFBIG);
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 
 		/*
 		 * Store 'nsegs' for the first TLS record in the
 		 * header mbuf's metadata.
 		 */
 		if (*nsegsp == 0)
 			*nsegsp = nsegs;
 	}
 
 	MPASS(tot_len != 0);
 
 	/*
 	 * See if we have any TCP options or a FIN requiring a
 	 * dedicated packet.
 	 */
 	if ((tcp->th_flags & TH_FIN) != 0 || ktls_has_tcp_options(tcp)) {
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len, 16);
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 			    "%s: tid %d options-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len);
 			return (EINVAL);
 		}
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 	}
 
 	/* Include room for a TP work request to program an L2T entry. */
 	tot_len += EQ_ESIZE;
 
 	/*
 	 * Include room for a ULPTX work request including up to 5
 	 * CPL_SET_TCB_FIELD commands before the first TLS work
 	 * request.
 	 */
 	wr_len = sizeof(struct fw_ulptx_wr) +
 	    5 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/*
 	 * If timestamps are present, reserve 1 more command for
 	 * setting the echoed timestamp.
 	 */
 	if (tlsp->using_timestamps)
 		wr_len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	tot_len += roundup2(wr_len, EQ_ESIZE);
 
 	*len16p = tot_len / 16;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d len16 %d nsegs %d", __func__,
 	    tlsp->tid, *len16p, *nsegsp);
 #endif
 	return (0);
 }
 
 /*
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_buf(struct sglist *gl, caddr_t to)
 {
 	struct sglist_seg *seg;
 	__be64 *flitp;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)to & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, to));
 
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)to;
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	for (i = 0; i < nsegs - 1; i++, seg++) {
 		usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 		usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 	}
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	flitp += nflits;
 
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 }
 
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (__predict_true((uintptr_t)(*to) + len <=
 	    (uintptr_t)&eq->desc[eq->sidx])) {
 		bcopy(from, *to, len);
 		(*to) += len;
 		if ((uintptr_t)(*to) == (uintptr_t)&eq->desc[eq->sidx])
 			(*to) = (caddr_t)eq->desc;
 	} else {
 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 
 		bcopy(from, *to, portion);
 		from += portion;
 		portion = len - portion;	/* remaining */
 		bcopy(from, (void *)eq->desc, portion);
 		(*to) = (caddr_t)eq->desc + portion;
 	}
 }
 
 static int
 ktls_write_tcp_options(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	wr = dst;
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Clear PUSH and FIN in the TCP header if present. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_flags &= ~(TH_PUSH | TH_FIN);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, pktlen -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_options++;
 
 	txsd = &txq->sdesc[pidx];
 	txsd->m = NULL;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 static int
 ktls_write_tunnel_packet(struct sge_txq *txq, void *dst, struct mbuf *m,
     struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	/* Locate the template TLS header. */
 	MBUF_EXT_PGS_ASSERT(m_tls);
 
 	/* This should always be the last TLS record in a chain. */
 	MPASS(m_tls->m_next == NULL);
 
 	wr = dst;
 	pktlen = m->m_len + m_tls->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno + mtod(m_tls, vm_offset_t));
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of TCP header. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 
 	/* Copy the subset of the TLS header requested. */
 	copy_to_txd(&txq->eq, (char *)m_tls->m_epg_hdr +
 	    mtod(m_tls, vm_offset_t), &out, m_tls->m_len);
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_header++;
 
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 _Static_assert(sizeof(struct cpl_set_tcb_field) <= EQ_ESIZE,
     "CPL_SET_TCB_FIELD must be smaller than a single TX descriptor");
 _Static_assert(W_TCB_SND_UNA_RAW == W_TCB_SND_NXT_RAW,
     "SND_NXT_RAW and SND_UNA_RAW are in different words");
 
 static int
 ktls_write_tls_wr(struct tlspcb *tlsp, struct sge_txq *txq,
     void *dst, struct mbuf *m, struct tcphdr *tcp, struct mbuf *m_tls,
     u_int nsegs, u_int available, tcp_seq tcp_seqno, uint32_t *tsopt,
     u_int pidx, bool set_l2t_idx)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct fw_ulptx_wr *wr;
 	struct ulp_txpkt *txpkt;
 	struct ulptx_sc_memrd *memrd;
 	struct ulptx_idata *idata;
 	struct cpl_tx_sec_pdu *sec_pdu;
 	struct cpl_tx_data *tx_data;
 	struct tls_record_layer *hdr;
 	char *iv, *out;
 	u_int aad_start, aad_stop;
 	u_int auth_start, auth_stop, auth_insert;
 	u_int cipher_start, cipher_stop, iv_offset;
 	u_int imm_len, mss, ndesc, offset, plen, tlen, twr_len, wr_len;
 	u_int fields, tx_max_offset, tx_max;
 	bool first_wr, last_wr, using_scratch;
 
 	ndesc = 0;
 	MPASS(tlsp->txq == txq);
 
 	first_wr = (tlsp->prev_seq == 0 && tlsp->prev_ack == 0 &&
 	    tlsp->prev_win == 0);
 
 	/*
 	 * Use the per-txq scratch pad if near the end of the ring to
 	 * simplify handling of wrap-around.  This uses a simple but
 	 * not quite perfect test of using the scratch buffer if we
 	 * can't fit a maximal work request in without wrapping.
 	 */
 	using_scratch = (eq->sidx - pidx < SGE_MAX_WR_LEN / EQ_ESIZE);
 
 	/* Locate the TLS header. */
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	hdr = (void *)m_tls->m_epg_hdr;
-	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_ext_pgs.trail_len;
+	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen;
 
 	/* Determine how much of the TLS record to send. */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
-	if (tlen <= m_tls->m_ext_pgs.hdr_len) {
+	if (tlen <= m_tls->m_epg_hdrlen) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d header-only TLS record %u",
-		    __func__, tlsp->tid, (u_int)m_tls->m_ext_pgs.seqno);
+		    __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno);
 #endif
 		return (ktls_write_tunnel_packet(txq, dst, m, m_tls, available,
 		    tcp_seqno, pidx));
 	}
 	if (tlen < plen) {
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d short TLS record %u with offset %u",
-		    __func__, tlsp->tid, (u_int)m_tls->m_ext_pgs.seqno, offset);
+		    __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno, offset);
 #endif
 		if (m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) != 0) {
 			txq->kern_tls_fin_short++;
 #ifdef INVARIANTS
 			panic("%s: FIN on short TLS record", __func__);
 #endif
 		}
 	} else
 		offset = 0;
 
 	/*
 	 * This is the last work request for a given TLS mbuf chain if
 	 * it is the last mbuf in the chain and FIN is not set.  If
 	 * FIN is set, then ktls_write_tcp_fin() will write out the
 	 * last work request.
 	 */
 	last_wr = m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) == 0;
 
 	/*
 	 * The host stack may ask us to not send part of the start of
 	 * a TLS record.  (For example, the stack might have
 	 * previously sent a "short" TLS record and might later send
 	 * down an mbuf that requests to send the remainder of the TLS
 	 * record.)  The crypto engine must process a TLS record from
 	 * the beginning if computing a GCM tag or HMAC, so we always
 	 * send the TLS record from the beginning as input to the
 	 * crypto engine and via CPL_TX_DATA to TP.  However, TP will
 	 * drop individual packets after they have been chopped up
 	 * into MSS-sized chunks if the entire sequence range of those
 	 * packets is less than SND_UNA.  SND_UNA is computed as
 	 * TX_MAX - SND_UNA_RAW.  Thus, use the offset stored in
 	 * m_data to set TX_MAX to the first byte in the TCP sequence
 	 * space the host actually wants us to send and set
 	 * SND_UNA_RAW to 0.
 	 *
 	 * If the host sends us back to back requests that span the
 	 * trailer of a single TLS record (first request ends "in" the
 	 * trailer and second request starts at the next byte but
 	 * still "in" the trailer), the initial bytes of the trailer
 	 * that the first request drops will not be retransmitted.  If
 	 * the host uses the same requests when retransmitting the
 	 * connection will hang.  To handle this, always transmit the
 	 * full trailer for a request that begins "in" the trailer
 	 * (the second request in the example above).  This should
 	 * also help to avoid retransmits for the common case.
 	 *
 	 * A similar condition exists when using CBC for back to back
 	 * requests that span a single AES block.  The first request
 	 * will be truncated to end at the end of the previous AES
 	 * block.  To handle this, always begin transmission at the
 	 * start of the current AES block.
 	 */
 	tx_max_offset = mtod(m_tls, vm_offset_t);
 	if (tx_max_offset > TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
-	    m_tls->m_ext_pgs.trail_len) {
+	    m_tls->m_epg_trllen) {
 		/* Always send the full trailer. */
 		tx_max_offset = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
-		    m_tls->m_ext_pgs.trail_len;
+		    m_tls->m_epg_trllen;
 	}
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    tx_max_offset > TLS_HEADER_LENGTH) {
 		/* Always send all of the first AES block. */
 		tx_max_offset = TLS_HEADER_LENGTH +
 		    rounddown(tx_max_offset - TLS_HEADER_LENGTH,
 		    AES_BLOCK_LEN);
 	}
 	tx_max = tcp_seqno + tx_max_offset;
 
 	/*
 	 * Update TCB fields.  Reserve space for the FW_ULPTX_WR header
 	 * but don't populate it until we know how many field updates
 	 * are required.
 	 */
 	if (using_scratch)
 		wr = (void *)txq->ss;
 	else
 		wr = dst;
 	out = (void *)(wr + 1);
 	fields = 0;
 	if (set_l2t_idx) {
 		KASSERT(nsegs != 0,
 		    ("trying to set L2T_IX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d set L2T_IX to %d", __func__,
 		    tlsp->tid, tlsp->l2te->idx);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_L2T_IX,
 		    V_TCB_L2T_IX(M_TCB_L2T_IX), V_TCB_L2T_IX(tlsp->l2te->idx));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 	if (tsopt != NULL && tlsp->prev_tsecr != ntohl(tsopt[1])) {
 		KASSERT(nsegs != 0,
 		    ("trying to set T_RTSEQ_RECENT for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote updated T_RTSEQ_RECENT",
 		    __func__, tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_T_RTSEQ_RECENT,
 		    V_TCB_T_RTSEQ_RECENT(M_TCB_T_RTSEQ_RECENT),
 		    V_TCB_T_RTSEQ_RECENT(ntohl(tsopt[1])));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_tsecr = ntohl(tsopt[1]);
 	}
 
 	if (first_wr || tlsp->prev_seq != tx_max) {
 		KASSERT(nsegs != 0,
 		    ("trying to set TX_MAX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE,
 		    "%s: tid %d setting TX_MAX to %u (tcp_seqno %u)",
 		    __func__, tlsp->tid, tx_max, tcp_seqno);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_TX_MAX,
 		    V_TCB_TX_MAX(M_TCB_TX_MAX), V_TCB_TX_MAX(tx_max));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * If there is data to drop at the beginning of this TLS
 	 * record or if this is a retransmit,
 	 * reset SND_UNA_RAW to 0 so that SND_UNA == TX_MAX.
 	 */
 	if (tlsp->prev_seq != tx_max || mtod(m_tls, vm_offset_t) != 0) {
 		KASSERT(nsegs != 0,
 		    ("trying to clear SND_UNA_RAW for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d clearing SND_UNA_RAW", __func__,
 		    tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_SND_UNA_RAW,
 		    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 		    V_TCB_SND_UNA_RAW(0));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * Store the expected sequence number of the next byte after
 	 * this record.
 	 */
 	tlsp->prev_seq = tcp_seqno + tlen;
 
 	if (first_wr || tlsp->prev_ack != ntohl(tcp->th_ack)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_NXT for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_NXT,
 		    V_TCB_RCV_NXT(M_TCB_RCV_NXT),
 		    V_TCB_RCV_NXT(ntohl(tcp->th_ack)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_ack = ntohl(tcp->th_ack);
 	}
 
 	if (first_wr || tlsp->prev_win != ntohs(tcp->th_win)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_WND for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_WND,
 		    V_TCB_RCV_WND(M_TCB_RCV_WND),
 		    V_TCB_RCV_WND(ntohs(tcp->th_win)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_win = ntohs(tcp->th_win);
 	}
 
 	/* Recalculate 'nsegs' if cached value is not available. */
 	if (nsegs == 0)
-		nsegs = sglist_count_mbuf_epg(m_tls, m_tls->m_ext_pgs.hdr_len +
-		    offset, plen - (m_tls->m_ext_pgs.hdr_len + offset));
+		nsegs = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen +
+		    offset, plen - (m_tls->m_epg_hdrlen + offset));
 
 	/* Calculate the size of the TLS work request. */
 	twr_len = ktls_base_wr_size(tlsp);
 
 	imm_len = 0;
 	if (offset == 0)
-		imm_len += m_tls->m_ext_pgs.hdr_len;
+		imm_len += m_tls->m_epg_hdrlen;
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	twr_len += roundup2(imm_len, 16);
 	twr_len += ktls_sgl_size(nsegs);
 
 	/*
 	 * If any field updates were required, determine if they can
 	 * be included in the TLS work request.  If not, use the
 	 * FW_ULPTX_WR work request header at 'wr' as a dedicated work
 	 * request for the field updates and start a new work request
 	 * for the TLS work request afterward.
 	 */
 	if (fields != 0) {
 		wr_len = fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		if (twr_len + wr_len <= SGE_MAX_WR_LEN &&
 		    tlsp->sc->tlst.combo_wrs) {
 			wr_len += twr_len;
 			txpkt = (void *)out;
 		} else {
 			wr_len += sizeof(*wr);
 			wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 			wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 			    V_FW_WR_LEN16(wr_len / 16));
 			wr->cookie = 0;
 
 			/*
 			 * If we were using scratch space, copy the
 			 * field updates work request to the ring.
 			 */
 			if (using_scratch) {
 				out = dst;
 				copy_to_txd(eq, txq->ss, &out, wr_len);
 			}
 
 			ndesc = howmany(wr_len, EQ_ESIZE);
 			MPASS(ndesc <= available);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 
 			/*
 			 * Determine if we should use scratch space
 			 * for the TLS work request based on the
 			 * available space after advancing pidx for
 			 * the field updates work request.
 			 */
 			wr_len = twr_len;
 			using_scratch = (eq->sidx - pidx <
 			    howmany(wr_len, EQ_ESIZE));
 			if (using_scratch)
 				wr = (void *)txq->ss;
 			else
 				wr = dst;
 			txpkt = (void *)(wr + 1);
 		}
 	} else {
 		wr_len = twr_len;
 		txpkt = (void *)out;
 	}
 
 	wr_len = roundup2(wr_len, 16);
 	MPASS(ndesc + howmany(wr_len, EQ_ESIZE) <= available);
 
 	/* FW_ULPTX_WR */
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(wr_len / 16));
 	wr->cookie = 0;
 
 	/* ULP_TXPKT */
 	txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	txpkt->len = htobe32(howmany(twr_len - sizeof(*wr), 16));
 
 	/* ULPTX_IDATA sub-command */
 	idata = (void *)(txpkt + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 	    V_ULP_TX_SC_MORE(1));
 	idata->len = sizeof(struct cpl_tx_sec_pdu);
 
 	/*
 	 * The key context, CPL_TX_DATA, and immediate data are part
 	 * of this ULPTX_IDATA when using an inline key.  When reading
 	 * the key from memory, the CPL_TX_DATA and immediate data are
 	 * part of a separate ULPTX_IDATA.
 	 */
 	if (tlsp->inline_key)
 		idata->len += tlsp->tx_key_info_size +
 		    sizeof(struct cpl_tx_data) + imm_len;
 	idata->len = htobe32(idata->len);
 
 	/* CPL_TX_SEC_PDU */
 	sec_pdu = (void *)(idata + 1);
 
 	/*
 	 * For short records, AAD is counted as header data in SCMD0,
 	 * the IV is next followed by a cipher region for the payload.
 	 */
 	if (plen == tlen) {
 		aad_start = 0;
 		aad_stop = 0;
 		iv_offset = 1;
 		auth_start = 0;
 		auth_stop = 0;
 		auth_insert = 0;
 		cipher_start = AES_BLOCK_LEN + 1;
 		cipher_stop = 0;
 
 		sec_pdu->pldlen = htobe32(16 + plen -
-		    (m_tls->m_ext_pgs.hdr_len + offset));
+		    (m_tls->m_epg_hdrlen + offset));
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0_short.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = htobe32(
 		    tlsp->scmd0_short.ivgen_hdrlen |
-		    V_SCMD_HDR_LEN(offset == 0 ? m_tls->m_ext_pgs.hdr_len : 0));
+		    V_SCMD_HDR_LEN(offset == 0 ? m_tls->m_epg_hdrlen : 0));
 
 		txq->kern_tls_short++;
 	} else {
 		/*
 		 * AAD is TLS header.  IV is after AAD.  The cipher region
 		 * starts after the IV.  See comments in ccr_authenc() and
 		 * ccr_gmac() in t4_crypto.c regarding cipher and auth
 		 * start/stop values.
 		 */
 		aad_start = 1;
 		aad_stop = TLS_HEADER_LENGTH;
 		iv_offset = TLS_HEADER_LENGTH + 1;
-		cipher_start = m_tls->m_ext_pgs.hdr_len + 1;
+		cipher_start = m_tls->m_epg_hdrlen + 1;
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		} else {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		}
 
 		sec_pdu->pldlen = htobe32(plen);
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = tlsp->scmd0.ivgen_hdrlen;
 
 		if (mtod(m_tls, vm_offset_t) == 0)
 			txq->kern_tls_full++;
 		else
 			txq->kern_tls_partial++;
 	}
 	sec_pdu->op_ivinsrtofst = htobe32(
 	    V_CPL_TX_SEC_PDU_OPCODE(CPL_TX_SEC_PDU) |
 	    V_CPL_TX_SEC_PDU_CPLLEN(2) | V_CPL_TX_SEC_PDU_PLACEHOLDER(0) |
 	    V_CPL_TX_SEC_PDU_IVINSRTOFST(iv_offset));
 	sec_pdu->aadstart_cipherstop_hi = htobe32(
 	    V_CPL_TX_SEC_PDU_AADSTART(aad_start) |
 	    V_CPL_TX_SEC_PDU_AADSTOP(aad_stop) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTART(cipher_start) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_HI(cipher_stop >> 4));
 	sec_pdu->cipherstop_lo_authinsert = htobe32(
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_LO(cipher_stop & 0xf) |
 	    V_CPL_TX_SEC_PDU_AUTHSTART(auth_start) |
 	    V_CPL_TX_SEC_PDU_AUTHSTOP(auth_stop) |
 	    V_CPL_TX_SEC_PDU_AUTHINSERT(auth_insert));
 
-	sec_pdu->scmd1 = htobe64(m_tls->m_ext_pgs.seqno);
+	sec_pdu->scmd1 = htobe64(m_tls->m_epg_seqno);
 
 	/* Key context */
 	out = (void *)(sec_pdu + 1);
 	if (tlsp->inline_key) {
 		memcpy(out, &tlsp->keyctx, tlsp->tx_key_info_size);
 		out += tlsp->tx_key_info_size;
 	} else {
 		/* ULPTX_SC_MEMRD to read key context. */
 		memrd = (void *)out;
 		memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tlsp->tx_key_info_size >> 4));
 		memrd->addr = htobe32(tlsp->tx_key_addr >> 5);
 
 		/* ULPTX_IDATA for CPL_TX_DATA and TLS header. */
 		idata = (void *)(memrd + 1);
 		idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 		    V_ULP_TX_SC_MORE(1));
 		idata->len = htobe32(sizeof(struct cpl_tx_data) + imm_len);
 
 		out = (void *)(idata + 1);
 	}
 
 	/* CPL_TX_DATA */
 	tx_data = (void *)out;
 	OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tlsp->tid));
 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 		mss = m->m_pkthdr.tso_segsz;
 		tlsp->prev_mss = mss;
 	} else if (tlsp->prev_mss != 0)
 		mss = tlsp->prev_mss;
 	else
 		mss = tlsp->vi->ifp->if_mtu -
 		    (m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen);
 	if (offset == 0) {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen));
 		tx_data->rsvd = htobe32(tcp_seqno);
 	} else {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) |
-		    V_TX_LENGTH(tlen - (m_tls->m_ext_pgs.hdr_len + offset)));
-		tx_data->rsvd = htobe32(tcp_seqno + m_tls->m_ext_pgs.hdr_len + offset);
+		    V_TX_LENGTH(tlen - (m_tls->m_epg_hdrlen + offset)));
+		tx_data->rsvd = htobe32(tcp_seqno + m_tls->m_epg_hdrlen + offset);
 	}
 	tx_data->flags = htobe32(F_TX_BYPASS);
 	if (last_wr && tcp->th_flags & TH_PUSH)
 		tx_data->flags |= htobe32(F_TX_PUSH | F_TX_SHOVE);
 
 	/* Populate the TLS header */
 	out = (void *)(tx_data + 1);
 	if (offset == 0) {
-		memcpy(out, m_tls->m_epg_hdr, m_tls->m_ext_pgs.hdr_len);
-		out += m_tls->m_ext_pgs.hdr_len;
+		memcpy(out, m_tls->m_epg_hdr, m_tls->m_epg_hdrlen);
+		out += m_tls->m_epg_hdrlen;
 	}
 
 	/* AES IV for a short record. */
 	if (plen == tlen) {
 		iv = out;
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			memcpy(iv, tlsp->keyctx.txhdr.txsalt, SALT_SIZE);
 			memcpy(iv + 4, hdr + 1, 8);
 			*(uint32_t *)(iv + 12) = htobe32(2 +
 			    offset / AES_BLOCK_LEN);
 		} else
 			memcpy(iv, hdr + 1, AES_BLOCK_LEN);
 		out += AES_BLOCK_LEN;
 	}
 
 	if (imm_len % 16 != 0) {
 		/* Zero pad to an 8-byte boundary. */
 		memset(out, 0, 8 - (imm_len % 8));
 		out += 8 - (imm_len % 8);
 
 		/*
 		 * Insert a ULP_TX_SC_NOOP if needed so the SGL is
 		 * 16-byte aligned.
 		 */
 		if (imm_len % 16 <= 8) {
 			idata = (void *)out;
 			idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 			idata->len = htobe32(0);
 			out = (void *)(idata + 1);
 		}
 	}
 
 	/* SGL for record payload */
 	sglist_reset(txq->gl);
-	if (sglist_append_mbuf_epg(txq->gl, m_tls, m_tls->m_ext_pgs.hdr_len + offset,
-	    plen - (m_tls->m_ext_pgs.hdr_len + offset)) != 0) {
+	if (sglist_append_mbuf_epg(txq->gl, m_tls, m_tls->m_epg_hdrlen + offset,
+	    plen - (m_tls->m_epg_hdrlen + offset)) != 0) {
 #ifdef INVARIANTS
 		panic("%s: failed to append sglist", __func__);
 #endif
 	}
 	write_gl_to_buf(txq->gl, out);
 
 	if (using_scratch) {
 		out = dst;
 		copy_to_txd(eq, txq->ss, &out, wr_len);
 	}
 
 	ndesc += howmany(wr_len, EQ_ESIZE);
 	MPASS(ndesc <= available);
 
 	txq->kern_tls_records++;
 	txq->kern_tls_octets += tlen - mtod(m_tls, vm_offset_t);
 	if (mtod(m_tls, vm_offset_t) != 0) {
 		if (offset == 0)
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t);
 		else
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t) -
-			    (m_tls->m_ext_pgs.hdr_len + offset);
+			    (m_tls->m_epg_hdrlen + offset);
 	}
 
 	txsd = &txq->sdesc[pidx];
 	if (last_wr)
 		txsd->m = m;
 	else
 		txsd->m = NULL;
 	txsd->desc_used = howmany(wr_len, EQ_ESIZE);
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR work request at 'dst' that transmits the
  * headers stored in the first mbuf of 'm' as a standalone packet sent
  * entirely as immediate data.  The IP length field is rewritten to
  * describe only this mbuf (m->m_len bytes) and the TCP sequence
  * number is replaced with 'tcp_seqno'.  t6_ktls_write_wr() uses this
  * to send the FIN after the TLS record work requests.
  *
  * 'available' is the number of descriptors the caller may still use
  * (checked with MPASS only) and 'pidx' selects the software
  * descriptor that records 'm'.  Returns the number of hardware
  * descriptors consumed.
  */
 static int
 ktls_write_tcp_fin(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	/* Only the header mbuf itself is sent, all of it as immediate data. */
 	wr = dst;
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		/* ip_len covers the IPv4 header plus everything after it. */
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		/* Copy any IPv4 options following the fixed header. */
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		/*
 		 * Unlike ip_len, ip6_plen counts only the bytes that
 		 * follow the fixed IPv6 header, so the header itself
 		 * must be excluded.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_fin++;
 
 	/* Record the mbuf chain in the software descriptor for this WR. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write the work requests needed to transmit the TLS records chained
  * off of the header mbuf 'm', starting at descriptor 'dst'.  In order,
  * this may emit: a TCP options packet (when the header carries TCP
  * options and FIN is not set), an L2T work request when the cached
  * L2T entry does not match the packet's VLAN tag / destination MAC,
  * one work request per TLS record mbuf, and finally a FIN packet when
  * TH_FIN is set in the TCP header.
  *
  * 'nsegs' only applies to the first TLS record and 'available' is the
  * number of descriptors the caller has made available.  Returns the
  * total number of descriptors consumed.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct tlspcb *tlsp;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	struct ether_header *eh;
 	tcp_seq tcp_seqno;
 	u_int ndesc, pidx, totdesc;
 	uint16_t vlan_tag;
 	bool has_fin, set_l2t_idx;
 	void *tsopt;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	totdesc = 0;
 	eh = mtod(m, struct ether_header *);
 	tcp = (struct tcphdr *)((char *)eh + m->m_pkthdr.l2hlen +
 	    m->m_pkthdr.l3hlen);
 	/* Local producer index; advanced past each work request written. */
 	pidx = eq->pidx;
 	has_fin = (tcp->th_flags & TH_FIN) != 0;
 
 	/*
 	 * If this TLS record has a FIN, then we will send any
 	 * requested options as part of the FIN packet.
 	 */
 	if (!has_fin && ktls_has_tcp_options(tcp)) {
 		ndesc = ktls_write_tcp_options(txq, dst, m, available, pidx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote TCP options packet", __func__,
 		    tlsp->tid);
 #endif
 	}
 
 	/*
 	 * Allocate a new L2T entry if necessary.  This may write out
 	 * a work request to the txq.
 	 */
 	if (m->m_flags & M_VLANTAG)
 		vlan_tag = m->m_pkthdr.ether_vtag;
 	else
 		vlan_tag = 0xfff;
 	set_l2t_idx = false;
 	if (tlsp->l2te == NULL || tlsp->l2te->vlan != vlan_tag ||
 	    memcmp(tlsp->l2te->dmac, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
 		set_l2t_idx = true;
 		if (tlsp->l2te)
 			t4_l2t_release(tlsp->l2te);
 		tlsp->l2te = t4_l2t_alloc_tls(tlsp->sc, txq, dst, &ndesc,
 		    vlan_tag, tlsp->vi->pi->lport, eh->ether_dhost);
 		if (tlsp->l2te == NULL)
 			CXGBE_UNIMPLEMENTED("failed to allocate TLS L2TE");
 		if (ndesc != 0) {
 			MPASS(ndesc <= available - totdesc);
 
 			/* The L2T work request has no mbuf attached. */
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			totdesc += ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 		}
 	}
 
 	/*
 	 * Iterate over each TLS record constructing a work request
 	 * for that record.
 	 */
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_NOMAP);
 
 		/*
 		 * Determine the initial TCP sequence number for this
 		 * record.
 		 */
 		tsopt = NULL;
 		if (m_tls == m->m_next) {
 			tcp_seqno = ntohl(tcp->th_seq) -
 			    mtod(m_tls, vm_offset_t);
 			if (tlsp->using_timestamps)
 				tsopt = ktls_find_tcp_timestamps(tcp);
 		} else {
 			MPASS(mtod(m_tls, vm_offset_t) == 0);
 			tcp_seqno = tlsp->prev_seq;
 		}
 
 		ndesc = ktls_write_tls_wr(tlsp, txq, dst, m, tcp, m_tls,
 		    nsegs, available - totdesc, tcp_seqno, tsopt, pidx,
 		    set_l2t_idx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 
 		/*
 		 * The value of nsegs from the header mbuf's metadata
 		 * is only valid for the first TLS record.
 		 */
 		nsegs = 0;
 
 		/* Only need to set the L2T index once. */
 		set_l2t_idx = false;
 	}
 
 	if (has_fin) {
 		/*
 		 * If the TCP header for this chain has FIN sent, then
 		 * explicitly send a packet that has FIN set.  This
 		 * will also have PUSH set if requested.  This assumes
 		 * we sent at least one TLS record work request and
 		 * uses the TCP sequence number after that request as
 		 * the sequence number for the FIN packet.
 		 *
 		 * Only the descriptors not already consumed above are
 		 * available to the FIN work request.
 		 */
 		ndesc = ktls_write_tcp_fin(txq, dst, m, available - totdesc,
 		    tlsp->prev_seq, pidx);
 		totdesc += ndesc;
 	}
 
 	MPASS(totdesc <= available);
 	return (totdesc);
 }
 
 /*
  * Release everything held by the TLS send tag 'mst': the L2T entry,
  * the hardware tid, the local IP entry and the key slot (each only if
  * it was set up), then scrub the key context and free the tlspcb.
  */
 void
 cxgbe_tls_tag_free(struct m_snd_tag *mst)
 {
 	struct adapter *sc;
 	struct tlspcb *tlsp;
 
 	tlsp = mst_to_tls(mst);
 	sc = tlsp->sc;
 
 	CTR2(KTR_CXGBE, "%s: tid %d", __func__, tlsp->tid);
 
 	if (tlsp->l2te)
 		t4_l2t_release(tlsp->l2te);
 	if (tlsp->tid >= 0)
 		release_tid(sc, tlsp->tid, tlsp->ctrlq);
 	if (tlsp->ce)
 		t4_release_lip(sc, tlsp->ce);
 	if (tlsp->tx_key_addr >= 0)
 		free_keyid(tlsp, tlsp->tx_key_addr);
 
 	/*
 	 * Wipe the whole key context before the memory is returned to
 	 * the allocator.  The size must be that of the object itself,
 	 * not of a pointer to it, or most of the key material would be
 	 * left behind.
 	 */
 	explicit_bzero(&tlsp->keyctx, sizeof(tlsp->keyctx));
 	free(tlsp, M_CXGBE);
 }
 
 /*
  * Module load hook: install ktls_act_open_rpl as the CPL_ACT_OPEN_RPL
  * handler for the CPL_COOKIE_KERN_TLS cookie.
  */
 void
 t6_ktls_modload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, ktls_act_open_rpl,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 /*
  * Module unload hook: clear the CPL_ACT_OPEN_RPL handler registered for
  * the CPL_COOKIE_KERN_TLS cookie by passing a NULL handler.
  */
 void
 t6_ktls_modunload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 #else
 
 /*
  * Stub implementations compiled in the #else branch of this file's
  * feature conditional (presumably when KERN_TLS is not configured --
  * the matching #if is not visible here).  They keep the rest of the
  * driver linkable without the TLS offload code.
  */
 
 /* Tag allocation is never possible in this configuration. */
 int
 cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	return (ENXIO);
 }
 
 /* No packet can be parsed as a TLS packet; always reports EINVAL. */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	return (EINVAL);
 }
 
 /* Not expected to be called in this configuration; panics if it is. */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	panic("can't happen");
 }
 
 /* Not expected to be called in this configuration; panics if it is. */
 void
 cxgbe_tls_tag_free(struct m_snd_tag *mst)
 {
 	panic("can't happen");
 }
 
 /* Nothing to register at module load. */
 void
 t6_ktls_modload(void)
 {
 }
 
 /* Nothing to unregister at module unload. */
 void
 t6_ktls_modunload(void)
 {
 }
 
 #endif
Index: head/sys/dev/cxgbe/t4_sge.c
===================================================================
--- head/sys/dev/cxgbe/t4_sge.c	(revision 360578)
+++ head/sys/dev/cxgbe/t4_sge.c	(revision 360579)
@@ -1,6068 +1,6068 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/socketvar.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /* Internal mbuf flags stored in PH_loc.eight[1]. */
 #define	MC_NOMAP		0x01
 #define	MC_RAW_WR		0x02
 #define	MC_TLS			0x04
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values.
  */
 static int fl_pktshift = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
     "payload DMA offset in rx buffer (bytes)");
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 int fl_pad = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
     "payload pad boundary (bytes)");
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  */
 static int spg_len = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
     "status page size (bytes)");
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  *
  * Values outside of this range are reset to 0 by t4_sge_modload().
  */
 static int cong_drop = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
     "Congestion control for RX queues (0 = backpressure, 1 = drop)");
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  */
 static int buffer_packing = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
     0, "Enable buffer packing");
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  */
 static int fl_pack = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
     "payload pack boundary (bytes)");
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
     &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  */
 static int safest_rx_cluster = PAGE_SIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
     &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
 
 #ifdef RATELIMIT
 /*
  * Knob to control TCP timestamp rewriting, and the granularity of the tick used
  * for rewriting.  -1 and 0-3 are all valid values.
  * -1: hardware should leave the TCP timestamps alone.
  * 0: 1ms
  * 1: 100us
  * 2: 10us
  * 3: 1us
  */
 static int tsclk = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
     "Control TCP timestamp rewriting when using pacing");
 
 static int eo_max_backlog = 1024 * 1024;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
     0, "Maximum backlog of ratelimited data per flow");
 #endif
 
 /*
  * The interrupt holdoff timers are multiplied by this value on T6+.
  * 1 and 3-17 (both inclusive) are legal values.
  */
 static int tscale = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
     "Interrupt holdoff timer scale on T6+");
 
 /*
  * Number of LRO entries in the lro_ctrl structure per rx queue.
  */
 static int lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
     "Number of LRO entries per RX queue");
 
 /*
  * This enables presorting of frames before they're fed into tcp_lro_rx.
  */
 static int lro_mbufs = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
     "Enable presorting of LRO frames");
 
 /*
  * Bookkeeping for a single work request that carries several packets.
  * NOTE(review): presumably filled in by try_txpkts()/add_to_txpkts() and
  * consumed by write_txpkts_wr(); their bodies are not visible here.
  */
 struct txpkts {
 	u_int wr_type;		/* type 0 or type 1 */
 	u_int npkt;		/* # of packets in this work request */
 	u_int plen;		/* total payload (sum of all packets) */
 	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /*
  * A packet's SGL.  This + m_pkthdr has all info needed for tx.
  * The sglist header is immediately followed by storage for up to
  * TX_SGL_SEGS segments.
  */
 struct sgl {
 	struct sglist sg;
 	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
 static int service_iq_fl(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *,
     u_int);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     uint16_t, char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_iq *);
 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_ctrlq(struct adapter *, struct sge_wrq *, int,
     struct sysctl_oid *);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct vi_info *, struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 #endif
 #ifdef DEV_NETMAP
 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
     struct sysctl_oid *);
 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
     struct sysctl_oid *);
 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq(struct adapter *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_oid *);
 static int free_wrq(struct adapter *, struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
     struct sysctl_oid *);
 static int free_txq(struct vi_info *, struct sge_txq *);
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
 static int find_refill_source(struct adapter *, int, bool);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, u_int);
 static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
 static u_int write_txpkt_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
 static u_int write_txpkts_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 #endif
 
 /*
  * Counters allocated in t4_sge_modload(); t4_sge_extfree_refs() reports
  * the difference (refs - rels).
  */
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 /*
  * Handler dispatch tables.  t4_fw_msg_handler is indexed by firmware
  * message type and t4_cpl_handler by CPL opcode.  The remaining tables are
  * indexed by CPL cookie and hold the per-cookie handlers for opcodes that
  * are shared between several consumers (see
  * t4_register_shared_cpl_handler()).
  */
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
 
 /*
  * Install (or, with h == NULL, remove) the t4_an_handler callback.  A
  * handler may only be installed while none is registered.
  */
 void
 t4_register_an_handler(an_handler_t h)
 {
 	uintptr_t *slot = (uintptr_t *)&t4_an_handler;
 
 	MPASS(h == NULL || t4_an_handler == NULL);
 
 	/* Publish the new pointer with release semantics. */
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Install (or, with h == NULL, remove) the handler for firmware messages
  * of the given type.  A handler may only be installed into an empty slot.
  */
 void
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	uintptr_t *slot;
 
 	MPASS(type < nitems(t4_fw_msg_handler));
 	MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
 
 	/*
 	 * The RSSCPL subtypes are routed through the CPL handler dispatch
 	 * table by the FW{4|6}_CPL_MSG handler, so nobody is allowed to
 	 * register a handler for them here.
 	 */
 	MPASS(type != FW_TYPE_RSSCPL);
 	MPASS(type != FW6_TYPE_RSSCPL);
 
 	slot = (uintptr_t *)&t4_fw_msg_handler[type];
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Install (or, with h == NULL, remove) the top-level handler for a CPL
  * opcode.  A handler may only be installed into an empty slot.
  */
 void
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	uintptr_t *slot;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);
 
 	slot = (uintptr_t *)&t4_cpl_handler[opcode];
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Top-level CPL_SET_TCB_RPL handler: picks the cookie for the reply and
  * forwards it to the handler registered for that cookie.
  */
 static int
 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	u_int tid;
 	int cookie;
 
 	MPASS(m == NULL);
 
 	tid = GET_TID(cpl);
 	if (!is_hpftid(iq->adapter, tid) && !is_ftid(iq->adapter, tid)) {
 		/* Not a filter tid: the cookie is carried in the CPL. */
 		cookie = G_COOKIE(cpl->cookie);
 	} else {
 		/*
 		 * For filter-write the CPL cookie field holds the return
 		 * code instead, so the hardware tid is the only way to
 		 * recognize a reply to a filter request.
 		 */
 		cookie = CPL_COOKIE_FILTER;
 	}
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < nitems(set_tcb_rpl_handlers));
 
 	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Top-level CPL_L2T_WRITE_RPL handler: replies with F_SYNC_WR set in the
  * tid go to the CPL_COOKIE_TOM handler, all others to CPL_COOKIE_FILTER.
  */
 static int
 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
 	unsigned int cookie;
 
 	MPASS(m == NULL);
 
 	if (GET_TID(rpl) & F_SYNC_WR)
 		cookie = CPL_COOKIE_TOM;
 	else
 		cookie = CPL_COOKIE_FILTER;
 
 	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 
 	MPASS(m == NULL);
 	MPASS(cookie != CPL_COOKIE_RESERVED);
 
 	return (act_open_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Top-level CPL_ABORT_RPL_RSS handler: dispatches to the hashfilter
  * handler when the adapter uses hashfilters, otherwise to the TOM handler.
  */
 static int
 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	u_int cookie;
 
 	MPASS(m == NULL);
 	cookie = is_hashfilter(sc) ? CPL_COOKIE_HASHFILTER : CPL_COOKIE_TOM;
 
 	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Top-level CPL_FW4_ACK handler: acks for an etid go to the ETHOFLD
  * handler, everything else goes to the TOM handler.
  */
 static int
 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	u_int cookie;
 
 	MPASS(m == NULL);
 	cookie = is_etid(sc, tid) ? CPL_COOKIE_ETHOFLD : CPL_COOKIE_TOM;
 
 	return (fw4_ack_handlers[cookie](iq, rss, m));
 }
 
 static void
 t4_init_shared_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
 	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
 	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
 	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
 }
 
 /*
  * Install (or, with h == NULL, remove) the per-cookie handler for one of
  * the shared CPL opcodes.  The top-level handler for the opcode must
  * already be registered, and a handler may only be installed into an
  * empty slot.
  */
 void
 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
 {
 	cpl_handler_t *table;
 	uintptr_t *loc;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < NUM_CPL_COOKIES);
 	MPASS(t4_cpl_handler[opcode] != NULL);
 
 	/* Pick the per-cookie table that belongs to this opcode. */
 	switch (opcode) {
 	case CPL_SET_TCB_RPL:
 		table = set_tcb_rpl_handlers;
 		break;
 	case CPL_L2T_WRITE_RPL:
 		table = l2t_write_rpl_handlers;
 		break;
 	case CPL_ACT_OPEN_RPL:
 		table = act_open_rpl_handlers;
 		break;
 	case CPL_ABORT_RPL_RSS:
 		table = abort_rpl_rss_handlers;
 		break;
 	case CPL_FW4_ACK:
 		table = fw4_ack_handlers;
 		break;
 	default:
 		/* Not a shared opcode. */
 		MPASS(0);
 		return;
 	}
 
 	loc = (uintptr_t *)&table[cookie];
 	MPASS(h == NULL || *loc == (uintptr_t)NULL);
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 /*
  * MOD_LOAD hook for the SGE code.  Sanitizes the tunables that can be
  * checked without an adapter (falling back to a default and logging when
  * a value is invalid), allocates the extfree counters and registers the
  * CPL and firmware message handlers.
  */
 void
 t4_sge_modload(void)
 {
 
 	/* Valid packet shifts are 0-7 bytes. */
 	if (!(fl_pktshift >= 0 && fl_pktshift <= 7)) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 0 instead.\n", fl_pktshift);
 		fl_pktshift = 0;
 	}
 
 	/* Status page length: 64 or 128, anything else picks a default. */
 	if (spg_len != 64 && spg_len != 128) {
 		int len = 64;
 
 #if defined(__i386__) || defined(__amd64__)
 		if (cpu_clflush_line_size > 64)
 			len = 128;
 #endif
 		/* -1 means "let the driver choose" and is not an error. */
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, len);
 		}
 		spg_len = len;
 	}
 
 	if (!(cong_drop >= -1 && cong_drop <= 1)) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	/* Legal timer scales are 1 and 3-17. */
 	if (!(tscale == 1 || (tscale >= 3 && tscale <= 17))) {
 		printf("Invalid hw.cxgbe.tscale value (%d),"
 		    " using 1 instead.\n", tscale);
 		tscale = 1;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	counter_u64_zero(extfree_rels);
 
 	/* Top-level handlers first; shared per-cookie handlers need them. */
 	t4_init_shared_cpl_handlers();
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 #ifdef RATELIMIT
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
 	    CPL_COOKIE_ETHOFLD);
 #endif
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 /*
  * Counterpart of t4_sge_modload(): frees the extfree counters that were
  * allocated there.
  */
 void
 t4_sge_modunload(void)
 {
 
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 }
 
 /*
  * Returns the difference between the extfree_refs and extfree_rels
  * counters.  The release counter is sampled before the reference counter.
  */
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	const uint64_t released = counter_u64_fetch(extfree_rels);
 	const uint64_t referenced = counter_u64_fetch(extfree_refs);
 
 	return (referenced - released);
 }
 
 /* max 4096 */
 #define MAX_PACK_BOUNDARY 512
 
 /*
  * Program the ingress pad boundary (SGE_CONTROL) and, on chips other than
  * T4, the ingress pack boundary (SGE_CONTROL2) from the hw.cxgbe.fl_pad
  * and hw.cxgbe.fl_pack tunables.  Invalid tunable values are replaced by
  * driver-chosen defaults, with a message unless the tunable was left at
  * its "driver decides" setting.
  */
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	uint32_t v, m;
 	int pad, pack, pad_shift;
 
 	/* The pad boundary field is encoded relative to a per-chip shift. */
 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
 	    X_INGPADBOUNDARY_SHIFT;
 	pad = fl_pad;
 	if (fl_pad < (1 << pad_shift) ||
 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
 	    !powerof2(fl_pad)) {
 		/*
 		 * If there is any chance that we might use buffer packing and
 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
 		 * it to the minimum allowed in all other cases.
 		 */
 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
 
 		/*
 		 * For fl_pad = 0 we'll still write a reasonable value to the
 		 * register but all the freelists will opt out of padding.
 		 * We'll complain here only if the user tried to set it to a
 		 * value greater than 0 that was invalid.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	/* T4 has no separate pack boundary; fl_pack is ignored there. */
 	if (is_t4(sc)) {
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	pack = fl_pack;
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack)) {
 		/*
 		 * Default: the PCIe max payload size capped at
 		 * MAX_PACK_BOUNDARY, but no smaller than a cache line, then
 		 * adjusted to a value the check above accepts.
 		 */
 		if (sc->params.pci.mps > MAX_PACK_BOUNDARY)
 			pack = MAX_PACK_BOUNDARY;
 		else
 			pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		if (pack == 32)
 			pack = 64;
 		if (pack > 4096)
 			pack = 4096;
 		/* -1 asks the driver to choose, so stay quiet in that case. */
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	/* 16 is encoded as 0; larger boundaries are encoded as log2 - 5. */
 	if (pack == 16)
 		v = V_INGPACKBOUNDARY(0);
 	else
 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
 }
 
 /*
  * Program the chip-wide SGE, TP and ULP_RX settings that the driver relies
  * on: packet shift and status page size, pad/pack boundaries, host page
  * size, freelist buffer sizes, interrupt packet-count thresholds and
  * holdoff timers, the T6+ timer scale, and the DDP page sizes.  Must only
  * be called on the master PF (asserted below).
  *
  * adap->params.vpd.cclk must be set up before this is called.
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i, reg;
 	uint32_t v, m;
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	/* Packet shift, rx CPL mode, and egress status page size. */
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	/* Same host page size for all eight PFs. */
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	/*
 	 * Buffer size registers 0 and 1 get fixed values.  Starting with
 	 * register 2, each software buffer size takes two consecutive
 	 * registers: the full size and the size less CL_METADATA_SIZE.
 	 */
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
 	reg = A_SGE_FL_BUFFER_SIZE2;
 	for (i = 0; i < nitems(sw_buf_sizes); i++) {
 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
 		t4_write_reg(sc, reg, sw_buf_sizes[i]);
 		reg += 4;
 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
 		t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE);
 		reg += 4;
 	}
 
 	/* Interrupt packet-count thresholds. */
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	/*
 	 * Make every holdoff timer fit within timer_max.  A timer that is
 	 * too large is repeatedly replaced by the average of itself and its
 	 * predecessor until it fits; the last timer is simply clamped to
 	 * timer_max.
 	 */
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	/* The timers are in microseconds; the registers take core ticks. */
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		/* A scale of 1 is encoded as 0, any other scale as scale - 2. */
 		m = V_TSCALE(M_TSCALE);
 		if (tscale == 1)
 			v = 0;
 		else
 			v = V_TSCALE(tscale - 2);
 		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
 
 		/* Debug knob: read-modify-write TP_CMM_CONFIG thresholds. */
 		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
 			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(M_WRTHRTHRESH);
 			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 			v &= ~m;
 			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(16);
 			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 		}
 	}
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	/* Indicate size is kept below the rx copy threshold (see indsz). */
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * SGE wants the buffer to be at least 64B and then a multiple of 16.  Its
  * address must be 16B aligned.  If padding is in use the buffer's start and end
  * need to be aligned to the pad boundary as well.  We'll just make sure that
  * the size is a multiple of the pad boundary here, it is up to the buffer
  * allocation code to make sure the start of the buffer is aligned.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int align;
 
 	/* Anything smaller than 64B is rejected regardless of alignment. */
 	if (hwsz < 64)
 		return (0);
 
 	/* The pad boundary applies when padding is on, 16B otherwise. */
 	if (fl_pad)
 		align = sc->params.sge.pad_boundary;
 	else
 		align = 16;
 
 	/* Non-zero iff hwsz is a multiple of the required alignment. */
 	return ((hwsz & (align - 1)) == 0);
 }
 
 /*
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct rx_buf_info *rxb;
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sc->params.sge.sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	s->safe_zidx = -1;
 	rxb = &s->rx_buf_info[0];
 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
 		rxb->size1 = sw_buf_sizes[i];
 		rxb->zone = m_getzone(rxb->size1);
 		rxb->type = m_gettype(rxb->size1);
 		rxb->size2 = 0;
 		rxb->hwidx1 = -1;
 		rxb->hwidx2 = -1;
 		for (j = 0; j < SGE_FLBUF_SIZES; j++) {
 			int hwsize = sp->sge_fl_buffer_size[j];
 
 			if (!hwsz_ok(sc, hwsize))
 				continue;
 
 			/* hwidx for size1 */
 			if (rxb->hwidx1 == -1 && rxb->size1 == hwsize)
 				rxb->hwidx1 = j;
 
 			/* hwidx for size2 (buffer packing) */
 			if (rxb->size1 - CL_METADATA_SIZE < hwsize)
 				continue;
 			n = rxb->size1 - hwsize - CL_METADATA_SIZE;
 			if (n == 0) {
 				rxb->hwidx2 = j;
 				rxb->size2 = hwsize;
 				break;	/* stop looking */
 			}
 			if (rxb->hwidx2 != -1) {
 				if (n < sp->sge_fl_buffer_size[rxb->hwidx2] -
 				    hwsize - CL_METADATA_SIZE) {
 					rxb->hwidx2 = j;
 					rxb->size2 = hwsize;
 				}
 			} else if (n <= 2 * CL_METADATA_SIZE) {
 				rxb->hwidx2 = j;
 				rxb->size2 = hwsize;
 			}
 		}
 		if (rxb->hwidx2 != -1)
 			sc->flags |= BUF_PACKING_OK;
 		if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster)
 			s->safe_zidx = i;
 	}
 
 	if (sc->flags & IS_VF)
 		return (0);
 
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	t4_init_tp_params(sc, 1);
 
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Adds the read-only SGE sysctls under 'children', all registered in 'ctx':
  * the freelist buffer size list plus a handful of integer settings taken from
  * sc->params.sge and the cong_drop tunable.
  */
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 
 	/* String node, rendered on demand by sysctl_bufsizes. */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, sc, 0,
 	    sysctl_bufsizes, "A", "freelist buffer sizes");
 
 	/* Constant integers: no backing pointer, the value is passed inline. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, sc->params.sge.fl_pktshift,
 	    "payload DMA offset in rx buffer (bytes)");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
 	    NULL, sc->params.sge.pad_boundary,
 	    "payload pad boundary (bytes)");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
 	    NULL, sc->params.sge.spg_len,
 	    "status page size (bytes)");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop,
 	    "congestion drop setting");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
 	    NULL, sc->params.sge.pack_boundary,
 	    "payload pack boundary (bytes)");
 }
 
 /*
  * Destroys the adapter's main DMA tag if there is one.  Always returns 0.
  */
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 
 	if (!sc->dmat)
 		return (0);	/* nothing to destroy */
 
 	bus_dma_tag_destroy(sc->dmat);
 	return (0);
 }
 
 /*
  * Allocate and initialize the firmware event queue, control queues, and special
  * purpose rx queues owned by the adapter.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 	int rc, i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	sysctl_ctx_init(&sc->ctx);
 	sc->flags |= ADAP_SYSCTL_CTX;
 
 	/*
 	 * Firmware event queue
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * That's all for the VF driver.
 	 */
 	if (sc->flags & IS_VF)
 		return (rc);
 
 	oid = device_get_sysctl_tree(sc->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	/*
 	 * XXX: General purpose rx queues, one per port.
 	 */
 
 	/*
 	 * Control queues, one per port.
 	 */
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "ctrlq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues");
 	for_each_port(sc, i) {
 		struct sge_wrq *ctrlq = &sc->sge.ctrlq[i];
 
 		rc = alloc_ctrlq(sc, ctrlq, i, oid);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 	int i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* Do this before freeing the queue */
 	if (sc->flags & ADAP_SYSCTL_CTX) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	if (!(sc->flags & IS_VF)) {
 		for_each_port(sc, i)
 			free_wrq(sc, &sc->sge.ctrlq[i]);
 	}
 	free_fwq(sc);
 
 	return (0);
 }
 
 /* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
 mtu_to_max_payload(struct adapter *sc, int mtu)
 {
 	/* Sized for a VLAN tagged frame in case hw VLAN extraction is off. */
 	const int l2hdr = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 
 	/* The packet shift precedes the frame in the rx buffer. */
 	return (sc->params.sge.fl_pktshift + l2hdr + mtu);
 }
 
 /*
  * Allocates all the queues of a virtual interface: netmap rx/tx queues (if
  * the ifnet is netmap capable), NIC rx queues, offload rx queues, NIC tx
  * queues, and offload tx queues, in that order.  Which groups exist depends on
  * DEV_NETMAP, TCP_OFFLOAD, and RATELIMIT.  Each group gets its own sysctl node
  * in vi->ctx under the VI's device sysctl tree.
  *
  * Returns 0 on success.  On the first allocation failure the error is
  * returned after t4_teardown_vi_queues() has been called on the VI.
  */
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, intr_idx, iqidx;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	char name[16];
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int maxp, mtu = ifp->if_mtu;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = vi->first_intr;
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(!forwarding_intr_to_fwq(sc));
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues");
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues");
 		for_each_nm_txq(vi, i, nm_txq) {
 			/* netmap tx queues are spread over the netmap rx queues */
 			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * Allocate rx queues first because a default iqid is required when
 	 * creating a tx queue.
 	 */
 	maxp = mtu_to_max_payload(sc, mtu);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues");
 	for_each_rxq(vi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		/* -1 instead of a vector when interrupts are forwarded */
 		rc = alloc_rxq(vi, rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #ifdef DEV_NETMAP
 	/* Continue after whichever of the two rx queue sets is larger. */
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues for offloaded TCP connections");
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
 		    vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		rc = alloc_ofld_rxq(vi, ofld_rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #endif
 
 	/*
 	 * Now the tx queues.
 	 */
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues");
 	for_each_txq(vi, i, txq) {
 		/* tx queues are spread over the VI's rx queues */
 		iqidx = vi->first_rxq + (i % vi->nrxq);
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
 		    sc->sge.rxq[iqidx].iq.cntxt_id, name);
 
 		rc = alloc_txq(vi, txq, i, oid);
 		if (rc != 0)
 			goto done;
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues for TOE/ETHOFLD");
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		struct sysctl_oid *oid2;
 
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), i);
 		/*
 		 * Associate with an offload rx queue if the VI has any, with
 		 * a NIC rx queue otherwise.
 		 */
 		if (vi->nofldrxq > 0) {
 			iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.ofld_rxq[iqidx].iq.cntxt_id,
 			    name);
 		} else {
 			iqidx = vi->first_rxq + (i % vi->nrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name);
 		}
 
 		/* 'name' is reused for the per-queue sysctl node (the index). */
 		snprintf(name, sizeof(name), "%d", i);
 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue");
 
 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
 		if (rc != 0)
 			goto done;
 	}
 #endif
 done:
 	/* Undo everything on failure. */
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 
 	/* Do this before freeing the queues */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 #ifdef DEV_NETMAP
 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * Take down all the tx queues first, as they reference the rx queues
 	 * (for egress updates, etc.).
 	 */
 
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Interrupt handler when the driver is using only 1 interrupt.  This is a very
  * unusual scenario.
  *
  * a) Deals with errors, if any.
  * b) Services firmware event queue, which is taking interrupts for all other
  *    queues.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *sc = arg;
 
 	/* This handler is for the single interrupt configuration only. */
 	MPASS(sc->intr_count == 1);
 
 	/* INTx gets a write of 0 to this PF's PCIE_PF_CLI register first. */
 	if (sc->intr_type == INTR_INTX)
 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 
 	/* Errors first, then the firmware event queue. */
 	t4_intr_err(sc);
 	t4_intr_evt(&sc->sge.fwq);
 }
 
 /*
  * Interrupt handler for errors (installed directly when multiple interrupts are
  * being used, or called by t4_intr_all).
  */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *sc = arg;
 	const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
 	uint32_t cause;
 
 	/* Nothing is done once the adapter has been flagged with ADAP_ERR. */
 	if ((sc->flags & ADAP_ERR) != 0)
 		return;
 
 	/* Count a software interrupt and write the cause value back. */
 	cause = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
 	if ((cause & F_PFSW) != 0) {
 		sc->swintr++;
 		t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), cause);
 	}
 
 	t4_slow_intr_handler(sc, verbose);
 }
 
 /*
  * Interrupt handler for iq-only queues.  The firmware event queue is the only
  * such queue right now.
  */
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	/* Service the queue only if it could be moved from IDLE to BUSY. */
 	if (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq(iq, 0);
 	(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 }
 
 /*
  * Interrupt handler for iq+fl queues.
  */
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	/* Service the queue only if it could be moved from IDLE to BUSY. */
 	if (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq_fl(iq, 0);
 	(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 }
 
 #ifdef DEV_NETMAP
 /*
  * Interrupt handler for netmap rx queues.
  */
 void
 t4_nm_intr(void *arg)
 {
 	struct sge_nm_rxq *q = arg;
 
 	/* Service the queue only if it could be moved from ON to BUSY. */
 	if (!atomic_cmpset_int(&q->nm_state, NM_ON, NM_BUSY))
 		return;
 
 	service_nm_rxq(q);
 	(void) atomic_cmpset_int(&q->nm_state, NM_BUSY, NM_ON);
 }
 
 /*
  * Interrupt handler for vectors shared between NIC and netmap rx queues.
  */
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *vec = arg;
 
 	/* The netmap rx queue on this vector is handled first... */
 	MPASS(vec->nm_rxq != NULL);
 	t4_nm_intr(vec->nm_rxq);
 
 	/* ...and then the NIC rx queue. */
 	MPASS(vec->rxq != NULL);
 	t4_intr(vec->rxq);
 }
 #endif
 
 /*
  * Deals with interrupts on an iq-only (no freelist) queue.
  */
 /*
  * Processes the response descriptors of an ingress queue that has no
  * freelist.  The iq must already be in the IQS_BUSY state.
  *
  * budget == 0: runs until the ring is empty and every queue with a forwarded
  * interrupt has been serviced, then updates the cidx and writes the queue's
  * intr_params to the GTS register.  Returns 0.
  *
  * budget != 0: returns EINPROGRESS as soon as 'budget' descriptors have been
  * processed (the cidx is updated in hardware before returning).
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type;
 	uint32_t lq;
 	/* Queues with forwarded interrupts that still have work pending. */
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	KASSERT((iq->flags & IQ_HAS_FL) == 0,
 	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
 	    iq->flags));
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
 
 	/* # of descriptors to process between cidx updates to the hardware. */
 	limit = budget ? budget : iq->qsize / 16;
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		/* A descriptor is new if its generation bit matches the iq's. */
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			/* Read the rest of the descriptor after the gen bit. */
 			rmb();
 
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 				panic("%s: data for an iq (%p) with no freelist",
 				    __func__, iq);
 
 				/* NOTREACHED */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				/* Dispatch on the CPL opcode; there is no payload. */
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (__predict_true(lq >= 1024)) {
 					t4_an_handler(iq, &d->rsp);
 					break;
 				}
 
 				/*
 				 * Forwarded interrupt: service the target queue
 				 * with a budget, unless it is not IDLE.
 				 */
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq_fl(q, q->qsize / 16) == 0) {
 						(void) atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						/* Not done; stays BUSY on the list. */
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			/* Next descriptor; the gen bit flips on wrap-around. */
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			if (__predict_false(++ndescs == limit)) {
 				/* Return credits without the queue's intr_params. */
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 				if (budget) {
 					return (EINPROGRESS);
 				}
 			}
 		}
 
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq_fl(q, q->qsize / 8) == 0)
 			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 	/* Return the remaining credits along with the queue's intr_params. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	return (0);
 }
 
 /*
  * Non-zero if the LRO control block has a non-zero lro_mbuf_max.
  */
 static inline int
 sort_before_lro(struct lro_ctrl *lro)
 {
 
 	if (lro->lro_mbuf_max == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Converts the low 60 bits of a big-endian last flit to host order, then
  * scales them by 1000000 / sc->params.vpd.cclk.
  */
 static inline uint64_t
 last_flit_to_ns(struct adapter *sc, uint64_t lf)
 {
 	const uint64_t scale = 1000000;
 	uint64_t ticks;
 
 	ticks = be64toh(lf) & 0xfffffffffffffff;	/* 60b, not 64b. */
 
 	/* Multiply first unless the product would overflow 64 bits. */
 	if (ticks <= UINT64_MAX / scale)
 		return (ticks * scale / sc->params.vpd.cclk);
 
 	return (ticks / sc->params.vpd.cclk * scale);
 }
 
 /*
  * Advances the freelist's software cidx to the next rx buffer and resets the
  * offset within the buffer.  hw_cidx is updated once every 8 buffers.
  */
 static inline void
 move_to_next_rxbuf(struct sge_fl *fl)
 {
 	uint16_t hwidx;
 
 	fl->rx_offset = 0;
 	fl->cidx++;
 
 	/* Nothing more to do unless cidx is now a multiple of 8. */
 	if (__predict_true((fl->cidx & 7) != 0))
 		return;
 
 	hwidx = fl->cidx >> 3;
 	if (__predict_false(hwidx == fl->sidx)) {
 		/* Wrap both indices around. */
 		hwidx = 0;
 		fl->cidx = 0;
 	}
 	fl->hw_cidx = hwidx;
 }
 
 /*
  * Deals with interrupts on an iq+fl queue.
  */
 /*
  * Processes the response descriptors of an ingress queue that has a
  * freelist.  The iq must already be in the IQS_BUSY state.
  *
  * budget == 0: runs until the ring is empty or a payload cannot be obtained
  * (goto out), then returns credits with the queue's intr_params, refills the
  * freelist, and returns 0.
  *
  * budget != 0: returns EINPROGRESS as soon as the running descriptor count
  * reaches 'budget'; the freelist is not refilled on this path.
  */
 static int
 service_iq_fl(struct sge_iq *iq, int budget)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct sge_fl *fl;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs, limit;
 	int rsp_type, starved;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	MPASS(iq->flags & IQ_HAS_FL);
 
 	ndescs = 0;
 #if defined(INET) || defined(INET6)
 	/*
 	 * IQ_ADJ_CREDIT means the previous run held back one credit (see the
 	 * 'out' path).  If nothing new has arrived, flush LRO and return that
 	 * credit now; otherwise start the descriptor count at 1 so that it is
 	 * returned with the rest.
 	 */
 	if (iq->flags & IQ_ADJ_CREDIT) {
 		MPASS(sort_before_lro(lro));
 		iq->flags &= ~IQ_ADJ_CREDIT;
 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
 			tcp_lro_flush_all(lro);
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
 			    V_INGRESSQID((u32)iq->cntxt_id) |
 			    V_SEINTARM(iq->intr_params));
 			return (0);
 		}
 		ndescs = 1;
 	}
 #else
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 #endif
 
 	/* # of descriptors to process between cidx updates to the hardware. */
 	limit = budget ? budget : iq->qsize / 16;
 	fl = &rxq->fl;
 	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 	/* A descriptor is new if its generation bit matches the iq's. */
 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 		/* Read the rest of the descriptor after the gen bit. */
 		rmb();
 
 		m0 = NULL;
 		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 		lq = be32toh(d->rsp.pldbuflen_qid);
 
 		switch (rsp_type) {
 		case X_RSPD_TYPE_FLBUF:
 			/*
 			 * NEWBUF: the payload starts in a fresh buffer, so
 			 * move off a partially used one first.  lq is reduced
 			 * to the payload length.
 			 */
 			if (lq & F_RSPD_NEWBUF) {
 				if (fl->rx_offset > 0)
 					move_to_next_rxbuf(fl);
 				lq = G_RSPD_LEN(lq);
 			}
 			/* Refill when hw_cidx has moved > 4 since the snapshot. */
 			if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 64);
 				FL_UNLOCK(fl);
 				fl_hw_cidx = fl->hw_cidx;
 			}
 
 			/* Ethernet frames are handled by eth_rx. */
 			if (d->rss.opcode == CPL_RX_PKT) {
 				if (__predict_true(eth_rx(sc, rxq, d, lq) == 0))
 					break;
 				/* Stop without consuming this descriptor. */
 				goto out;
 			}
 			m0 = get_fl_payload(sc, fl, lq);
 			if (__predict_false(m0 == NULL))
 				goto out;
 
 			/* fall through */
 
 		case X_RSPD_TYPE_CPL:
 			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
 			/* m0 is the freelist payload, or NULL for a pure CPL. */
 			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 			break;
 
 		case X_RSPD_TYPE_INTR:
 
 			/*
 			 * There are 1K interrupt-capable queues (qids 0
 			 * through 1023).  A response type indicating a
 			 * forwarded interrupt with a qid >= 1K is an
 			 * iWARP async notification.  That is the only
 			 * acceptable indirect interrupt on this queue.
 			 */
 			if (__predict_false(lq < 1024)) {
 				panic("%s: indirect interrupt on iq_fl %p "
 				    "with qid %u", __func__, iq, lq);
 			}
 
 			t4_an_handler(iq, &d->rsp);
 			break;
 
 		default:
 			KASSERT(0, ("%s: illegal response type %d on iq %p",
 			    __func__, rsp_type, iq));
 			log(LOG_ERR, "%s: illegal response type %d on iq %p",
 			    device_get_nameunit(sc->dev), rsp_type, iq);
 			break;
 		}
 
 		/* Next descriptor; the gen bit flips on wrap-around. */
 		d++;
 		if (__predict_false(++iq->cidx == iq->sidx)) {
 			iq->cidx = 0;
 			iq->gen ^= F_RSPD_GEN;
 			d = &iq->desc[0];
 		}
 		if (__predict_false(++ndescs == limit)) {
 			/* Return credits without the queue's intr_params. */
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 			    V_INGRESSQID(iq->cntxt_id) |
 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 
 #if defined(INET) || defined(INET6)
 			if (iq->flags & IQ_LRO_ENABLED &&
 			    !sort_before_lro(lro) &&
 			    sc->lro_timeout != 0) {
 				tcp_lro_flush_inactive(lro, &lro_timeout);
 			}
 #endif
 			if (budget)
 				return (EINPROGRESS);
 			ndescs = 0;
 		}
 	}
 out:
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
 			MPASS(sort_before_lro(lro));
 			/* hold back one credit and don't flush LRO state */
 			iq->flags |= IQ_ADJ_CREDIT;
 			ndescs--;
 		} else {
 			tcp_lro_flush_all(lro);
 		}
 	}
 #endif
 
 	/* Return the remaining credits along with the queue's intr_params. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	/* Top up the freelist; a starving one is handed to add_fl_to_sfl. */
 	FL_LOCK(fl);
 	starved = refill_fl(sc, fl, 64);
 	FL_UNLOCK(fl);
 	if (__predict_false(starved != 0))
 		add_fl_to_sfl(sc, fl);
 
 	return (0);
 }
 
 /*
  * Returns the address at offset sd->moff inside the descriptor's cluster,
  * typed as the cluster's metadata.
  */
 static inline struct cluster_metadata *
 cl_metadata(struct fl_sdesc *sd)
 {
 	void *where = sd->cl + sd->moff;
 
 	return (where);
 }
 
 /*
  * Gives the cluster recorded in the metadata (the mbuf's ext_arg1) back to
  * its zone and counts the release in extfree_rels.
  */
 static void
 rxb_free(struct mbuf *m)
 {
 	struct cluster_metadata *meta;
 
 	meta = m->m_ext.ext_arg1;
 	uma_zfree(meta->zone, meta->cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * The mbuf returned comes from zone_mbuf and carries the payload in one of
  * these ways
  * a) complete frame inside the mbuf
  * b) m_cljset (for clusters without metadata)
  * c) m_extaddref (cluster with metadata)
  */
 /*
  * Wraps the next piece of a frame, taken from the freelist's current rx
  * buffer, in a new mbuf.
  *
  * fr_offset is the number of bytes of the frame already consumed (0 for the
  * first segment, which gets a packet header with pkthdr.len = remaining) and
  * remaining is the number of bytes still to be consumed.
  *
  * Returns the mbuf (m_len is the number of bytes taken) or NULL if an mbuf
  * could not be allocated, in which case the freelist state is unchanged.
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	struct cluster_metadata *clm;
 	int len, blen;
 	caddr_t payload;
 
 	/*
 	 * len is the amount of frame data in this buffer and blen is the
 	 * amount of buffer space that it uses up (data + padding).
 	 */
 	if (fl->flags & FL_BUF_PACKING) {
 		u_int l, pad;
 
 		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
 		len = min(remaining, blen);
 		payload = sd->cl + fl->rx_offset;
 
 		/* Pad up to buf_boundary unless that uses up the whole buffer. */
 		l = fr_offset + len;
 		pad = roundup2(l, fl->buf_boundary) - l;
 		if (fl->rx_offset + len + pad < rxb->size2)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= rxb->size2);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 		blen = rxb->size1;
 		len = min(remaining, blen);
 		payload = sd->cl;
 	}
 
 	/* Only the first segment of a frame gets a packet header. */
 	if (fr_offset == 0) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (__predict_false(m == NULL))
 			return (NULL);
 		m->m_pkthdr.len = remaining;
 	} else {
 		m = m_get(M_NOWAIT, MT_DATA);
 		if (__predict_false(m == NULL))
 			return (NULL);
 	}
 	m->m_len = len;
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->rx_offset += blen;
 			MPASS(fl->rx_offset <= rxb->size2);
 			if (fl->rx_offset < rxb->size2)
 				return (m);	/* without advancing the cidx */
 		}
 	} else if (fl->flags & FL_BUF_PACKING) {
 		/*
 		 * The mbuf refers to the data in place.  The metadata is set
 		 * up when the first mbuf is attached to this cluster.
 		 */
 		clm = cl_metadata(sd);
 		if (sd->nmbuf++ == 0) {
 			clm->refcount = 1;
 			clm->zone = rxb->zone;
 			clm->cl = sd->cl;
 			counter_u64_add(extfree_refs, 1);
 		}
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm,
 		    NULL);
 
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= rxb->size2);
 		if (fl->rx_offset < rxb->size2)
 			return (m);	/* without advancing the cidx */
 	} else {
 		/* The whole cluster is handed over to the mbuf. */
 		m_cljset(m, sd->cl, rxb->type);
 		sd->cl = NULL;	/* consumed, not a recycle candidate */
 	}
 
 	/* Done with this rx buffer. */
 	move_to_next_rxbuf(fl);
 
 	return (m);
 }
 
 /*
  * Assembles an mbuf chain for a payload of plen bytes out of the freelist's
  * rx buffers.
  *
  * Returns the chain (first mbuf has a packet header) or NULL if an mbuf could
  * not be allocated.  If the failure happens after the first segment, the
  * partial chain is parked in the freelist (m0, pnext, remaining) with
  * FL_BUF_RESUME set and the next call picks up from there.
  */
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 
 	/* Continue with the chain that an earlier call had to abandon. */
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == plen);
 		MPASS(fl->remaining < plen);
 
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		goto get_segment;
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'plen' and it may span multiple hw buffers.
 	 */
 
 	m0 = get_scatter_segment(sc, fl, 0, plen);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = plen - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		/* Every segment after the first starts at a buffer's beginning. */
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, plen - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			/* Save the progress so far for the next attempt. */
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 /*
  * Consumes the next piece of a frame from the freelist's current rx buffer
  * without creating an mbuf for it.  fr_offset is the number of bytes of the
  * frame already consumed and remaining is what's left.  Returns the number of
  * bytes of the frame that were skipped.
  */
 static int
 skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	int len, blen;
 	u_int end, pad;
 
 	if ((fl->flags & FL_BUF_PACKING) == 0) {
 		/* Not packing: one buffer per segment, always used up. */
 		MPASS(fl->rx_offset == 0);
 		len = min(remaining, rxb->size1);
 		move_to_next_rxbuf(fl);
 		return (len);
 	}
 
 	blen = rxb->size2 - fl->rx_offset;	/* room left in this buffer */
 	len = min(remaining, blen);
 
 	/* Pad up to buf_boundary unless that uses up the whole buffer. */
 	end = fr_offset + len;
 	pad = roundup2(end, fl->buf_boundary) - end;
 	if (fl->rx_offset + len + pad < rxb->size2)
 		blen = len + pad;
 
 	fl->rx_offset += blen;
 	MPASS(fl->rx_offset <= rxb->size2);
 
 	/* The cidx advances only when the buffer has been used up. */
 	if (fl->rx_offset >= rxb->size2)
 		move_to_next_rxbuf(fl);
 
 	return (len);
 }
 
 /*
  * Consumes a payload of plen bytes from the freelist, one segment at a time,
  * without building an mbuf chain for it.
  */
 static inline void
 skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen)
 {
 	int skipped, left, n;
 
 	for (skipped = 0, left = plen; left > 0; skipped += n, left -= n)
 		n = skip_scatter_segment(sc, fl, skipped, left);
 }
 
 /*
  * Returns how much of a plen byte payload is available in the freelist's
  * current rx buffer.
  */
 static inline int
 get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen)
 {
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	int room;
 
 	if ((fl->flags & FL_BUF_PACKING) == 0)
 		room = rxb->size1;			/* entire buffer */
 	else
 		room = rxb->size2 - fl->rx_offset;	/* rest of the buffer */
 
 	return (min(plen, room));
 }
 
 /*
  * Receives one Ethernet frame (a CPL_RX_PKT with plen bytes of payload in the
  * freelist) described by the response descriptor d.
  *
  * The frame is offered to the VI's pfil hooks first (if any are hooked in),
  * then turned into an mbuf chain, tagged with the hash, checksum, VLAN, and
  * timestamp information from the descriptor, and passed to LRO or if_input.
  *
  * Returns 0 if the frame was consumed (delivered, queued, or dropped by a
  * pfil hook) and ENOMEM if an mbuf chain could not be built for it.
  */
 static int
 eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d,
     u_int plen)
 {
 	struct mbuf *m0;
 	struct ifnet *ifp = rxq->ifp;
 	struct sge_fl *fl = &rxq->fl;
 	struct vi_info *vi = ifp->if_softc;
 	const struct cpl_rx_pkt *cpl;
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	/* Indexed by the descriptor's [rss.hash_type][rss.ipv6]. */
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 
 	MPASS(plen > sc->params.sge.fl_pktshift);
 	/*
 	 * Run the pfil hooks on the frame while it is still in the rx buffer.
 	 * This is skipped when a partially assembled chain is waiting to be
 	 * resumed (FL_BUF_RESUME).
 	 */
 	if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
 	    __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
 		struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 		caddr_t frame;
 		int rc, slen;
 
 		/* Only what's in the first rx buffer is shown to the hooks. */
 		slen = get_segment_len(sc, fl, plen) -
 		    sc->params.sge.fl_pktshift;
 		frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift;
 		CURVNET_SET_QUIET(ifp->if_vnet);
 		rc = pfil_run_hooks(vi->pfil, frame, ifp,
 		    slen | PFIL_MEMPTR | PFIL_IN, NULL);
 		CURVNET_RESTORE();
 		if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) {
 			/* The frame is gone; just release its rx buffers. */
 			skip_fl_payload(sc, fl, plen);
 			return (0);
 		}
 		if (rc == PFIL_REALLOCED) {
 			/* Use the mbuf from pfil instead of the rx buffers. */
 			skip_fl_payload(sc, fl, plen);
 			m0 = pfil_mem2mbuf(frame);
 			goto have_mbuf;
 		}
 	}
 
 	m0 = get_fl_payload(sc, fl, plen);
 	if (__predict_false(m0 == NULL))
 		return (ENOMEM);
 
 	/* Strip the packet shift from the front of the frame. */
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 have_mbuf:
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]);
 	m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
 
 	/* The CPL message follows the RSS header in the descriptor. */
 	cpl = (const void *)(&d->rss + 1);
 	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
 		if (ifp->if_capenable & IFCAP_RXCSUM &&
 		    cpl->l2info & htobe32(F_RXF_IP)) {
 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 		    cpl->l2info & htobe32(F_RXF_IP6)) {
 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
 			    CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		}
 
 		/* IP fragments carry the checksum from the CPL, others 0xffff. */
 		if (__predict_false(cpl->ip_frag))
 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		else
 			m0->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_ex) {
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 	if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
 		/*
 		 * Fill up rcv_tstmp but do not set M_TSTMP.
 		 * rcv_tstmp is not in the format that the
 		 * kernel expects and we don't want to mislead
 		 * it.  For now this is only for custom code
 		 * that knows how to interpret cxgbe's stamp.
 		 */
 		m0->m_pkthdr.rcv_tstmp =
 		    last_flit_to_ns(sc, d->rsp.u.last_flit);
 #ifdef notyet
 		m0->m_flags |= M_TSTMP;
 #endif
 	}
 
 #ifdef NUMA
 	m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
 #endif
 #if defined(INET) || defined(INET6)
 	/* Only frames with a TCP hash type are offered to LRO. */
 	if (rxq->iq.flags & IQ_LRO_ENABLED &&
 	    (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
 	    M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
 		if (sort_before_lro(lro)) {
 			tcp_lro_queue_mbuf(lro, m0);
 			return (0); /* queued for sort, then LRO */
 		}
 		if (tcp_lro_rx(lro, m0, 0) == 0)
 			return (0); /* queued for LRO */
 	}
 #endif
 	/* Not taken by LRO: hand the frame to the stack directly. */
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Deferred drain handler for a work request queue.  Must drain the wrq or
  * make sure that someone else will: with the EQ lock held, the queued work
  * requests are pushed out unless there are incomplete WRs outstanding or
  * the list is empty.
  *
  * arg is the struct sge_wrq to drain; n is not used.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *q = arg;
 
 	EQ_LOCK(&q->eq);
 	if (!STAILQ_EMPTY(&q->wr_list) && TAILQ_EMPTY(&q->incomplete_wrs))
 		drain_wrq_wr_list(q->adapter, q);
 	EQ_UNLOCK(&q->eq);
 }
 
 /*
  * Copies as many queued work requests as will fit from wrq->wr_list into the
  * egress queue's descriptor ring and rings the doorbell for them.  Stops at
  * the first WR that needs more descriptors than are currently available and
  * leaves it, and everything behind it, on the list.
  *
  * Must be called with the EQ lock held, with no incomplete WRs outstanding,
  * with at least one WR on the list, and with the doorbell caught up to pidx
  * (all of these are asserted below).
  */
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	/* Descriptors written so far but not yet announced via the doorbell. */
 	dbdiff = 0;
 
 	do {
 		/* Refresh cidx and work out how many descriptors can be used. */
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		/* Number of descriptors this WR occupies. */
 		n = howmany(wr->wr_len, EQ_ESIZE);
 		if (available < n)
 			break;
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			/* Bytes that fit between pidx and the end of the ring. */
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				/* The remainder goes at the start of the ring. */
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			/* pidx after wrapping around at sidx. */
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		/*
 		 * Running low on descriptors: flag this WR to request an
 		 * egress update, but only if eq->equiq was successfully
 		 * switched from 0 to 1 here.
 		 */
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 		}
 
 		/* Batch doorbell rings: one per 16 or more descriptors. */
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		/* The WR is in the ring now; release the software copy. */
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	/* Announce whatever is left over from the last batch. */
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Queues a work request on wrq and tries to push the queue out to the
  * hardware.  Doesn't fail: a work request that can't be sent right away is
  * held on wr_list.  The caller must own the EQ lock.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);
 
 	/* Account for the new WR and append it to the pending list. */
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 	wrq->nwr_pending++;
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 
 	/*
 	 * If there are incomplete WRs then commit_wrq_wr will drain wr_list
 	 * as well, so there is nothing more to do here.
 	 */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		drain_wrq_wr_list(sc, wrq);
 
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(eq->pidx == eq->dbidx);
 	}
 }
 
 /*
  * Recomputes the refill source (fl->zidx) of every freelist that belongs to
  * the interface's VI, based on the interface's current MTU.  Each freelist is
  * updated under its own lock.
  */
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_fl *freelist;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	int idx, maxp;
 
 	maxp = mtu_to_max_payload(sc, ifp->if_mtu);
 
 	/* NIC rx queues. */
 	for_each_rxq(vi, idx, rxq) {
 		freelist = &rxq->fl;
 
 		FL_LOCK(freelist);
 		freelist->zidx = find_refill_source(sc, maxp,
 		    freelist->flags & FL_BUF_PACKING);
 		FL_UNLOCK(freelist);
 	}
 #ifdef TCP_OFFLOAD
 	/* Offload rx queues. */
 	for_each_ofld_rxq(vi, idx, ofld_rxq) {
 		freelist = &ofld_rxq->fl;
 
 		FL_LOCK(freelist);
 		freelist->zidx = find_refill_source(sc, maxp,
 		    freelist->flags & FL_BUF_PACKING);
 		FL_UNLOCK(freelist);
 	}
 #endif
 }
 
 /*
  * Returns the segment count that set_mbuf_nsegs stored in the packet header
  * (kept in the l5hlen field).  Asserts that a non-zero count was recorded.
  */
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 	int count;
 
 	M_ASSERTPKTHDR(m);
 	count = m->m_pkthdr.l5hlen;
 	KASSERT(count > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
 	return (count);
 }
 
 /*
  * Records the number of segments for this packet in the packet header's
  * l5hlen field.
  */
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 	struct pkthdr *hdr;
 
 	M_ASSERTPKTHDR(m);
 	hdr = &m->m_pkthdr;
 	hdr->l5hlen = nsegs;
 }
 
 static inline int
 mbuf_cflags(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[4]);
 }
 
 /*
  * Stores the driver's per-packet flags (MC_*) in byte 4 of the packet
  * header's PH_loc area.
  */
 static inline void
 set_mbuf_cflags(struct mbuf *m, uint8_t flags)
 {
 	struct pkthdr *hdr;
 
 	M_ASSERTPKTHDR(m);
 	hdr = &m->m_pkthdr;
 	hdr->PH_loc.eight[4] = flags;
 }
 
 /*
  * Returns the work request length, in 16-byte units, that was stored in
  * byte 0 of the packet header's PH_loc area.  The range check is skipped for
  * packets flagged MC_TLS.
  */
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[0];
 	if ((mbuf_cflags(m) & MC_TLS) == 0)
 		MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 /*
  * Stores the work request length, in 16-byte units, in byte 0 of the packet
  * header's PH_loc area.
  */
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 	struct pkthdr *hdr;
 
 	M_ASSERTPKTHDR(m);
 	hdr = &m->m_pkthdr;
 	hdr->PH_loc.eight[0] = len16;
 }
 
 #ifdef RATELIMIT
 static inline int
 mbuf_eo_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[1]);
 }
 
 /*
  * Stores the EO segment count in byte 1 of the packet header's PH_loc area.
  */
 static inline void
 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 	struct pkthdr *hdr;
 
 	M_ASSERTPKTHDR(m);
 	hdr = &m->m_pkthdr;
 	hdr->PH_loc.eight[1] = nsegs;
 }
 
 /*
  * Returns the EO work request length, in 16-byte units, stored in byte 2 of
  * the packet header's PH_loc area.  The value is asserted to be within
  * (0, SGE_MAX_WR_LEN / 16].
  */
 static inline int
 mbuf_eo_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 
 	n = m->m_pkthdr.PH_loc.eight[2];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 	return (n);
 }
 
 /*
  * Stores the EO work request length, in 16-byte units, in byte 2 of the
  * packet header's PH_loc area.
  */
 static inline void
 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
 {
 	struct pkthdr *hdr;
 
 	M_ASSERTPKTHDR(m);
 	hdr = &m->m_pkthdr;
 	hdr->PH_loc.eight[2] = len16;
 }
 
 static inline int
 mbuf_eo_tsclk_tsoff(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[3]);
 }
 
 /*
  * Stores the combined TSCLK/TSOFF value in byte 3 of the packet header's
  * PH_loc area.
  */
 static inline void
 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
 {
 	struct pkthdr *hdr;
 
 	M_ASSERTPKTHDR(m);
 	hdr = &m->m_pkthdr;
 	hdr->PH_loc.eight[3] = tsclk_tsoff;
 }
 
 /*
  * Returns non-zero if a send tag is present and it is a rate limit tag.
  */
 static inline int
 needs_eo(struct cxgbe_snd_tag *cst)
 {
 
 	if (cst == NULL)
 		return (0);
 
 	return (cst->type == IF_SND_TAG_TYPE_RATE_LIMIT);
 }
 #endif
 
 /*
  * Try to allocate a single mbuf (never a chain, so that the work request is
  * easy to construct) large enough to contain a raw work request of len
  * bytes.  how is passed through to the mbuf allocator.
  *
  * Returns NULL if len is larger than MCLBYTES or if the allocation fails.
  * On success the mbuf's lengths are set to len and it is tagged MC_RAW_WR
  * with its len16 recorded.
  */
 struct mbuf *
 alloc_wr_mbuf(int len, int how)
 {
 	struct mbuf *m;
 
 	if (len <= MHLEN) {
 		/* Fits in the data area of a packet header mbuf. */
 		m = m_gethdr(how, MT_DATA);
 	} else {
 		if (len > MCLBYTES)
 			return (NULL);
 		m = m_getcl(how, MT_DATA, M_PKTHDR);
 	}
 	if (m == NULL)
 		return (NULL);
 
 	m->m_len = len;
 	m->m_pkthdr.len = len;
 	set_mbuf_cflags(m, MC_RAW_WR);
 	set_mbuf_len16(m, howmany(len, 16));
 
 	return (m);
 }
 
 static inline int
 needs_hwcsum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP |
 	    CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6));
 }
 
 static inline int
 needs_tso(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & CSUM_TSO);
 }
 
 static inline int
 needs_l3_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
 }
 
 static inline int
 needs_tcp_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
 }
 
 #ifdef RATELIMIT
 static inline int
 needs_l4_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
 	    CSUM_TCP_IPV6 | CSUM_TSO));
 }
 
 static inline int
 needs_udp_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
 }
 #endif
 
 /*
  * Returns the M_VLANTAG bit of the mbuf's flags (non-zero if it is set).
  */
 static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 	int tagged;
 
 	M_ASSERTPKTHDR(m);
 	tagged = m->m_flags & M_VLANTAG;
 
 	return (tagged);
 }
 
 /*
  * Moves a (mbuf, offset) position forward by len bytes along the mbuf chain.
  * On return *pm and *poffset identify the new position and the return value
  * is the address of the data there.  The new position is always strictly
  * inside an mbuf: a position at the very end of one mbuf is reported as
  * offset 0 of the next.  The chain must be long enough (asserted).
  */
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *m = *pm;
 	int offset = *poffset;
 
 	MPASS(len > 0);
 
 	/* Step over every mbuf that the target position lies beyond. */
 	while (offset + len >= m->m_len) {
 		len -= m->m_len - offset;
 		m = m->m_next;
 		offset = 0;
 		MPASS(m != NULL);
 	}
 	offset += len;
 
 	*poffset = offset;
 	*pm = m;
 	return ((void *)(mtod(m, uintptr_t) + offset));
 }
 
 /*
  * Counts the gather list segments needed for an M_NOMAP (ext_pgs) mbuf,
  * ignoring the first skip bytes of its data.  The mbuf's data is walked in
  * three parts: the inline header (m_epg_hdr), the pages (m_epg_pa), and the
  * inline trailer (m_epg_trail).
  *
  * *nextaddr is the physical address just past the previous segment.  A piece
  * that starts exactly there is not counted as a new segment.  *nextaddr is
  * updated as each piece is processed so that the caller can carry it over to
  * the next mbuf.
  *
  * Returns the number of new segments.
  */
 static inline int
 count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
 {
 	vm_paddr_t paddr;
 	int i, len, off, pglen, pgoff, seglen, segoff;
 	int nsegs = 0;
 
 	MBUF_EXT_PGS_ASSERT(m);
 	/* off/len: where to start within the mbuf and how much to cover. */
 	off = mtod(m, vm_offset_t);
 	len = m->m_len;
 	off += skip;
 	len -= skip;
 
-	if (m->m_ext_pgs.hdr_len != 0) {
-		if (off >= m->m_ext_pgs.hdr_len) {
-			off -= m->m_ext_pgs.hdr_len;
+	if (m->m_epg_hdrlen != 0) {
+		if (off >= m->m_epg_hdrlen) {
+			off -= m->m_epg_hdrlen;
 		} else {
-			seglen = m->m_ext_pgs.hdr_len - off;
+			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			paddr = pmap_kextract(
 			    (vm_offset_t)&m->m_epg_hdr[segoff]);
 			if (*nextaddr != paddr)
 				nsegs++;
 			*nextaddr = paddr + seglen;
 		}
 	}
-	pgoff = m->m_ext_pgs.first_pg_off;
-	for (i = 0; i < m->m_ext_pgs.npgs && len > 0; i++) {
+	pgoff = m->m_epg_1st_off;
+	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		/* This page lies entirely within the part being skipped. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		paddr = m->m_epg_pa[i] + segoff;
 		if (*nextaddr != paddr)
 			nsegs++;
 		*nextaddr = paddr + seglen;
 		/* Only the first page can start at a non-zero offset. */
 		pgoff = 0;
 	};
 	/* Whatever is left after the pages comes from the trailer. */
 	if (len != 0) {
-		seglen = min(len, m->m_ext_pgs.trail_len - off);
+		seglen = min(len, m->m_epg_trllen - off);
 		len -= seglen;
 		paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]);
 		if (*nextaddr != paddr)
 			nsegs++;
 		*nextaddr = paddr + seglen;
 	}
 
 	return (nsegs);
 }
 
 
 /*
  * Counts the gather list segments needed for the mbuf chain, ignoring its
  * first skip bytes.  Data that starts at the physical address where the
  * previous segment ended is merged into that segment.  MC_NOMAP is set in
  * *cflags if any M_NOMAP mbuf contributes data.
  *
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.  It is possible for this
  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
 {
 	vm_paddr_t nextaddr = 0;
 	vm_offset_t va;
 	int len, nsegs = 0;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len > 0);
 	MPASS(m->m_pkthdr.len >= skip);
 
 	for (; m; m = m->m_next) {
 		len = m->m_len;
 		if (__predict_false(len == 0)) {
 			/* Empty mbuf, nothing to count. */
 		} else if (skip >= len) {
 			/* This mbuf is skipped in its entirety. */
 			skip -= len;
 		} else if ((m->m_flags & M_NOMAP) != 0) {
 			*cflags |= MC_NOMAP;
 			nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
 			skip = 0;
 		} else {
 			va = mtod(m, vm_offset_t) + skip;
 			len -= skip;
 			skip = 0;
 			nsegs += sglist_count((void *)(uintptr_t)va, len);
 			/* Contiguous with the previous segment: merge. */
 			if (pmap_kextract(va) == nextaddr)
 				nsegs--;
 			nextaddr = pmap_kextract(va + len - 1) + 1;
 		}
 	}
 
 	return (nsegs);
 }
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  *
  * The results are recorded in the packet header: the number of gather list
  * segments, the driver's cflags, the len16 of the work request and, for
  * packets that need checksum assistance or EO, the L2/L3/L4 header lengths.
  *
  * Returns 0 on success.  On failure the mbuf is freed, *mp is set to NULL,
  * and an errno is returned: EINVAL if the packet is shorter than an Ethernet
  * header, EFBIG if the segment count could not be brought within limits, or
  * whatever t6_ktls_parse_pkt returned.
  */
 int
 parse_pkt(struct adapter *sc, struct mbuf **mp)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	struct cxgbe_snd_tag *cst;
 #endif
 	uint16_t eh_type;
 	uint8_t cflags;
 
 	cflags = 0;
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 		/* Common error exit; also the target of the goto fail's below. */
 fail:
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0, 0, &cflags);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	/* Pick up the driver's send tag, if the packet carries one. */
 	if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		cst = mst_to_cst(m0->m_pkthdr.snd_tag);
 	else
 		cst = NULL;
 #endif
 #ifdef KERN_TLS
 	/* Packets with a TLS send tag are parsed entirely by the TLS code. */
 	if (cst != NULL && cst->type == IF_SND_TAG_TYPE_TLS) {
 		int len16;
 
 		cflags |= MC_TLS;
 		set_mbuf_cflags(m0, cflags);
 		rc = t6_ktls_parse_pkt(m0, &nsegs, &len16);
 		if (rc != 0)
 			goto fail;
 		set_mbuf_nsegs(m0, nsegs);
 		set_mbuf_len16(m0, len16);
 		return (0);
 	}
 #endif
 	/* Too many segments: defrag once, then recount from the top. */
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	/*
 	 * A small packet (no more than MHLEN bytes) spread over more than two
 	 * segments, without unmapped data: pull all of it into one mbuf and
 	 * recount.
 	 */
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
 	    !(cflags & MC_NOMAP))) {
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	set_mbuf_cflags(m0, cflags);
 	/* VFs use a different work request, with its own length. */
 	if (sc->flags & IS_VF)
 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
 #ifdef RATELIMIT
 	/*
 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
 	 * checksumming is enabled.  needs_l4_csum happens to check for all the
 	 * right things.
 	 */
 	if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) {
 		/* Drop the rate limit tag; the packet goes out without EO. */
 		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
 		m0->m_pkthdr.snd_tag = NULL;
 		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		cst = NULL;
 	}
 #endif
 
 	/* The header lengths are needed only for hw checksumming or EO. */
 	if (!needs_hwcsum(m0)
 #ifdef RATELIMIT
    		 && !needs_eo(cst)
 #endif
 	)
 		return (0);
 
 	/* L2: plain Ethernet or a VLAN encapsulated frame. */
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* L3: locate the header that follows the L2 header in the chain. */
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = l3hdr;
 
 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
 
 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
 		break;
 	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		/* ip_hl is in units of 4 bytes. */
 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
 		break;
 	}
 #endif
 	default:
 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
 		    " with the same INET/INET6 options as the kernel.",
 		    __func__, eh_type);
 	}
 
 #if defined(INET) || defined(INET6)
 	/* L4: TCP (th_off is in units of 4 bytes), or UDP with RATELIMIT. */
 	if (needs_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
 		/*
 		 * Checks the 32 bits that immediately follow the fixed TCP
 		 * header against 0x0101080a.
 		 * NOTE(review): that looks like the NOP, NOP, timestamp option
 		 * layout -- confirm.
 		 */
 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
 			set_mbuf_eo_tsclk_tsoff(m0,
 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
 		} else
 			set_mbuf_eo_tsclk_tsoff(m0, 0);
 	} else if (needs_udp_csum(m0)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
 	}
 #ifdef RATELIMIT
 	if (needs_eo(cst)) {
 		u_int immhdrs;
 
 		/* EO WRs have the headers in the WR and not the GL. */
 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
 		    m0->m_pkthdr.l4hlen;
 		/* Recount the segments with the headers left out. */
 		cflags = 0;
 		nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
 		MPASS(cflags == mbuf_cflags(m0));
 		set_mbuf_eo_nsegs(m0, nsegs);
 		set_mbuf_eo_len16(m0,
 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
 	}
 #endif
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 /*
  * Obtains room for a work request of len16 16-byte units and returns a
  * pointer to where the caller should construct it.  The room is one of:
  * a) descriptors in the hardware ring itself,
  * b) the wrq's ss buffer, if the WR would run past the end of the ring
  *    (commit_wrq_wr copies it into the ring in two pieces), or
  * c) a newly allocated wrqe, if WRs are already queued on wr_list or the
  *    ring doesn't have enough descriptors available.  cookie->pidx is set
  *    to -1 in this case.
  *
  * Returns NULL if case c) applies and alloc_wrqe fails.  The cookie is
  * filled in for use by commit_wrq_wr.
  */
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	/* Number of hardware descriptors needed for the WR. */
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	/* Try to send out the WRs that are already queued up. */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	/* WRs still queued: this one gets a wrqe instead of ring space. */
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 slowpath:
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	/* Refresh cidx and see whether the ring has room for the WR. */
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	/* Reserve the descriptors and track the WR as incomplete. */
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
 		/* The WR would run past the end of the ring; use ss. */
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 /*
  * Completes a work request that was started with start_wrq_wr.  w and cookie
  * are the pointer returned by, and the cookie filled in by, that call.
  *
  * A WR that was built in a wrqe (cookie->pidx is -1) is handed to t4_wrq_tx.
  * Otherwise the WR is already in the ring, or is copied there from the ss
  * buffer, and the doorbell is rung if this is the oldest incomplete WR.  If
  * it isn't, its descriptors are added to a neighboring incomplete WR's
  * cookie so that they are covered when that WR is committed.
  */
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		/* w points into a wrqe; recover the wrqe and queue it. */
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	if (__predict_false(w == &wrq->ss[0])) {
 		/* Bytes that fit between ss_pidx and the end of the ring. */
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		/* First on the incomplete list; the doorbell is right here. */
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16) {
 			int available;
 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 			/*
 			 * Note that the WR via which we'll request tx updates
 			 * is at pidx and not eq->pidx, which has moved on
 			 * already.
 			 */
 			dst = (void *)&eq->desc[pidx];
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 			if (available < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 			}
 
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		} else {
 			/*
 			 * Fewer than 16 descriptors and another incomplete WR
 			 * follows: fold these descriptors into the next WR's
 			 * cookie instead of ringing the doorbell now.
 			 */
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		/* An older WR is still incomplete; add to its descriptors. */
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	/* That was the last incomplete WR; send out any queued up WRs. */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Returns non-zero if more than an eighth of the egress queue's descriptors
  * are available for tx.  The egress queue is the ring's cookie.
  */
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *txeq = r->cookie;
 
 	return (txeq->sidx / 8 < total_available_tx_desc(txeq));
 }
 
 /*
  * Returns non-zero if the packet needs TSO or is flagged MC_RAW_WR or
  * MC_TLS.
  */
 static inline int
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
 
 	if (needs_tso(m))
 		return (1);
 
 	return ((mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0);
 }
 
 /*
  * Returns non-zero unless the egress queue has EQ_ENABLED set and EQ_QFLUSH
  * clear.
  */
 static inline int
 discard_tx(struct sge_eq *eq)
 {
 
 	if ((eq->flags & (EQ_QFLUSH | EQ_ENABLED)) == EQ_ENABLED)
 		return (0);
 
 	return (1);
 }
 
 static inline int
 wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
 {
 
 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
 	case FW_ULPTX_WR:
 	case FW_ETH_TX_PKT_WR:
 	case FW_ETH_TX_PKTS_WR:
 	case FW_ETH_TX_PKTS2_WR:
 	case FW_ETH_TX_PKT_VM_WR:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  *
  * Each item is an mbuf (packet) with the information needed to transmit it
  * already recorded in its packet header.  The work requests are written
  * directly into the tx queue's descriptor ring, with the txq lock held.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
 {
 	struct sge_txq *txq = r->cookie;
 	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n, next_cidx;
 	struct mbuf *m0, *tail;
 	struct txpkts txp;
 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	MPASS(remaining > 0);	/* Must not be called without work to do. */
 	total = 0;
 
 	TXQ_LOCK(txq);
 	if (__predict_false(discard_tx(eq))) {
 		/*
 		 * The queue isn't in a state to transmit.  Free all the
 		 * packets and report them as consumed.
 		 */
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, 2048);
 		total = remaining;
 		goto done;
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	/* Descriptors written but not yet announced via the doorbell. */
 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
 
 	while (remaining > 0) {
 
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		/* Not enough descriptors for this packet: try to reclaim. */
 		if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) {
 			MPASS(howmany(mbuf_len16(m0), EQ_ESIZE / 16) <= 64);
 			available += reclaim_tx_descs(txq, 64);
 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
 				break;	/* out of descriptors */
 		}
 
 		next_cidx = cidx + 1;
 		if (__predict_false(next_cidx == r->size))
 			next_cidx = 0;
 
 		/*
 		 * Write the work request at pidx.  The kind of WR depends on
 		 * the packet's cflags, whether this is a VF, and whether this
 		 * packet and the next one can be coalesced into one WR.
 		 */
 		wr = (void *)&eq->desc[eq->pidx];
 		if (mbuf_cflags(m0) & MC_RAW_WR) {
 			total++;
 			remaining--;
 			n = write_raw_wr(txq, (void *)wr, m0, available);
 #ifdef KERN_TLS
 		} else if (mbuf_cflags(m0) & MC_TLS) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = t6_ktls_write_wr(txq,(void *)wr, m0,
 			    mbuf_nsegs(m0), available);
 #endif
 		} else if (sc->flags & IS_VF) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
 			    available);
 		} else if (remaining > 1 &&
 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
 
 			/* pkts at cidx, next_cidx should both be in txp. */
 			MPASS(txp.npkt == 2);
 			tail = r->items[next_cidx];
 			MPASS(tail->m_nextpkt == NULL);
 			ETHER_BPF_MTAP(ifp, m0);
 			ETHER_BPF_MTAP(ifp, tail);
 			m0->m_nextpkt = tail;
 
 			if (__predict_false(++next_cidx == r->size))
 				next_cidx = 0;
 
 			/*
 			 * Keep adding packets to txp, linking them together
 			 * via m_nextpkt, until one doesn't fit or there are
 			 * no more.
 			 */
 			while (next_cidx != pidx) {
 				if (add_to_txpkts(r->items[next_cidx], &txp,
 				    available) != 0)
 					break;
 				tail->m_nextpkt = r->items[next_cidx];
 				tail = tail->m_nextpkt;
 				ETHER_BPF_MTAP(ifp, tail);
 				if (__predict_false(++next_cidx == r->size))
 					next_cidx = 0;
 			}
 
 			n = write_txpkts_wr(sc, txq, wr, m0, &txp, available);
 			total += txp.npkt;
 			remaining -= txp.npkt;
 		} else {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_wr(sc, txq, (void *)wr, m0, available);
 		}
 		/* n is the number of descriptors used by the WR. */
 		MPASS(n >= 1 && n <= available);
 		if (!(mbuf_cflags(m0) & MC_TLS))
 			MPASS(n <= SGE_MAX_WR_NDESC);
 
 		available -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		/*
 		 * Request an egress update via this WR if it is of a type
 		 * that can carry the request: EQUIQ|EQUEQ when fewer than a
 		 * quarter of the descriptors are available (and eq->equiq
 		 * was switched from 0 to 1 here), else EQUEQ alone once 32
 		 * or more descriptors have been used since the last request.
 		 */
 		if (wr_can_update_eq(wr)) {
 			if (total_available_tx_desc(eq) < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >=
 			    32) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			}
 		}
 
 		/*
 		 * Ring the doorbell mid-loop only if enough descriptors have
 		 * accumulated and there are still a few packets to go.
 		 */
 		if (dbdiff >= 16 && remaining >= 4) {
 			ring_eq_db(sc, eq, dbdiff);
 			available += reclaim_tx_descs(txq, 4 * dbdiff);
 			dbdiff = 0;
 		}
 
 		cidx = next_cidx;
 	}
 	/* Announce whatever hasn't been announced yet. */
 	if (dbdiff != 0) {
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	}
 done:
 	TXQ_UNLOCK(txq);
 
 	return (total);
 }
 
 /*
  * Sets up the software state of an ingress queue.
  *
  * tmr_idx selects the interrupt holdoff timer and must be a valid index.
  * pktc_idx selects the packet count threshold; a negative value means the
  * packet counter is not to be used.  qsize is rounded up to a multiple of 16.
  */
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 
 	iq->adapter = sc;
 	iq->flags = 0;
 
 	if (pktc_idx >= 0) {
 		/* Holdoff timer and packet counter both in use. */
 		iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx) | F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	} else {
 		iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 		iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 	}
 
 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
 	/* sidx is the queue size less the entries taken by the status page. */
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 /*
  * Sets up the software state of a freelist: its size, lock name, whether it
  * uses buffer packing, and its refill sources.  maxp is passed on to
  * find_refill_source.
  */
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 
 	if (sc->flags & BUF_PACKING_OK) {
 		/*
 		 * T4: disabled unless buffer_packing is 1.
 		 * T5+: enabled unless buffer_packing is 0.
 		 */
 		if (is_t4(sc) ? buffer_packing == 1 : buffer_packing != 0)
 			fl->flags |= FL_BUF_PACKING;
 	}
 
 	fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING);
 	fl->safe_zidx = sc->sge.safe_zidx;
 }
 
 /*
  * Sets up the software state of an egress queue: its type (kept in the
  * flags), tx channel, associated ingress queue id, size, and lock name.
  */
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, uint16_t iqid, char *name)
 {
 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
 
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 	eq->flags = eqtype & EQ_TYPEMASK;
 	eq->iqid = iqid;
 	eq->tx_chan = tx_chan;
 	/* sidx is the queue size less the entries taken by the status page. */
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 }
 
 /*
  * Allocates DMA memory for a descriptor ring of len bytes: creates the DMA
  * tag, allocates zeroed memory, and loads the map.  The tag, map, bus
  * address and kernel virtual address are returned via tag, map, pa and va.
  *
  * Returns 0 on success.  On failure an error is logged, free_ring is called
  * with the current values of *tag, *map, *pa and *va, and the error is
  * returned.
  */
 static int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int error;
 
 	error = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (error != 0) {
 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", error);
 		goto fail;
 	}
 
 	error = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (error != 0) {
 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n",
 		    error);
 		goto fail;
 	}
 
 	error = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa,
 	    0);
 	if (error != 0) {
 		device_printf(sc->dev, "cannot load DMA map: %d\n", error);
 		goto fail;
 	}
 
 	return (0);
 fail:
 	free_ring(sc, *tag, *map, *pa, *va);
 
 	return (error);
 }
 
 /*
  * Releases the DMA resources of a descriptor ring.  Each step is performed
  * only if the corresponding item is set: the map is unloaded if pa is
  * non-zero, the memory is freed if va is not NULL, and the tag is destroyed
  * if it is not NULL.  Always returns 0.
  */
 static int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 
 	if (pa != 0)
 		bus_dmamap_unload(tag, map);
 
 	if (va != NULL)
 		bus_dmamem_free(tag, va, map);
 
 	if (tag != NULL)
 		bus_dma_tag_destroy(tag);
 
 	return (0);
 }
 
 /*
  * Allocates the ring for an ingress queue and an optional freelist.  If the
  * freelist is specified it will be allocated and then associated with the
  * ingress queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  *
  * If the ingress queue will take interrupts directly then the intr_idx
  * specifies the vector, starting from 0.  -1 means the interrupts for this
  * queue should be forwarded to the fwq.
  *
  * A cong value of 0 or more enables the congestion related settings for the
  * queue; a negative value leaves them off.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     int intr_idx, int cong)
 {
 	int rc, i, cntxt_id;
 	size_t len;
 	struct fw_iq_cmd c;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = iq->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	__be32 v = 0;
 
 	/* Descriptor ring for the ingress queue. */
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	/* Build the firmware command that creates the queue(s). */
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (intr_idx < 0) {
 		/* Forwarded interrupts, all headed to fwq */
 		v |= F_FW_IQ_CMD_IQANDST;
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
 	} else {
 		KASSERT(intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
 	}
 
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 		/* Descriptor ring for the freelist. */
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc)
 			return (rc);
 
 		/* Allocate space for one software descriptor per buffer. */
 		rc = alloc_fl_sdesc(fl);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to setup fl software descriptors: %d\n",
 			    rc);
 			return (rc);
 		}
 
 		/*
 		 * The low water mark and the buffer boundary depend on
 		 * whether the freelist uses buffer packing.
 		 */
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 			fl->buf_boundary = sp->pack_boundary;
 		} else {
 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 			fl->buf_boundary = 16;
 		}
 		/* With padding on, the boundary is at least the pad boundary. */
 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 			fl->buf_boundary = sp->pad_boundary;
 
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		/* The fetch burst settings differ between T5 and earlier, and T6. */
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	/* Send the command; the reply is written back into c. */
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	/* The queue exists now; record the ids from the reply. */
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->intr_next = iq->intr_params;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 	iq->flags |= IQ_ALLOCATED;
 
 	/* Make the iq reachable via the adapter's iqmap. */
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 
 		iq->flags |= IQ_HAS_FL;
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = 0;
 
 		/* Make the freelist reachable via the adapter's eqmap. */
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.neq) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.neq - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		/*
 		 * Work out the freelist's doorbell.  If user doorbells are
 		 * in use, locate the doorbell page for this qid and, if the
 		 * queue has a segment of its own within the page, point at
 		 * that segment and use a qid of 0 in the doorbell value.
 		 */
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	/* Congestion manager context: T5 and later, PF only, if requested. */
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			/* Bit i of cong sets bit 4*i of val. */
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	return (0);
 }
 
 /*
  * Tears down an ingress queue and, when 'fl' is non-NULL, its freelist.
  *
  * The hardware queue is released first, but only if IQ_ALLOCATED is set.  If
  * that request fails the error is logged and returned and nothing else is
  * released.  Otherwise the descriptor rings are freed, the freelist's
  * software descriptors and lock are released if present, and both
  * structures are zeroed.  Messages are printed against vi->dev when 'vi' is
  * given and against the adapter's device otherwise.
  *
  * Returns 0 on success (including when the iq has no adapter recorded).
  */
 static int
 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	struct adapter *sc = iq->adapter;
 	device_t dev;
 	int rc;
 
 	/* No adapter recorded in the iq: nothing to do. */
 	if (sc == NULL)
 		return (0);
 
 	if (vi != NULL)
 		dev = vi->dev;
 	else
 		dev = sc->dev;
 
 	if ((iq->flags & IQ_ALLOCATED) != 0) {
 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
 		    fl != NULL ? fl->cntxt_id : 0xffff, 0xffff);
 		if (rc != 0) {
 			device_printf(dev,
 			    "failed to free queue %p: %d\n", iq, rc);
 			return (rc);
 		}
 		iq->flags &= ~IQ_ALLOCATED;
 	}
 
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 	bzero(iq, sizeof(*iq));
 
 	if (fl == NULL)
 		return (0);
 
 	free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc);
 	if (fl->sdesc != NULL)
 		free_fl_sdesc(sc, fl);
 	if (mtx_initialized(&fl->fl_lock))
 		mtx_destroy(&fl->fl_lock);
 	bzero(fl, sizeof(*fl));
 
 	return (0);
 }
 
 /*
  * Registers read-only sysctls for an ingress queue under 'oid': the bus
  * address of the descriptor ring, the ring size in bytes, the absolute and
  * SGE context ids, and the consumer index.
  */
 static void
 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_iq *iq)
 {
 	struct sysctl_oid_list *parent;
 
 	parent = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, parent, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
 	    "bus address of descriptor ring");
 
 	/* The ring size is captured as a constant at registration time. */
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
 
 	/* The remaining fields are exported through sysctl_uint16. */
 	SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &iq->abs_id, 0,
 	    sysctl_uint16, "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &iq->cntxt_id, 0,
 	    sysctl_uint16, "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &iq->cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 }
 
 /*
  * Creates an "fl" node under 'oid' and registers read-only sysctls for the
  * freelist beneath it: ring address and size, context id, padding/packing
  * settings, producer/consumer indices and the cluster counters.  The
  * "rx_offset" entry is only added for freelists with FL_BUF_PACKING set.
  */
 static void
 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid *fl_node;
 	struct sysctl_oid_list *fl_list;
 
 	fl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "fl",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
 	fl_list = SYSCTL_CHILDREN(fl_node);
 
 	SYSCTL_ADD_UAUTO(ctx, fl_list, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fl->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, fl_list, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, fl_list, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &fl->cntxt_id, 0,
 	    sysctl_uint16, "I", "SGE context id of the freelist");
 
 	/* Constant snapshots of the padding and packing configuration. */
 	SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 
 	SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	if ((fl->flags & FL_BUF_PACKING) != 0) {
 		SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 
 	SYSCTL_ADD_UQUAD(ctx, fl_list, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, fl_list, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, fl_list, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 /*
  * Sets up the adapter's firmware event queue (sc->sge.fwq) as an ingress
  * queue without a freelist and exposes it under an "fwq" sysctl node on the
  * adapter's device.  Returns 0 on success or the error from alloc_iq_fl().
  */
 static int
 alloc_fwq(struct adapter *sc)
 {
 	struct sge_iq *fwq = &sc->sge.fwq;
 	struct sysctl_oid *dev_tree = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *dev_list = SYSCTL_CHILDREN(dev_tree);
 	struct sysctl_oid *fwq_node;
 	int intr_idx, rc;
 
 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
 
 	/*
 	 * Interrupt index 1 is used only on a non-VF adapter that has more
 	 * than one interrupt; every other case uses index 0.
 	 */
 	intr_idx = 0;
 	if ((sc->flags & IS_VF) == 0 && sc->intr_count > 1)
 		intr_idx = 1;
 
 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create firmware event queue: %d\n", rc);
 		return (rc);
 	}
 
 	fwq_node = SYSCTL_ADD_NODE(&sc->ctx, dev_list, OID_AUTO, "fwq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "firmware event queue");
 	add_iq_sysctls(&sc->ctx, fwq_node, fwq);
 
 	return (0);
 }
 
 /*
  * Releases the firmware event queue.  It has no freelist and no VI is
  * passed, so free_iq_fl() reports any error against the adapter's device.
  */
 static int
 free_fwq(struct adapter *sc)
 {
 
 	return (free_iq_fl(NULL, &sc->sge.fwq, NULL));
 }
 
 /*
  * Initializes and allocates control queue 'idx'.  The egress queue is tied to
  * the tx channel of port 'idx' and to the firmware event queue's context id,
  * and the queue's sysctls live in a node named after 'idx' under 'oid'.
  * Returns whatever alloc_wrq() returns.
  */
 static int
 alloc_ctrlq(struct adapter *sc, struct sge_wrq *ctrlq, int idx,
     struct sysctl_oid *oid)
 {
 	struct sysctl_oid *ctrlq_node;
 	char label[16];
 
 	/* First use of the buffer: the name handed to init_eq(). */
 	snprintf(label, sizeof(label), "%s ctrlq%d",
 	    device_get_nameunit(sc->dev), idx);
 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan,
 	    sc->sge.fwq.cntxt_id, label);
 
 	/* Second use: the sysctl node is named after the queue index. */
 	snprintf(label, sizeof(label), "%d", idx);
 	ctrlq_node = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    label, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ctrl queue");
 
 	return (alloc_wrq(sc, NULL, ctrlq, ctrlq_node));
 }
 
 /*
  * Translates a 'drop' setting into the congestion argument that callers pass
  * to alloc_iq_fl(): -1 is passed through as -1, 1 maps to 0, and any other
  * value yields the port's rx_e_chan_map.
  */
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	switch (drop) {
 	case -1:
 		return (-1);
 	case 1:
 		return (0);
 	default:
 		return (pi->rx_e_chan_map);
 	}
 }
 
 /*
  * Allocates NIC rx queue 'idx' for a VI: the ingress queue and freelist, LRO
  * state (when INET or INET6 is configured), the per-queue flags derived from
  * the interface capabilities, and the queue's sysctl subtree under 'oid'.
  *
  * Returns 0 on success, or the error from alloc_iq_fl() or
  * tcp_lro_init_args().  Nothing is released here on failure.
  */
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid *rxq_node;
 	struct sysctl_oid_list *rxq_list;
 	char nodename[16];
 	int rc;
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * The queue with idx 0 establishes iq_base; every other queue is
 	 * checked for consistency with it.
 	 */
 	if (idx == 0) {
 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 	} else {
 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 		    ("iq_base mismatch"));
 	}
 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero iq_base"));
 
 	/*
 	 * alloc_iq_fl left the freelist only just above the starvation
 	 * threshold; top it up some more.
 	 */
 	FL_LOCK(&rxq->fl);
 	refill_fl(sc, &rxq->fl, 128);
 	FL_UNLOCK(&rxq->fl);
 
 #if defined(INET) || defined(INET6)
 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
 	if (rc != 0)
 		return (rc);
 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
 
 	if ((vi->ifp->if_capenable & IFCAP_LRO) != 0)
 		rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 	if ((vi->ifp->if_capenable & IFCAP_HWRXTSTMP) != 0)
 		rxq->iq.flags |= IQ_RX_TIMESTAMP;
 	rxq->ifp = vi->ifp;
 
 	/* Per-queue sysctl node, named after the queue index. */
 	snprintf(nodename, sizeof(nodename), "%d", idx);
 	rxq_node = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    nodename, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
 	rxq_list = SYSCTL_CHILDREN(rxq_node);
 
 	add_iq_sysctls(&vi->ctx, rxq_node, &rxq->iq);
 #if defined(INET) || defined(INET6)
 	SYSCTL_ADD_U64(&vi->ctx, rxq_list, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, rxq_list, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, rxq_list, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, rxq_list, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
 
 	add_fl_sysctls(sc, &vi->ctx, rxq_node, &rxq->fl);
 
 	/* Every failure path returned above, so rc is 0 here. */
 	return (0);
 }
 
 /*
  * Releases a NIC rx queue: LRO state first (if it was initialized), then the
  * ingress queue and freelist.  The rxq is zeroed only when the queue was
  * freed successfully; otherwise the error from free_iq_fl() is returned.
  */
 static int
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	int rc;
 
 #if defined(INET) || defined(INET6)
 	/* A non-NULL lro.ifp marks LRO as initialized. */
 	if (rxq->lro.ifp != NULL) {
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
 	if (rc != 0)
 		return (rc);
 
 	bzero(rxq, sizeof(*rxq));
 	return (0);
 }
 
 #ifdef TCP_OFFLOAD
 /*
  * Allocates offload rx queue 'idx' for a VI (ingress queue plus freelist,
  * with a congestion argument of 0) and registers its sysctls in a node named
  * after 'idx' under 'oid'.  Returns 0 or the error from alloc_iq_fl().
  */
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid *rxq_node;
 	char nodename[16];
 	int rc;
 
 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
 	if (rc != 0)
 		return (rc);
 
 	snprintf(nodename, sizeof(nodename), "%d", idx);
 	rxq_node = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    nodename, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
 	add_iq_sysctls(&vi->ctx, rxq_node, &ofld_rxq->iq);
 	add_fl_sysctls(sc, &vi->ctx, rxq_node, &ofld_rxq->fl);
 
 	return (0);
 }
 
 /*
  * Releases an offload rx queue.  The structure is zeroed only if the
  * underlying ingress queue and freelist were freed successfully.
  */
 static int
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	int rc;
 
 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 	if (rc != 0)
 		return (rc);
 
 	bzero(ofld_rxq, sizeof(*ofld_rxq));
 	return (0);
 }
 #endif
 
 #ifdef DEV_NETMAP
 /*
  * Prepares netmap rx queue 'idx' for a VI.  Two descriptor rings are
  * allocated (one for the ingress queue, one for its freelist), the software
  * state is initialized with the context id set to INVALID_NM_RXQ_CNTXT_ID,
  * and the sysctls for the queue and its freelist are registered under 'oid'.
  *
  * Returns 0 on success or the error from alloc_ring().  A ring allocated
  * before a later failure is not released here.
  */
 static int
 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
     int idx, struct sysctl_oid *oid)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	struct sysctl_ctx_list *ctx = &vi->ctx;
 	struct sysctl_oid *rxq_node, *fl_node;
 	struct sysctl_oid_list *rxq_list, *fl_list;
 	char nodename[16];
 	size_t ring_bytes;
 	int rc;
 
 	MPASS(na != NULL);
 
 	/* Ring for the ingress queue. */
 	ring_bytes = vi->qsize_rxq * IQ_ESIZE;
 	rc = alloc_ring(sc, ring_bytes, &nm_rxq->iq_desc_tag,
 	    &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
 	if (rc != 0)
 		return (rc);
 
 	/* Ring for the freelist, with room for spg_len extra bytes. */
 	ring_bytes = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, ring_bytes, &nm_rxq->fl_desc_tag,
 	    &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
 	if (rc != 0)
 		return (rc);
 
 	nm_rxq->vi = vi;
 	nm_rxq->nid = idx;
 	nm_rxq->iq_cidx = 0;
 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	nm_rxq->fl_sidx = na->num_rx_desc;
 	nm_rxq->fl_sidx2 = nm_rxq->fl_sidx;	/* copy for rxsync cacheline */
 	nm_rxq->intr_idx = intr_idx;
 	nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID;
 
 	/* Node for the queue itself, named after the queue index. */
 	snprintf(nodename, sizeof(nodename), "%d", idx);
 	rxq_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    nodename, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
 	rxq_list = SYSCTL_CHILDREN(rxq_node);
 
 	SYSCTL_ADD_PROC(ctx, rxq_list, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->iq_abs_id,
 	    0, sysctl_uint16, "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, rxq_list, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->iq_cntxt_id,
 	    0, sysctl_uint16, "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, rxq_list, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->iq_cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 
 	/* Child node for the freelist. */
 	fl_node = SYSCTL_ADD_NODE(ctx, rxq_list, OID_AUTO, "fl",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
 	fl_list = SYSCTL_CHILDREN(fl_node);
 
 	SYSCTL_ADD_PROC(ctx, fl_list, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->fl_cntxt_id,
 	    0, sysctl_uint16, "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &nm_rxq->fl_cidx, 0, "consumer index");
 	SYSCTL_ADD_UINT(ctx, fl_list, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &nm_rxq->fl_pidx, 0, "producer index");
 
 	/* Both allocations succeeded, so rc is 0 here. */
 	return (0);
 }
 
 
 /*
  * Releases the two descriptor rings of a netmap rx queue.  With assertions
  * enabled, the context id must be INVALID_NM_RXQ_CNTXT_ID when the VI has
  * VI_INIT_DONE set and 0 otherwise.  Always returns 0.
  */
 static int
 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if ((vi->flags & VI_INIT_DONE) != 0) {
 		MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID);
 	} else {
 		MPASS(nm_rxq->iq_cntxt_id == 0);
 	}
 
 	/* Ingress queue ring first, then the freelist ring. */
 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
 	    nm_rxq->iq_desc);
 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
 	    nm_rxq->fl_desc);
 
 	return (0);
 }
 
 static int
 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
 	    &nm_txq->ba, (void **)&nm_txq->desc);
 	if (rc)
 		return (rc);
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	nm_txq->sidx = na->num_tx_desc;
 	nm_txq->nid = idx;
 	nm_txq->iqidx = iqidx;
 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	if (sc->params.fw_vers >= FW_VERSION32(1, 24, 11, 0))
 		nm_txq->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS2_WR));
 	else
 		nm_txq->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID;
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_txq->cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_txq->pidx, 0,
 	    sysctl_uint16, "I", "producer index");
 
 	return (rc);
 }
 
 /*
  * Releases the descriptor ring of a netmap tx queue.  With assertions
  * enabled, the context id must be INVALID_NM_TXQ_CNTXT_ID when the VI has
  * VI_INIT_DONE set and 0 otherwise.  Always returns 0.
  */
 static int
 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if ((vi->flags & VI_INIT_DONE) != 0) {
 		MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID);
 	} else {
 		MPASS(nm_txq->cntxt_id == 0);
 	}
 
 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
 	    nm_txq->desc);
 
 	return (0);
 }
 #endif
 
 /*
  * Picks an automatic cidx flush threshold for a queue of 'qsize' entries:
  * the base-2 logarithm of qsize rounded up to a power of 2, capped at
  * X_CIDXFLUSHTHRESH_128.
  */
 static u_int
 qsize_to_fthresh(int qsize)
 {
 	u_int log2sz;
 
 	/* Round up to the next power of 2; a no-op if qsize already is one. */
 	for (; !powerof2(qsize); qsize++)
 		continue;
 
 	log2sz = ilog2(qsize);
 	if (log2sz > X_CIDXFLUSHTHRESH_128)
 		return (X_CIDXFLUSHTHRESH_128);
 
 	return (log2sz);
 }
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
 	    V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 #endif
 
 /*
  * Common allocation path for all egress queue types.
  *
  * Initializes the eq lock, allocates the descriptor ring (eq->sidx
  * descriptors plus room for spg_len bytes), resets the software indices,
  * dispatches to the type-specific firmware allocation routine selected by
  * (eq->flags & EQ_TYPEMASK), and finally works out the user doorbell
  * address for the queue when any of the UDB/UDBWC/WCWR doorbells is
  * available.  Panics on an unknown queue type.
  *
  * Returns 0 on success, or the error from alloc_ring() or from the
  * type-specific routine.  Nothing is undone here on failure: the lock stays
  * initialized and the ring stays allocated.  free_eq() releases both
  * (NOTE(review): presumably the caller is expected to call it -- confirm).
  */
 static int
 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	/* Ring size in descriptors, including the spg_len bytes at the end. */
 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
 	eq->pidx = eq->cidx = eq->dbidx = 0;
 	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
 	eq->equeqidx = 0;
 	/* Start with the adapter's doorbell set; it may be trimmed below. */
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__,
 		    eq->flags & EQ_TYPEMASK);
 	}
 	/*
 	 * A failure is only logged here; execution continues into the
 	 * doorbell setup below and rc is returned at the end.
 	 */
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to allocate egress queue(%d): %d\n",
 		    eq->flags & EQ_TYPEMASK, rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		/* 2^s_qpp context ids share one doorbell page. */
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		/*
 		 * If the in-page id is beyond the number of segments in a
 		 * page the queue has no segment of its own: WCWR is turned
 		 * off and udb_qid keeps the in-page id.  Otherwise the
 		 * doorbell points at the queue's own segment and udb_qid is 0.
 		 */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
 	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
 		else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	return (rc);
 }
 
 /*
  * Releases an egress queue of any type.  If the queue is marked
  * EQ_ALLOCATED the firmware is asked to free it first, using the routine
  * that matches the queue type; a failure there is logged and returned
  * without releasing anything else.  Afterwards the descriptor ring and the
  * lock (if initialized) are released and the eq is zeroed.  Panics on an
  * unknown queue type.
  */
 static int
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc;
 
 	if ((eq->flags & EQ_ALLOCATED) != 0) {
 		switch (eq->flags & EQ_TYPEMASK) {
 		case EQ_CTRL:
 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 		case EQ_ETH:
 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 #endif
 
 		default:
 			panic("%s: invalid eq type %d.", __func__,
 			    eq->flags & EQ_TYPEMASK);
 		}
 
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to free egress queue (%d): %d\n",
 			    eq->flags & EQ_TYPEMASK, rc);
 			return (rc);
 		}
 		eq->flags &= ~EQ_ALLOCATED;
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 
 	/* The lock may not have been set up; check before destroying it. */
 	if (mtx_initialized(&eq->eq_lock))
 		mtx_destroy(&eq->eq_lock);
 
 	bzero(eq, sizeof(*eq));
 
 	return (0);
 }
 
 /*
  * Allocates a work request queue: the underlying egress queue, the drain
  * task, the pending/incomplete work request lists, and the queue's sysctls
  * directly under 'oid'.  The sysctls are attached to the VI's context when
  * 'vi' is given and to the adapter's context otherwise.
  *
  * Returns 0 on success or the error from alloc_eq().
  */
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_oid *oid)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid_list *wrq_list;
 	int rc;
 
 	if (vi != NULL)
 		ctx = &vi->ctx;
 	else
 		ctx = &sc->ctx;
 	wrq_list = SYSCTL_CHILDREN(oid);
 
 	rc = alloc_eq(sc, vi, &wrq->eq);
 	if (rc != 0)
 		return (rc);
 
 	wrq->adapter = sc;
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 
 	SYSCTL_ADD_UAUTO(ctx, wrq_list, OID_AUTO, "ba", CTLFLAG_RD,
 	    &wrq->eq.ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, wrq_list, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, wrq_list, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, wrq_list, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &wrq->eq.cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 	SYSCTL_ADD_PROC(ctx, wrq_list, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &wrq->eq.pidx, 0,
 	    sysctl_uint16, "I", "producer index");
 	SYSCTL_ADD_INT(ctx, wrq_list, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx, "status page index");
 
 	/* Work request counters. */
 	SYSCTL_ADD_UQUAD(ctx, wrq_list, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
 	SYSCTL_ADD_UQUAD(ctx, wrq_list, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 	SYSCTL_ADD_UQUAD(ctx, wrq_list, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
 
 	/* alloc_eq succeeded, so rc is 0 here. */
 	return (0);
 }
 
 /*
  * Releases a work request queue.  The wrq is zeroed only after its egress
  * queue was freed successfully; otherwise free_eq()'s error is returned and
  * the wrq is left untouched.
  */
 static int
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	int rc;
 
 	rc = free_eq(sc, &wrq->eq);
 	if (rc == 0)
 		bzero(wrq, sizeof(*wrq));
 
 	return (rc);
 }
 
 /*
  * Allocates NIC tx queue 'idx' for a VI: the mp_ring that feeds the queue,
  * the egress queue itself, the reclaim task, the scatter/gather list, the
  * software descriptors, the precomputed big-endian CPL control word, and the
  * queue's sysctl subtree under 'oid'.
  *
  * Returns 0 on success.  If the mp_ring cannot be allocated, or alloc_eq()
  * fails (in which case the mp_ring is released again), the error is
  * returned.
  */
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
     struct sysctl_oid *oid)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	struct sysctl_oid *txq_node;
 	struct sysctl_oid_list *txq_list;
 	char nodename[16];
 	int rc;
 
 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
 	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
 	rc = alloc_eq(sc, vi, eq);
 	if (rc != 0) {
 		mp_ring_free(txq->r);
 		txq->r = NULL;
 		return (rc);
 	}
 
 	/* Can't fail after this point. */
 
 	/*
 	 * The queue with idx 0 establishes eq_base; every other queue is
 	 * checked for consistency with it.
 	 */
 	if (idx == 0) {
 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 	} else {
 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 		    ("eq_base mismatch"));
 	}
 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero eq_base"));
 
 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 	txq->ifp = vi->ifp;
 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 
 	/* A VF's control word carries no PF/VF fields. */
 	if ((sc->flags & IS_VF) != 0) {
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan));
 	} else {
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 		    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	}
 	txq->tc_idx = -1;
 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	/* Per-queue sysctl node, named after the queue index. */
 	snprintf(nodename, sizeof(nodename), "%d", idx);
 	txq_node = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    nodename, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue");
 	txq_list = SYSCTL_CHILDREN(txq_node);
 
 	/* Descriptor ring geometry and indices. */
 	SYSCTL_ADD_UAUTO(&vi->ctx, txq_list, OID_AUTO, "ba", CTLFLAG_RD,
 	    &eq->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, txq_list, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(&vi->ctx, txq_list, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(&vi->ctx, txq_list, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, txq_list, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &eq->cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, txq_list, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &eq->pidx, 0,
 	    sysctl_uint16, "I", "producer index");
 	SYSCTL_ADD_INT(&vi->ctx, txq_list, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    eq->sidx, "status page index");
 
 	/* The only writable knob: the queue's traffic class. */
 	SYSCTL_ADD_PROC(&vi->ctx, txq_list, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, vi, idx, sysctl_tc,
 	    "I", "traffic class (-1 means none)");
 
 	/* Transmit statistics. */
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "vlan_insertion",
 	    CTLFLAG_RD, &txq->vlan_insertion,
 	    "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "txpkts0_wrs",
 	    CTLFLAG_RD, &txq->txpkts0_wrs,
 	    "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "txpkts1_wrs",
 	    CTLFLAG_RD, &txq->txpkts1_wrs,
 	    "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "txpkts0_pkts",
 	    CTLFLAG_RD, &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "txpkts1_pkts",
 	    CTLFLAG_RD, &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO, "raw_wrs", CTLFLAG_RD,
 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
 
 #ifdef KERN_TLS
 	/* NIC TLS statistics, only when the adapter has KERN_TLS_OK set. */
 	if ((sc->flags & KERN_TLS_OK) != 0) {
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_records", CTLFLAG_RD, &txq->kern_tls_records,
 		    "# of NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_short", CTLFLAG_RD, &txq->kern_tls_short,
 		    "# of short NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_partial", CTLFLAG_RD, &txq->kern_tls_partial,
 		    "# of partial NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_full", CTLFLAG_RD, &txq->kern_tls_full,
 		    "# of full NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_octets", CTLFLAG_RD, &txq->kern_tls_octets,
 		    "# of payload octets in transmitted NIC TLS records");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_waste", CTLFLAG_RD, &txq->kern_tls_waste,
 		    "# of octets DMAd but not transmitted in NIC TLS records");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_options", CTLFLAG_RD, &txq->kern_tls_options,
 		    "# of NIC TLS options-only packets transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_header", CTLFLAG_RD, &txq->kern_tls_header,
 		    "# of NIC TLS header-only packets transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_fin", CTLFLAG_RD, &txq->kern_tls_fin,
 		    "# of NIC TLS FIN-only packets transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_fin_short", CTLFLAG_RD, &txq->kern_tls_fin_short,
 		    "# of NIC TLS padded FIN packets on short TLS records");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_cbc", CTLFLAG_RD, &txq->kern_tls_cbc,
 		    "# of NIC TLS sessions using AES-CBC");
 		SYSCTL_ADD_UQUAD(&vi->ctx, txq_list, OID_AUTO,
 		    "kern_tls_gcm", CTLFLAG_RD, &txq->kern_tls_gcm,
 		    "# of NIC TLS sessions using AES-GCM");
 	}
 #endif
 
 	/* Counters kept by the mp_ring itself. */
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, txq_list, OID_AUTO, "r_enqueues",
 	    CTLFLAG_RD, &txq->r->enqueues,
 	    "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, txq_list, OID_AUTO, "r_drops",
 	    CTLFLAG_RD, &txq->r->drops,
 	    "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, txq_list, OID_AUTO, "r_starts",
 	    CTLFLAG_RD, &txq->r->starts,
 	    "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, txq_list, OID_AUTO, "r_stalls",
 	    CTLFLAG_RD, &txq->r->stalls,
 	    "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, txq_list, OID_AUTO, "r_restarts",
 	    CTLFLAG_RD, &txq->r->restarts,
 	    "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, txq_list, OID_AUTO, "r_abdications",
 	    CTLFLAG_RD, &txq->r->abdications,
 	    "# of consumer abdications in the mp_ring for this queue");
 
 	return (0);
 }
 
 static int
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 
 	rc = free_eq(sc, eq);
 	if (rc)
 		return (rc);
 
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 
 	bzero(txq, sizeof(*txq));
 	return (0);
 }
 
 /*
  * DMA mapping callback for mappings that consist of exactly one segment.
  * 'arg' points to a bus_addr_t that receives the segment's address, or 0 if
  * an error was reported.
  */
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *busaddr = arg;
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	if (error)
 		*busaddr = 0;
 	else
 		*busaddr = segs->ds_addr;
 }
 
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t n, v;
 
 	n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx);
 	MPASS(n > 0);
 
 	wmb();
 	v = fl->dbval | V_PIDX(n);
 	if (fl->udb)
 		*fl->udb = htole32(v);
 	else
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
 	IDXINCR(fl->dbidx, n, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * A new cluster is allocated from the freelist's preferred zone (fl->zidx).
  * If that fails, the fallback zone (fl->safe_zidx) is tried.  The software
  * descriptor records the index of the zone that the cluster actually came
  * from, because sd->zidx is what selects the zone when the cluster is freed.
  *
  * The freelist lock must be held by the caller.
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct rx_buf_info *rxb;
 	struct cluster_metadata *clm;
 	int zidx;
 	uint16_t max_pidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 
 	while (n > 0) {
 
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 				goto recycled;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sd);
 			MPASS(clm != NULL);
 
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 
 		/*
 		 * Try the preferred zone first and fall back to the safe zone.
 		 * zidx tracks whichever zone ends up supplying the cluster.
 		 */
 		zidx = fl->zidx;
 		rxb = &sc->sge.rx_buf_info[zidx];
 		cl = uma_zalloc(rxb->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			if (zidx != fl->safe_zidx) {
 				zidx = fl->safe_zidx;
 				rxb = &sc->sge.rx_buf_info[zidx];
 				cl = uma_zalloc(rxb->zone, M_NOWAIT);
 			}
 			if (cl == NULL)
 				break;
 		}
 		fl->cl_allocated++;
 		n--;
 
 		pa = pmap_kextract((vm_offset_t)cl);
 		sd->cl = cl;
 		/*
 		 * Record the zone the cluster was really allocated from, not
 		 * unconditionally fl->zidx, so that it is later returned to the
 		 * matching zone.
 		 */
 		sd->zidx = zidx;
 
 		if (fl->flags & FL_BUF_PACKING) {
 			*d = htobe64(pa | rxb->hwidx2);
 			sd->moff = rxb->size2;
 		} else {
 			*d = htobe64(pa | rxb->hwidx1);
 			sd->moff = 0;
 		}
 recycled:
 		sd->nmbuf = 0;
 		d++;
 		sd++;
 		/* Every 8th buffer completes one hardware descriptor. */
 		if (__predict_false((++fl->pidx & 7) == 0)) {
 			uint16_t pidx = fl->pidx >> 3;
 
 			if (__predict_false(pidx == fl->sidx)) {
 				/* Wrapped around to the start of the ring. */
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			if (n < 8 || pidx == max_pidx)
 				break;
 
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	/* Tell the hardware about any descriptors not yet announced. */
 	if ((fl->pidx >> 3) != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 /*
  * Allocate the zeroed software descriptor array for a freelist: fl->sidx * 8
  * entries.  The allocation is made with M_WAITOK and 0 is always returned.
  */
 static int
 alloc_fl_sdesc(struct sge_fl *fl)
 {
 
 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(*fl->sdesc), M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	return (0);
 }
 
 /*
  * Give up the freelist's hold on every cluster still attached to a software
  * descriptor, then free the descriptor array itself.
  *
  * A cluster with no mbufs outstanding (nmbuf == 0) is returned to the zone
  * selected by sd->zidx.  On a buffer-packing freelist a cluster with mbufs
  * outstanding is freed here only if the freelist held the last reference.
  */
 static void
 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 {
 	struct cluster_metadata *clm;
 	struct fl_sdesc *sd;
 	int i, nsd;
 
 	nsd = fl->sidx * 8;
 	for (i = 0; i < nsd; i++) {
 		sd = &fl->sdesc[i];
 		if (sd->cl == NULL)
 			continue;
 
 		if (sd->nmbuf == 0) {
 			uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl);
 		} else if ((fl->flags & FL_BUF_PACKING) != 0) {
 			clm = cl_metadata(sd);
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone,
 				    sd->cl);
 				counter_u64_add(extfree_rels, 1);
 			}
 		}
 		sd->cl = NULL;
 	}
 
 	free(fl->sdesc, M_CXGBE);
 	fl->sdesc = NULL;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 	KASSERT(gl->sg_nseg > 0 &&
 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
 /*
  * Length, in 16-byte units, of a txpkt WR that carries a gather list of
  * 'nsegs' segments.  The firmware work request header is included, and so is
  * the LSO CPL when 'tso' is non-zero.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, u_int tso)
 {
 	u_int extra, nbytes;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment is accounted for by ulptx_sgl itself. */
 	extra = nsegs - 1;
 	nbytes = sizeof(struct fw_eth_tx_pkt_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
 	    8 * ((3 * extra) / 2 + (extra & 1));
 	if (tso)
 		nbytes += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Length, in 16-byte units, of a txpkt_vm WR that carries a gather list of
  * 'nsegs' segments.  The firmware work request header is included, and so is
  * the LSO CPL when 'tso' is non-zero.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, u_int tso)
 {
 	u_int extra, nbytes;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment is accounted for by ulptx_sgl itself. */
 	extra = nsegs - 1;
 	nbytes = sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
 	    8 * ((3 * extra) / 2 + (extra & 1));
 	if (tso)
 		nbytes += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Length, in 16-byte units, of one packet's share of a type 0 txpkts WR with
  * a gather list of 'nsegs' segments.  The firmware work request header is not
  * included.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	u_int extra, nbytes;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment is accounted for by ulptx_sgl itself. */
 	extra = nsegs - 1;
 	nbytes = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
 	    8 * ((3 * extra) / 2 + (extra & 1));
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Length, in 16-byte units, of one packet's share of a type 1 txpkts WR: a
  * CPL followed by a ulptx_sgl.  The firmware work request header is not
  * included.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	const u_int nbytes = sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Number of bytes left in 'ndesc' descriptors after the firmware work request
  * header and the CPL have been accounted for.
  */
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	u_int room;
 
 	room = ndesc * EQ_ESIZE;
 	room -= sizeof(struct fw_eth_tx_pkt_wr);
 	room -= sizeof(struct cpl_tx_pkt_core);
 
 	return (room);
 }
 
 /*
  * Build the checksum-related bits of a tx CPL's ctrl1 from the mbuf's checksum
  * flags and header lengths.  If no hardware checksum is needed both the IP and
  * L4 checksums are disabled and nothing else is set.
  */
 static inline uint64_t
 csum_to_ctrl(struct adapter *sc, struct mbuf *m)
 {
 	uint64_t ctrl;
 	int csum_type;
 
 	M_ASSERTPKTHDR(m);
 
 	if (needs_hwcsum(m) == 0)
 		return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
 
 	ctrl = needs_l3_csum(m) == 0 ? F_TXPKT_IPCSUM_DIS : 0;
 
 	switch (m->m_pkthdr.csum_flags &
 	    (CSUM_IP_TCP | CSUM_IP_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP)) {
 	case CSUM_IP_TCP:
 		csum_type = TX_CSUM_TCPIP;
 		break;
 	case CSUM_IP_UDP:
 		csum_type = TX_CSUM_UDPIP;
 		break;
 	case CSUM_IP6_TCP:
 		csum_type = TX_CSUM_TCPIP6;
 		break;
 	case CSUM_IP6_UDP:
 		csum_type = TX_CSUM_UDPIP6;
 		break;
 	default:
 		/*
 		 * No L4 checksum was requested, and needs_hwcsum said that
 		 * some checksum is needed, so it has to be the IP header's.
 		 */
 		MPASS(ctrl == 0);
 		MPASS(m->m_pkthdr.csum_flags & CSUM_IP);
 		csum_type = TX_CSUM_IP;
 		ctrl |= F_TXPKT_L4CSUM_DIS;
 		break;
 	}
 
 	MPASS(m->m_pkthdr.l2hlen > 0);
 	MPASS(m->m_pkthdr.l3hlen > 0);
 	ctrl |= V_TXPKT_CSUM_TYPE(csum_type);
 	ctrl |= V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 
 	/* The ethernet header length field differs after T5. */
 	if (chip_id(sc) > CHELSIO_T5)
 		ctrl |= V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
 	else
 		ctrl |= V_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
 
 	return (ctrl);
 }
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors and update
  * the software descriptor.  It is guaranteed that enough descriptors are
  * available.
  *
  * 'wr' must point at the hardware descriptor at eq->pidx.  eq->pidx itself is
  * not modified in this function.
  * NOTE(review): presumably the caller advances pidx by the return value --
  * confirm against the caller.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	/* NOTE(review): nsegs is assigned here but not used again below. */
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl holds the immediate data length for the WR header for now. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	/* len16 is in 16-byte units; round up to whole descriptors. */
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
 
 	if (needs_tso(m0)) {
 		/* The LSO CPL sits between the WR header and the tx CPL. */
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 			ETHER_HDR_LEN) >> 2) |
 		    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 		    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		/* An L3 header the size of struct ip6_hdr is taken as IPv6. */
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = csum_to_ctrl(sc, m0);
 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		/*
 		 * checkwrap is set when ndesc descriptors starting at pidx
 		 * would extend past the end of the ring.
 		 */
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 
 	txq->txpkt_wrs++;
 
 	/* Remember the mbuf and the descriptor count at this pidx. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Copy a raw WR, supplied as the contents of the mbuf chain m0, into the
  * hardware descriptors starting at 'wr', and record the mbuf and descriptor
  * count in the software descriptor at eq->pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct mbuf *m;
 	caddr_t dst = wr;
 	int len16, ndesc;
 
 	len16 = mbuf_len16(m0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	m = m0;
 	while (m != NULL) {
 		copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 		m = m->m_next;
 	}
 
 	txq->raw_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->desc_used = ndesc;
 	txsd->m = m0;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR for this packet to the hardware descriptors and update the
  * software descriptor.  It is guaranteed that enough descriptors are
  * available.
  *
  * The payload is copied into the WR as immediate data when the packet does
  * not need TSO, is not flagged MC_NOMAP, fits in the immediate payload of two
  * descriptors, and at least two descriptors are available.  Otherwise the WR
  * carries a gather list.
  *
  * 'wr' must point at the hardware descriptor at eq->pidx.  eq->pidx itself is
  * not modified in this function.
  * NOTE(review): presumably the caller advances pidx by the return value --
  * confirm against the caller.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl holds the immediate data length for the WR header for now. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
 	    available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	/* len16 is in 16-byte units; round up to whole descriptors. */
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	if (needs_tso(m0)) {
 		/* The LSO CPL sits between the WR header and the tx CPL. */
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 			ETHER_HDR_LEN) >> 2) |
 		    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 		    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		/* An L3 header the size of struct ip6_hdr is taken as IPv6. */
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = csum_to_ctrl(sc, m0);
 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 
 		/*
 		 * checkwrap is set when ndesc descriptors starting at pidx
 		 * would extend past the end of the ring.
 		 */
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		/* Immediate data: copy the payload right after the CPL. */
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		/* Every byte of the packet must have been copied. */
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	/* Remember the mbuf and the descriptor count at this pidx. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Check whether packets 'm' and 'n' can be sent together in a txpkts WR that
  * fits in 'available' descriptors.  Returns 0 and sets txp up for two packets
  * if they can, 1 if they cannot.  txp's wr_type, len16, and plen may already
  * have been written by the time a failure is detected.
  */
 static int
 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
 	u_int segs_m, segs_n, len16_m, len16_n, ndesc;
 
 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
 		return (1);
 
 	segs_m = mbuf_nsegs(m);
 	segs_n = mbuf_nsegs(n);
 	if (segs_m + segs_n != 2) {
 		txp->wr_type = 0;
 		len16_m = txpkts0_len16(segs_m);
 		len16_n = txpkts0_len16(segs_n);
 	} else {
 		/* Two segments in total: the type 1 WR can be used. */
 		txp->wr_type = 1;
 		len16_m = txpkts1_len16();
 		len16_n = len16_m;
 	}
 
 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + len16_m +
 	    len16_n;
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	if (ndesc > SGE_MAX_WR_NDESC || ndesc > available)
 		return (1);
 
 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
 	if (txp->plen > 65535)
 		return (1);
 
 	txp->npkt = 2;
 	set_mbuf_len16(m, len16_m);
 	set_mbuf_len16(n, len16_n);
 
 	return (0);
 }
 
 /*
  * Try to add packet 'm' to the txpkts WR being assembled in 'txp'.  Returns 0
  * and updates txp if the packet was added.  Returns 1, leaving txp untouched,
  * if the packet cannot be added.
  */
 static int
 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
 {
 	u_int total_plen, pkt_len16, ndesc, nsegs;
 
 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
 	if (cannot_use_txpkts(m))
 		return (1);
 
 	/* A type 1 WR takes single-segment packets only. */
 	nsegs = mbuf_nsegs(m);
 	if (txp->wr_type == 1 && nsegs != 1)
 		return (1);
 
 	total_plen = txp->plen + m->m_pkthdr.len;
 	if (total_plen > 65535)
 		return (1);
 
 	if (txp->wr_type != 0)
 		pkt_len16 = txpkts1_len16();
 	else
 		pkt_len16 = txpkts0_len16(nsegs);
 	ndesc = howmany(txp->len16 + pkt_len16, EQ_ESIZE / 16);
 	if (ndesc > SGE_MAX_WR_NDESC || ndesc > available)
 		return (1);
 
 	txp->len16 += pkt_len16;
 	txp->plen = total_plen;
 	txp->npkt++;
 	set_mbuf_len16(m, pkt_len16);
 
 	return (0);
 }
 
 /*
  * Write a txpkts WR for the packets in txp to the hardware descriptors and
  * update the software descriptor.  It is guaranteed that enough descriptors
  * are available.
  *
  * m0 is the first packet; the rest are reached through m_nextpkt.  For a type
  * 0 WR each packet gets a ULP master command and a ULP immediate subcommand
  * ahead of its CPL; for a type 1 WR the CPL is written directly.  Each CPL is
  * followed by the packet's gather list.
  *
  * 'wr' must point at the hardware descriptor at eq->pidx.  eq->pidx itself is
  * not modified in this function.
  * NOTE(review): presumably the caller advances pidx by the return value --
  * confirm against the caller.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp,
     u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int ndesc, checkwrap;
 	struct mbuf *m;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->plen < 65536);
 	MPASS(m0 != NULL);
 	MPASS(m0->m_nextpkt != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 	MPASS(available > 0 && available < eq->sidx);
 
 	/* len16 is in 16-byte units; round up to whole descriptors. */
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (m = m0; m != NULL; m = m->m_nextpkt) {
 		if (txp->wr_type == 0) {
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(mbuf_len16(m));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			/* The CPL starts over at desc[0] if we hit the end. */
 			cpl = (void *)(ulpsc + 1);
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 		} else {
 			cpl = flitp;
 		}
 
 		/* Checksum offload */
 		ctrl1 = csum_to_ctrl(sc, m);
 		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		/* The SGL starts over at desc[0] if we hit the end. */
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		/* Leaves flitp at the location for the next packet. */
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 	}
 
 	/* Per-type statistics */
 	if (txp->wr_type == 0) {
 		txq->txpkts0_pkts += txp->npkt;
 		txq->txpkts0_wrs++;
 	} else {
 		txq->txpkts1_pkts += txp->npkt;
 		txq->txpkts1_wrs++;
 	}
 
 	/* Remember the first mbuf and the descriptor count at this pidx. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write the gather list for packet 'm' as a ULPTX SGL at *to, which must be a
  * 16 byte aligned location inside the tx descriptor ring.  On return *to
  * points just past the SGL, or at the start of the ring if the SGL ended
  * exactly at the end of the ring.
  *
  * 'checkwrap' tells the function that the SGL may run into the end of the
  * ring.  When it is 0 no wrap handling is done.
  *
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	/* Load the packet's segments into the txq's scratch sglist. */
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/*
 	 * A flit is 8 bytes.  The first segment takes 2 flits (cmd_nsge, len0,
 	 * addr0).  Every further pair of segments takes 3 flits (two lengths
 	 * sharing a flit, then two addresses), and an odd segment left over
 	 * takes 2.
 	 */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		/* Odd number of extra segments: zero the unused length. */
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	/* An odd flit count ends 8 bytes short of a 16 byte boundary. */
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 /*
  * Copy 'len' bytes from 'from' into the tx descriptor ring at *to and advance
  * *to past the copied data.  If the data would run past the end of the ring
  * the copy is split in two and continues at the start of the ring.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	const uintptr_t ring_end = (uintptr_t)&eq->desc[eq->sidx];
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (__predict_false((uintptr_t)(*to) + len > ring_end)) {
 		int first = ring_end - (uintptr_t)(*to);
 		int rest = len - first;
 
 		/* Up to the end of the ring, then on from its start. */
 		bcopy(from, *to, first);
 		bcopy(from + first, (void *)eq->desc, rest);
 		*to = (caddr_t)eq->desc + rest;
 		return;
 	}
 
 	bcopy(from, *to, len);
 	*to += len;
 }
 
 /*
  * Ring the doorbell of an egress queue for 'n' new descriptors and advance
  * eq->dbidx by n.
  *
  * The doorbell mechanism used is the one with the lowest bit number among
  * those set in eq->doorbells.  DOORBELL_WCWR is removed from consideration
  * when more than one descriptor is being announced.
  */
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	/* Work on a copy; eq->doorbells itself is left alone. */
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	/* Write barrier ahead of any doorbell write. */
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		/*
 		 * Copy the whole descriptor at dbidx, 64 bits at a time, to
 		 * the location derived from the udb address.
 		 */
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 /*
  * Number of descriptors between the driver's cidx and the cidx read from the
  * queue's status page.
  */
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	/* Read once; the value is used more than once by IDXDIFF's caller. */
 	const uint16_t hw_cidx = read_hw_cidx(eq);
 
 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
 }
 
 /*
  * Number of descriptors available between pidx and the cidx read from the
  * queue's status page.  One descriptor is always held back, so the result is
  * at most eq->sidx - 1.
  */
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	const uint16_t hw_cidx = read_hw_cidx(eq);
 	const uint16_t pidx = eq->pidx;
 
 	if (pidx != hw_cidx)
 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
 
 	return (eq->sidx - 1);
 }
 
 /*
  * Read the cidx stored in the status page that follows the last descriptor of
  * the ring and return it in host byte order.
  */
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *spg;
 	uint16_t cidx;
 
 	spg = (void *)&eq->desc[eq->sidx];
 	cidx = spg->cidx;	/* stable snapshot */
 
 	return (be16toh(cidx));
 }
 
 /*
  * Reclaim approximately 'n' descriptors.  Reclamation happens one work request
  * at a time, freeing the mbufs recorded for it, so the number reclaimed can
  * exceed 'n'.  Returns the number of descriptors reclaimed.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct mbuf *m, *nextpkt;
 	u_int can_reclaim, reclaimed;
 	int ndesc;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	can_reclaim = reclaimable_tx_desc(eq);
 	for (reclaimed = 0; can_reclaim != 0 && reclaimed < n;
 	    reclaimed += ndesc, can_reclaim -= ndesc) {
 		txsd = &txq->sdesc[eq->cidx];
 		ndesc = txsd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(can_reclaim >= ndesc,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, can_reclaim, ndesc));
 		KASSERT(ndesc != 0,
 		    ("%s: descriptor with no credits: cidx %d",
 		    __func__, eq->cidx));
 
 		/* Free each packet that was chained to this work request. */
 		m = txsd->m;
 		while (m != NULL) {
 			nextpkt = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			m = nextpkt;
 		}
 		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (reclaimed);
 }
 
 /*
  * Task handler that reclaims tx descriptors in batches of 32 until a pass
  * reclaims nothing or the txq lock cannot be obtained without blocking.
  */
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	for (;;) {
 		if (!TXQ_TRYLOCK(txq))
 			return;
 		n = reclaim_tx_descs(txq, 32);
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 		if (n <= 0)
 			return;
 	}
 }
 
 /*
  * Return flit number 'idx' of the part of an SGL that describes 'segs'.  The
  * segments are laid out two at a time over three flits: one flit with both
  * lengths (the second is 0 if there is no second segment), then one flit for
  * each address.  Anything else yields 0.
  */
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	const int first = (idx / 3) * 2;	/* first segment of the pair */
 	const int which = idx % 3;
 	uint64_t lens;
 
 	if (which == 0) {
 		lens = (uint64_t)segs[first].ss_len << 32;
 		if (first + 1 < nsegs)
 			lens |= (uint64_t)(segs[first + 1].ss_len);
 		return (htobe64(lens));
 	}
 	if (which == 1)
 		return (htobe64(segs[first].ss_paddr));
 	if (which == 2)
 		return (htobe64(segs[first + 1].ss_paddr));
 
 	return (0);
 }
 
 /*
  * Pick the rx buffer zone to refill a freelist from.  Returns the index of the
  * first usable zone that can hold 'maxp' bytes.  If there is none, the index
  * of the last usable zone examined is returned, or -1 if no zone is usable.
  * The search stops at the first zone larger than largest_rx_cluster.
  */
 static int
 find_refill_source(struct adapter *sc, int maxp, bool packing)
 {
 	struct rx_buf_info *rxb;
 	int i, fallback;
 
 	fallback = -1;
 	for (i = 0; i < SW_ZONE_SIZES; i++) {
 		rxb = &sc->sge.rx_buf_info[i];
 
 		if (packing) {
 			if (rxb->hwidx2 == -1)
 				continue;
 			if (rxb->size1 < PAGE_SIZE &&
 			    rxb->size1 < largest_rx_cluster)
 				continue;
 			if (rxb->size1 > largest_rx_cluster)
 				break;
 			MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE);
 			if (rxb->size2 >= maxp)
 				return (i);
 		} else {
 			if (rxb->hwidx1 == -1)
 				continue;
 			if (rxb->size1 > largest_rx_cluster)
 				break;
 			if (rxb->size1 >= maxp)
 				return (i);
 		}
 
 		/* Usable but too small; remember it in case nothing fits. */
 		fallback = i;
 	}
 
 	return (fallback);
 }
 
 /*
  * Put a freelist on the adapter's list of starving freelists and arm the
  * callout that refills them.  Nothing is done for a freelist that has been
  * marked doomed.
  */
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	if (!(fl->flags & FL_DOOMED)) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 /*
  * Egress update for a work request queue: clear eq->equiq and queue the wrq's
  * tx task on the taskqueue of the eq's tx channel.
  */
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq;
 
 	wrq = (struct sge_wrq *)eq;
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 /*
  * Egress update for an Ethernet tx queue: clear eq->equiq, check the drainage
  * of the txq's mp_ring, and queue the txq's reclaim task on the taskqueue of
  * the eq's tx channel.
  */
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq;
 
 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
 
 	txq = (struct sge_txq *)eq;
 	atomic_readandclear_int(&eq->equiq);
 	mp_ring_check_drainage(txq->r, 0);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 /*
  * CPL handler for SGE egress updates.  The egress queue is looked up from the
  * qid in the message and the update is passed to the handler for that queue's
  * type.  Always returns 0.
  */
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	/* Indexed by the eq type bits in eq->flags; type 0 has no handler. */
 	static void (*h[])(struct adapter *, struct sge_eq *) = {
 		NULL,
 		&handle_wrq_egr_update,
 		&handle_eth_egr_update,
 		&handle_wrq_egr_update
 	};
 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	unsigned int qid;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	qid = G_EGR_QID(ntohl(cpl->opcode_qid));
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
 
 	return (0);
 }
 
 /*
  * handle_fw_msg treats a fw4_msg as if it were a fw6_msg.  That only works if
  * the data is at the same offset in both.
  */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) ==
     offsetof(struct cpl_fw6_msg, data));
 
 /*
  * CPL handler for firmware messages.  A message of type FW_TYPE_RSSCPL or
  * FW6_TYPE_RSSCPL carries another CPL in its data, which is passed to the CPL
  * handler for its opcode.  Any other message goes to the firmware message
  * handler registered for its type.
  */
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 	const struct rss_header *rss2;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL)
 		return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 
 	rss2 = (const struct rss_header *)&cpl->data[0];
 	return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  *
  *	Logs the contents of the error message.  Returns 0 if the message was
  *	recognized, EINVAL if its opcode is not FW_ERROR_CMD or its error type
  *	is unknown.
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	const struct fw_error_cmd *e = (const void *)rpl;
 	const u8 opcode = *(const u8 *)rpl;
 	unsigned int i, op_to_type;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 
 	op_to_type = be32toh(e->op_to_type);
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(op_to_type) ? "fatal" : "non-fatal");
 
 	switch (G_FW_ERROR_CMD_TYPE(op_to_type)) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++) {
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		}
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++) {
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		}
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n", G_FW_ERROR_CMD_TYPE(op_to_type));
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 sysctl_uint16(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *id = arg1;
 	int i = *id;
 
 	return sysctl_handle_int(oidp, &i, 0, req);
 }
 
 static inline bool
 bufidx_used(struct adapter *sc, int idx)
 {
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
 	int i;
 
 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
 		if (rxb->size1 > largest_rx_cluster)
 			continue;
 		if (rxb->hwidx1 == idx || rxb->hwidx2 == idx)
 			return (true);
 	}
 
 	return (false);
 }
 
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sge_params *sp = &sc->params.sge;
 	int i, rc;
 	struct sbuf sb;
 	char c;
 
 	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
 	for (i = 0; i < SGE_FLBUF_SIZES; i++) {
 		if (bufidx_used(sc, i))
 			c = '*';
 		else
 			c = '\0';
 
 		sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c);
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 #ifdef RATELIMIT
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  *
  * nsegs is the number of gather list segments, immhdrs is the number of
  * header bytes carried as immediate data (must be non-zero), and tso is
  * non-zero when the WR also carries a struct cpl_tx_pkt_lso_core.  Note that
  * the LSO CPL is only accounted for when nsegs is non-zero.
  */
 static inline u_int
 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
 {
 	u_int len;
 
 	MPASS(immhdrs > 0);
 
 	/* WR header + CPL + immediate headers, padded out to 16B. */
 	len = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
 	if (__predict_false(nsegs == 0))
 		return (howmany(len, 16));
 
 	/* The first segment is part of struct ulptx_sgl itself. */
 	nsegs--;
 	len += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		len += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(len, 16));
 }
 
 #define ETID_FLOWC_NPARAMS 6
 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
 
 static int
 send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi,
     struct vi_info *vi)
 {
 	struct wrq_cookie cookie;
 	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
 	struct fw_flowc_wr *flowc;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
 	    EO_FLOWC_PENDING);
 
 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		return (ENOMEM);
 
 	bzero(flowc, ETID_FLOWC_LEN);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(cst->iqid);
 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 	flowc->mnemval[5].val = htobe32(cst->schedcl);
 
 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
 
 	cst->flags &= ~EO_FLOWC_PENDING;
 	cst->flags |= EO_FLOWC_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
 	cst->tx_credits -= ETID_FLOWC_LEN16;
 
 	return (0);
 }
 
 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
 
 /*
  * Queue a parameterless FW_FLOWC_WR that requests a completion on
  * cst->eo_txq.  The caller holds cst->lock.  The tag is marked
  * EO_FLUSH_RPL_PENDING, ETID_FLUSH_LEN16 tx credits are consumed and one
  * more completion is expected.  Failure to obtain room in the work request
  * queue is not handled (CXGBE_UNIMPLEMENTED).
  */
 void
 send_etid_flush_wr(struct cxgbe_rate_tag *cst)
 {
 	struct wrq_cookie cookie;
 	struct fw_flowc_wr *flush;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	flush = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie);
 	if (__predict_false(flush == NULL))
 		CXGBE_UNIMPLEMENTED(__func__);
 
 	bzero(flush, ETID_FLUSH_LEN16 * 16);
 	flush->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
 	flush->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 
 	commit_wrq_wr(cst->eo_txq, flush, &cookie);
 
 	/* Account for the WR and the completion it asked for. */
 	cst->flags |= EO_FLUSH_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
 	cst->tx_credits -= ETID_FLUSH_LEN16;
 	cst->ncompl++;
 }
 
 /*
  * Fill in the FW_ETH_TX_EO_WR at 'wr' for packet m0 on the flow 'cst'.
  *
  * The WR is laid out as: the firmware WR header (UDP or TCP segmentation
  * variant), a struct cpl_tx_pkt_lso_core when needs_tso(m0), a struct
  * cpl_tx_pkt_core, the L2 + L3 + L4 headers copied inline as immediate data
  * and, when mbuf_eo_nsegs(m0) is non-zero, a ULP_TX_SC_DSGL gather list
  * covering the rest of the packet.  A firmware completion is requested iff
  * 'compl' is non-zero.
  *
  * The caller holds cst->lock.  m0 must have a packet header with non-zero
  * l2hlen/l3hlen/l4hlen.  The buffer at 'wr' is written up to
  * mbuf_eo_len16(m0) * 16 bytes.
  */
 static void
 write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr,
     struct mbuf *m0, int compl)
 {
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	uint32_t ctrl;	/* used in many unrelated places */
 	int len16, pktlen, nsegs, immhdrs;
 	caddr_t dst;
 	uintptr_t p;
 	struct ulptx_sgl *usgl;
 	struct sglist sg;
 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	M_ASSERTPKTHDR(m0);
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0,
 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
 
 	/*
 	 * ctrl first accumulates the immediate data length: the CPL(s) plus
 	 * the headers that are copied inline.
 	 */
 	len16 = mbuf_eo_len16(m0);
 	nsegs = mbuf_eo_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
 	ctrl += immhdrs;
 
 	/* Firmware WR header. */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	wr->r3 = 0;
 	if (needs_udp_csum(m0)) {
 		/* UDP: the whole payload is described as a single segment. */
 		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
 		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
 		wr->u.udpseg.rtplen = 0;
 		wr->u.udpseg.r4 = 0;
 		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
 		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
 		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
 		cpl = (void *)(wr + 1);
 	} else {
 		MPASS(needs_tcp_csum(m0));
 		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
 		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
 		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
 		wr->u.tcpseg.r4 = 0;
 		wr->u.tcpseg.r5 = 0;
 		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
 
 		if (needs_tso(m0)) {
 			/* The LSO CPL sits between the WR header and the CPL. */
 			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
 
 			/* ctrl is reused here to build the LSO control word. */
 			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
 			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
 			    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 				ETHER_HDR_LEN) >> 2) |
 			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 				ctrl |= F_LSO_IPV6;
 			lso->lso_ctrl = htobe32(ctrl);
 			lso->ipid_ofst = htobe16(0);
 			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 			lso->seqno_offset = htobe32(0);
 			lso->len = htobe32(pktlen);
 
 			cpl = (void *)(lso + 1);
 		} else {
 			wr->u.tcpseg.mss = htobe16(0xffff);
 			cpl = (void *)(wr + 1);
 		}
 	}
 
 	/* Checksum offload must be requested for ethofld. */
 	MPASS(needs_l4_csum(m0));
 	ctrl1 = csum_to_ctrl(cst->adapter, m0);
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = cst->ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
 	p = (uintptr_t)(cpl + 1);
 	m_copydata(m0, 0, immhdrs, (void *)p);
 
 	/* SGL */
 	/* NOTE(review): dst is assigned here but never read in this function. */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 		int i, pad;
 
 		/*
 		 * zero-pad up to next 16Byte boundary, if not 16Byte aligned
 		 *
 		 * NOTE(review): pad evaluates to 16, not 0, when immhdrs is
 		 * a multiple of 16 -- confirm that combination of header
 		 * lengths cannot occur.
 		 */
 		p += immhdrs;
 		pad = 16 - (immhdrs & 0xf);
 		bzero((void *)p, pad);
 
 		usgl = (void *)(p + pad);
 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 		    V_ULPTX_NSGE(nsegs));
 
 		/*
 		 * Build the gather list from the mbuf chain, skipping empty
 		 * mbufs and the leading immhdrs bytes that were already
 		 * copied inline above.
 		 *
 		 * NOTE(review): the M_NOMAP branch does not apply the
 		 * remaining immhdrs offset; presumably headers never live in
 		 * an M_NOMAP mbuf -- confirm.
 		 */
 		sglist_init(&sg, nitems(segs), segs);
 		for (; m0 != NULL; m0 = m0->m_next) {
 			if (__predict_false(m0->m_len == 0))
 				continue;
 			if (immhdrs >= m0->m_len) {
 				immhdrs -= m0->m_len;
 				continue;
 			}
 			if (m0->m_flags & M_NOMAP)
 				sglist_append_mbuf_epg(&sg, m0,
 				    mtod(m0, vm_offset_t), m0->m_len);
                         else
 				sglist_append(&sg, mtod(m0, char *) + immhdrs,
 				    m0->m_len - immhdrs);
 			immhdrs = 0;
 		}
 		MPASS(sg.sg_nseg == nsegs);
 
 		/*
 		 * Zero pad last 8B in case the WR doesn't end on a 16B
 		 * boundary.
 		 */
 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
 
 		/*
 		 * Segment 0 goes in the ulptx_sgl itself; the rest are packed
 		 * two to an sge[] entry.
 		 */
 		usgl->len0 = htobe32(segs[0].ss_len);
 		usgl->addr0 = htobe64(segs[0].ss_paddr);
 		for (i = 0; i < nsegs - 1; i++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
 		}
 		/* Odd count: zero the length of the unused half of the pair. */
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 	}
 
 }
 
 /*
  * Transmit as many mbufs from cst->pending_tx as the available tx credits
  * and work request queue space allow.  Each mbuf that is sent has its work
  * request written and committed, gives up its reference on the send tag and
  * is moved to cst->pending_fwack.  The loop stops, leaving the mbuf at the
  * head of pending_tx, as soon as credits or queue space run out.
  *
  * The caller holds cst->lock.
  */
 static void
 ethofld_tx(struct cxgbe_rate_tag *cst)
 {
 	struct mbuf *m;
 	struct wrq_cookie cookie;
 	int next_credits, compl;
 	struct fw_eth_tx_eo_wr *wr;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
 		M_ASSERTPKTHDR(m);
 
 		/* How many len16 credits do we need to send this mbuf. */
 		next_credits = mbuf_eo_len16(m);
 		MPASS(next_credits > 0);
 		if (next_credits > cst->tx_credits) {
 			/*
 			 * Tx will make progress eventually because there is at
 			 * least one outstanding fw4_ack that will return
 			 * credits and kick the tx.
 			 */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie);
 		if (__predict_false(wr == NULL)) {
 			/* XXX: wishful thinking, not a real assertion. */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		cst->tx_credits -= next_credits;
 		cst->tx_nocompl += next_credits;
 		/*
 		 * Ask for a completion if none is outstanding, or once at
 		 * least half of the total credits have been used without one.
 		 */
 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
 		ETHER_BPF_MTAP(cst->com.com.ifp, m);
 		write_ethofld_wr(cst, wr, m, compl);
 		commit_wrq_wr(cst->eo_txq, wr, &cookie);
 		if (compl) {
 			cst->ncompl++;
 			cst->tx_nocompl	= 0;
 		}
 		(void) mbufq_dequeue(&cst->pending_tx);
 
 		/*
 		 * Drop the mbuf's reference on the tag now rather
 		 * than waiting until m_freem().  This ensures that
 		 * cxgbe_rate_tag_free gets called when the inp drops
 		 * its reference on the tag and there are no more
 		 * mbufs in the pending_tx queue and can flush any
 		 * pending requests.  Otherwise if the last mbuf
 		 * doesn't request a completion the etid will never be
 		 * released.
 		 */
 		m->m_pkthdr.snd_tag = NULL;
 		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		m_snd_tag_rele(&cst->com.com);
 
 		/* Keep the mbuf around until the firmware acks its WR. */
 		mbufq_enqueue(&cst->pending_fwack, m);
 	}
 }
 
 /*
  * Transmit entry point for an mbuf that carries a rate limit send tag.
  *
  * On the first packet of a flow (EO_FLOWC_PENDING) an offload tx queue and
  * an ingress queue id are chosen from the packet's flowid (or a random
  * value if the mbuf has no hash) and the initial flowc WR is sent.  The
  * packet is then appended to the tag's pending_tx queue and ethofld_tx() is
  * run to push out whatever the available credits allow.
  *
  * Returns 0 if the packet was queued.  Returns the error from
  * send_etid_flowc_wr(), or ENOBUFS if queueing the packet would take the
  * backlog past eo_max_backlog; in both of those cases m0 is freed here.
  */
 int
 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
 {
 	struct cxgbe_rate_tag *cst;
 	int rc;
 
 	MPASS(m0->m_nextpkt == NULL);
 	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
 	MPASS(m0->m_pkthdr.snd_tag != NULL);
 	cst = mst_to_crt(m0->m_pkthdr.snd_tag);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 
 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
 		struct vi_info *vi = ifp->if_softc;
 		struct port_info *pi = vi->pi;
 		struct adapter *sc = pi->adapter;
 		/* Used as a bit mask below. */
 		const uint32_t rss_mask = vi->rss_size - 1;
 		uint32_t rss_hash;
 
 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
 		if (M_HASHTYPE_ISHASH(m0))
 			rss_hash = m0->m_pkthdr.flowid;
 		else
 			rss_hash = arc4random();
 		/* We assume RSS hashing */
 		cst->iqid = vi->rss[rss_hash & rss_mask];
 		cst->eo_txq += rss_hash % vi->nofldtxq;
 		rc = send_etid_flowc_wr(cst, pi, vi);
 		if (rc != 0)
 			goto done;
 	}
 
 	/* Refuse the packet if it would take the backlog past the limit. */
 	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
 		rc = ENOBUFS;
 		goto done;
 	}
 
 	mbufq_enqueue(&cst->pending_tx, m0);
 	cst->plen += m0->m_pkthdr.len;
 
 	/*
 	 * Hold an extra reference on the tag while generating work
 	 * requests to ensure that we don't try to free the tag during
 	 * ethofld_tx() in case we are sending the final mbuf after
 	 * the inp was freed.
 	 */
 	m_snd_tag_ref(&cst->com.com);
 	ethofld_tx(cst);
 	mtx_unlock(&cst->lock);
 	m_snd_tag_rele(&cst->com.com);
 	return (0);
 
 done:
 	/* Error exit: every path that jumps here has set rc to non-zero. */
 	mtx_unlock(&cst->lock);
 	if (__predict_false(rc != 0))
 		m_freem(m0);
 	return (rc);
 }
 
 /*
  * Handler for a CPL_FW4_ACK that returns tx credits for an ethofld flow.
  *
  * The flow is looked up by the etid in the message.  Credits that belong to
  * the initial flowc WR are deducted first (if its reply was still pending),
  * then mbufs are freed from pending_fwack until the remaining credits are
  * used up.  If pending_fwack runs dry while credits remain, they are for
  * the final flush WR and the tag is freed.  Otherwise the credits are
  * returned to the tag and, if the kernel still holds its reference on the
  * tag, transmission is restarted when the head of pending_tx now fits.
  *
  * m0 is not used.  Always returns 0.
  */
 static int
 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	struct mbuf *m;
 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct cxgbe_rate_tag *cst;
 	uint8_t credits = cpl->credits;
 
 	cst = lookup_etid(sc, etid);
 	mtx_lock(&cst->lock);
 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
 		/* Part of what came back pays for the flowc WR. */
 		MPASS(credits >= ETID_FLOWC_LEN16);
 		credits -= ETID_FLOWC_LEN16;
 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
 	}
 
 	KASSERT(cst->ncompl > 0,
 	    ("%s: etid %u (%p) wasn't expecting completion.",
 	    __func__, etid, cst));
 	cst->ncompl--;
 
 	while (credits > 0) {
 		m = mbufq_dequeue(&cst->pending_fwack);
 		if (__predict_false(m == NULL)) {
 			/*
 			 * The remaining credits are for the final flush that
 			 * was issued when the tag was freed by the kernel.
 			 */
 			MPASS((cst->flags &
 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
 			    EO_FLUSH_RPL_PENDING);
 			MPASS(credits == ETID_FLUSH_LEN16);
 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
 			MPASS(cst->ncompl == 0);
 
 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
 			cst->tx_credits += cpl->credits;
 			cxgbe_rate_tag_free_locked(cst);
 			return (0);	/* cst is gone. */
 		}
 		/*
 		 * NOTE(review): this assertion cannot fire; the m == NULL
 		 * case returned just above.
 		 */
 		KASSERT(m != NULL,
 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
 		    credits));
 		KASSERT(credits >= mbuf_eo_len16(m),
 		    ("%s: too few credits (%u, %u, %u)", __func__,
 		    cpl->credits, credits, mbuf_eo_len16(m)));
 		credits -= mbuf_eo_len16(m);
 		cst->plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 
 	/* Give the tag back everything the message returned. */
 	cst->tx_credits += cpl->credits;
 	MPASS(cst->tx_credits <= cst->tx_total);
 
 	if (cst->flags & EO_SND_TAG_REF) {
 		/*
 		 * As with ethofld_transmit(), hold an extra reference
 		 * so that the tag is stable across ethofld_tx().
 		 */
 		m_snd_tag_ref(&cst->com.com);
 		m = mbufq_first(&cst->pending_tx);
 		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
 			ethofld_tx(cst);
 		mtx_unlock(&cst->lock);
 		m_snd_tag_rele(&cst->com.com);
 	} else {
 		/*
 		 * There shouldn't be any pending packets if the tag
 		 * was freed by the kernel since any pending packet
 		 * should hold a reference to the tag.
 		 */
 		MPASS(mbufq_first(&cst->pending_tx) == NULL);
 		mtx_unlock(&cst->lock);
 	}
 
 	return (0);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 360578)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 360579)
@@ -1,2304 +1,2304 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /* Forward declarations of file-local helpers. */
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
 
 /*
  * Build and send the FW_FLOWC_WR for the connection described by toep.
  *
  * PFNVFN, CH, PORT, IQID, SNDBUF and MSS are always included.  SNDNXT and
  * RCVNXT are added when tp is not NULL; when tp is NULL an MSS of 512 is
  * sent.  ULP_MODE, TXDATAPLEN_MAX and SCHEDCLASS are appended when the
  * corresponding toep state calls for them.
  *
  * The WR's credits are charged to toep and recorded in the next tx
  * descriptor, and TPF_FLOWC_WR_SENT is set.  Must not be called more than
  * once per toep.  Panics if the work request cannot be allocated.
  */
 void
 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	/*
 	 * Count the parameters up front; this must agree with the
 	 * FLOWC_PARAM() invocations below (checked by the KASSERT after
 	 * them).
 	 */
 	if (tp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
 	if (toep->params.tc_idx != -1) {
 		MPASS(toep->params.tc_idx >= 0 &&
 		    toep->params.tc_idx < sc->chip_params->nsched_cls);
 		nparams++;
 	}
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 /* Store one mnemonic/value pair and advance paramidx. */
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
 	if (tp) {
 		FLOWC_PARAM(MSS, toep->params.emss);
 		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
 	} else
 		FLOWC_PARAM(MSS, 512);
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
 	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
 
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 	if (toep->params.tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	/* Charge the WR to the connection and use up one tx descriptor. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  *
  * A rate of 0 unbinds the connection from any traffic class.  Otherwise a
  * traffic class for the rate is reserved on the connection's port.  If the
  * class differs from the one currently recorded in toep->params.tc_idx, a
  * one-parameter FW_FLOWC_WR (SCHEDCLASS) is sent to switch the connection
  * over.  The previously held class, if any, is then released.
  *
  * Returns 0 on success, the error from t4_reserve_cl_rl_kbps(), or ENOMEM
  * if there are not enough tx credits, tx descriptors or memory for the work
  * request (the newly reserved class is released in that case).
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	/*
 	 * The multiplication and division are done in 64 bits because of
 	 * the 8ULL operand.
 	 */
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 	}
 
 	if (toep->params.tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		/* Give up, and undo the reservation, if the WR cannot be sent. */
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
 			if (tc_idx >= 0)
 				t4_release_cl_rl(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		/* A class of 0xff is sent when unbinding. */
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		/* Charge the WR to the connection and use up one tx descriptor. */
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	/* Drop the previously held class and remember the new one. */
 	if (toep->params.tc_idx >= 0)
 		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
 	toep->params.tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 /*
  * Abort the connection by sending a CPL_ABORT_REQ with CPL_ABORT_SEND_RST.
  *
  * Does nothing if an abort is already in progress (TPF_ABORT_SHUTDOWN);
  * otherwise that flag is set.  The sequence number placed in the request is
  * the snd_nxt argument if the inp has been dropped, and tp->snd_nxt
  * otherwise.  The socket's send buffer is flushed when the inp is neither
  * dropped nor in timewait and still has a socket.
  *
  * The caller holds the inp write lock and the flowc WR must have been sent
  * already.  Panics if the work request cannot be allocated.
  */
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct wrqe *wr;
 	struct cpl_abort_req *req;
 	int tid = toep->tid;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	/* tp must not be used once the inp has been dropped. */
 	if (inp->inp_flags & INP_DROPPED)
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
 	/* rsvd1 is 1 if no tx data has been sent on this connection. */
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: What's the correct way to tell that the inp hasn't been detached
 	 * from its socket?  Should I even be flushing the snd buffer here?
 	 */
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)	/* because I'm not sure.  See comment above */
 			sbflush(&so->so_snd);
 	}
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Called when a connection is established to translate the TCP options
  * reported by HW to FreeBSD's native format.
  *
  * opt is expected in host byte order.  Updates the MSS, timestamp, SACK and
  * window scaling state in both tp and toep->params.  The caller holds the
  * inp lock.
  */
 static void
 assign_rxopt(struct tcpcb *tp, uint16_t opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct adapter *sc = td_adapter(toep->td);
 
 	INP_LOCK_ASSERT(inp);
 
 	/* t_maxseg is the MTU selected by HW minus the IP and TCP headers. */
 	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
 	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
 
 	/* emss additionally leaves room for the timestamp option, if in use. */
 	toep->params.emss = tp->t_maxseg;
 	if (G_TCPOPT_TSTAMP(opt)) {
 		toep->params.tstamp = 1;
 		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 	} else
 		toep->params.tstamp = 0;
 
 	if (G_TCPOPT_SACK(opt)) {
 		toep->params.sack = 1;
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	} else {
 		toep->params.sack = 0;
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 	}
 
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Doing window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	} else
 		toep->params.wscale = 0;	/* only written when not scaling */
 
 	CTR6(KTR_CXGBE,
 	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
 	    toep->tid, toep->params.mtu_idx, toep->params.emss,
 	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
 }
 
 /*
  * Completes some final bits of initialization for just established connections
  * and changes their state to TCPS_ESTABLISHED.
  *
  * The ISNs are from the exchange of SYNs.
  *
  * iss and irs are the local and peer initial sequence numbers.  opt holds
  * the TCP options reported by HW, in big-endian byte order.  The caller
  * holds the inp write lock and the connection must be in SYN_SENT or
  * SYN_RECEIVED.  The flowc WR is sent and the socket is marked connected.
  */
 void
 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	uint16_t tcpopt = be16toh(opt);
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
 	    tp->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, so, inp, tp, toep);
 
 	tcp_state_change(tp, TCPS_ESTABLISHED);
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	/* Receive side: the window is opt0_bufsize in units of 1KB. */
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/* Send side: everything starts just past our SYN. */
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	/* assign_rxopt() sets params.emss, which send_flowc_wr() reads. */
 	assign_rxopt(tp, tcpopt);
 	send_flowc_wr(toep, tp);
 
 	soisconnected(so);
 }
 
 /*
  * Return rx credits to the chip with a CPL_RX_DATA_ACK sent via
  * toep->ctrlq.  Returns the number of credits that were sent, which is 0 if
  * the work request could not be allocated.
  */
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	const uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	if ((wr = alloc_wrqe(sizeof(*ack), toep->ctrlq)) == NULL)
 		return (0);
 
 	ack = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 	ack->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
 	t4_wrq_tx(sc, wr);
 
 	return (credits);
 }
 
 /*
  * Send a CPL_RX_DATA_ACK that carries only F_RX_MODULATE_RX (no credits)
  * via toep->ctrlq.  Silently does nothing if the work request cannot be
  * allocated.
  */
 void
 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
 {
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	if ((wr = alloc_wrqe(sizeof(*ack), toep->ctrlq)) == NULL)
 		return;
 
 	ack = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 	ack->credit_dack = htobe32(F_RX_MODULATE_RX);
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Decide whether to return rx credits to the chip.
  *
  * The credits available are the amount by which the free space in the
  * receive sockbuf exceeds the current receive window.  In TLS mode any
  * outstanding toep->tls.rcv_over is paid off from them first.  The credits
  * are sent only if one of the thresholds below is met; otherwise, if
  * TPF_FORCE_CREDITS is set, an rx modulate is sent instead.
  *
  * The caller holds the inp write lock and the receive sockbuf lock.
  */
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
 	int rx_credits;
 
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 	if (ulp_mode(toep) == ULP_MODE_TLS) {
 		/* Pay off rcv_over before any credits are returned. */
 		if (toep->tls.rcv_over >= rx_credits) {
 			toep->tls.rcv_over -= rx_credits;
 			rx_credits = 0;
 		} else {
 			rx_credits -= toep->tls.rcv_over;
 			toep->tls.rcv_over = 0;
 		}
 	}
 
 	/*
 	 * Return credits when the window is small (<= 32KB), when there are
 	 * a lot of them (>= 64KB), when there are a fair number (>= 16KB)
 	 * and the window is <= 128KB, or when buffered data plus the window
 	 * falls short of the sockbuf's low water mark.
 	 */
 	if (rx_credits > 0 &&
 	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
 	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
 	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
 		/* send_rx_credits() returns 0 if nothing could be sent. */
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	} else if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
 
 /*
  * Wrapper around t4_rcvd_locked() that takes the receive sockbuf lock for
  * the duration of the call.
  */
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct sockbuf *rcv = &tp->t_inpcb->inp_socket->so_rcv;
 
 	SOCKBUF_LOCK(rcv);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(rcv);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  *
  * The request is ignored if TPF_FIN_SENT is already set.  Otherwise the
  * flowc WR must have been sent already; TPF_FIN_SENT is set and
  * TPF_SEND_FIN is cleared.  Panics if the work request cannot be allocated.
  * Always returns 0.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	const unsigned int tid = toep->tid;
 	struct cpl_close_con_req *req;
 	struct wrqe *wr;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
 	/* Nothing to do if the close request went out earlier. */
 	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	/* WR header followed by the CPL itself. */
 	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
 	req->wr.wr_lo = cpu_to_be64(0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	toep->flags = (toep->flags | TPF_FIN_SENT) & ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 
 /*
  * Maximum amount of immediate data we could stuff in a WR
  *
  * tx_credits is the number of 16B credits available.  The result is 0 if
  * that is less than MIN_OFLD_TX_CREDITS.  Otherwise it is the room left
  * after the struct fw_ofld_tx_data_wr header in the smaller of the credits
  * and one EQ_ESIZE descriptor.
  */
 static inline int
 max_imm_payload(int tx_credits)
 {
 	const int n = 1;	/* Use no more than one desc for imm. data WR */
 	int wr_bytes;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	/* Cap the WR at n descriptors even if more credits are on hand. */
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
 		wr_bytes = n * EQ_ESIZE;
 	else
 		wr_bytes = tx_credits * 16;
 
 	return (wr_bytes - sizeof(struct fw_ofld_tx_data_wr));
 }
 
 /*
  * Largest number of SGL entries that fit in a work request given the credits
  * available.  Returns 0 when there are fewer than MIN_OFLD_TX_CREDITS
  * credits.
  */
 static inline int
 max_dsgl_nsegs(int tx_credits)
 {
 	int spare_credits, spare_bytes, nseg;
 
 	KASSERT(tx_credits >= 0 && tx_credits <= MAX_OFLD_TX_CREDITS,
 	    ("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	/* Whatever exceeds the minimum is available for ulp_tx_sge_pairs. */
 	spare_credits = tx_credits - MIN_OFLD_TX_CREDITS;
 	spare_bytes = spare_credits * 16;
 
 	/* ulptx_sgl has room for 1, each 24 byte pair adds 2 more. */
 	nseg = 1 + 2 * (spare_bytes / 24);
 
 	/* 16 bytes left over are enough for half a pair. */
 	if (spare_bytes % 24 == 16)
 		nseg++;
 
 	return (nseg);
 }
 
 /*
  * Fill in the fw_ofld_tx_data_wr header at dst.
  *
  * immdlen is the amount of immediate data that follows the header, plen the
  * payload length reported in the WR, credits the WR length in 16 byte units,
  * shove and ulp_submode are placed in the flags word as-is.
  */
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
     unsigned int plen, uint8_t credits, int shove, int ulp_submode)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;
 	uint32_t flags;
 
 	/* Assemble the flags in host order and byte swap them once. */
 	flags = V_TX_ULP_MODE(ulp_mode(toep)) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove);
 
 	if (toep->params.tx_align > 0) {
 		if (plen < 2 * toep->params.emss) {
 			/* Less than two segments worth of payload. */
 			flags |= F_FW_OFLD_TX_DATA_WR_LSODISABLE;
 		} else {
 			flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLD;
 			if (toep->params.nagle != 0)
 				flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE;
 		}
 	}
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	txwr->lsodisable_to_flags = htobe32(flags);
 	txwr->plen = htobe32(plen);
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  *
  * dst:   where the struct ulptx_sgl is written.
  * start: first mbuf to describe.
  * stop:  mbuf at which the walk ends (not included); NULL walks to the end
  *        of the chain.
  * nsegs: total number of segments expected for [start, stop); must be > 0.
  * n:     capacity of the scratch sglist, i.e. the most segments any single
  *        mbuf in the range contributes.
  *
  * Panics if an mbuf cannot be appended to the scratch sglist.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];	/* scratch space, reused for every mbuf */
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/*
 	 * i is the destination slot.  -1 stands for len0/addr0 in the
 	 * ulptx_sgl itself; 0 and up index the sge pairs, two slots per pair.
 	 */
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		/* Break this one mbuf up into physical segments. */
 		if (m->m_flags & M_NOMAP)
 			rc = sglist_append_mbuf_epg(&sg, m,
 			    mtod(m, vm_offset_t), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		/* Copy the segments of this mbuf into the next free slots. */
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			/* Only needed for the final sanity check. */
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	/*
 	 * An odd i means the last pair has only its first slot filled; zero
 	 * the length of the unused second slot.
 	 */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request:
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1 entry) +
  * ulptx_sge_pair(480B, 40 entries: two entries per 24B pair)
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  *
  * The inp must be write-locked by the caller.  Nothing is done if the
  * connection is being aborted (TPF_ABORT_SHUTDOWN) or tx is suspended
  * (TPF_TX_SUSPENDED).  The function sets TPF_TX_SUSPENDED itself when it
  * runs out of credits or cannot allocate a work request.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool nomap_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
 	    ulp_mode(toep) == ULP_MODE_TLS ||
 	    ulp_mode(toep) == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	/* Apply a pending pacing rate change; retried later if it fails. */
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	/* Each pass of the loop builds and queues at most one work request. */
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		/* Limits for this WR, derived from the credits on hand. */
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		SOCKBUF_LOCK(sb);
 		/* Dropping bytes makes room, so a wakeup is owed to writers. */
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		/* Resume after the last mbuf sent, or at the head of so_snd. */
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		nomap_mbuf_seen = false;
 		/*
 		 * Gather mbufs for this WR, keeping track of the payload length
 		 * and the number of SGL segments they need.
 		 */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			if (m->m_flags & M_NOMAP) {
 #ifdef KERN_TLS
 				/*
 				 * An mbuf with a TLS session attached switches
 				 * the connection to KTLS.  Hand off right away
 				 * if nothing has been gathered, otherwise send
 				 * what has been gathered so far first.
 				 */
-				if (m->m_ext_pgs.tls != NULL) {
+				if (m->m_epg_tls != NULL) {
 					toep->flags |= TPF_KTLS;
 					if (plen == 0) {
 						SOCKBUF_UNLOCK(sb);
 						t4_push_ktls(sc, toep, 0);
 						return;
 					}
 					break;
 				}
 #endif
 				n = sglist_count_mbuf_epg(m,
 				    mtod(m, vm_offset_t), m->m_len);
 			} else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(so,
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (m->m_flags & M_NOMAP)
 				nomap_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		/*
 		 * Request a completion when the send buffer is more than 5/8
 		 * full and at least a quarter of sb_hiwat has gone out without
 		 * one.
 		 */
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		/* Send buffer autosizing: grow once the buffer is 7/8 full. */
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		/* Either path below leaves the send buffer unlocked. */
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL,
 			    ("%s: nothing to send, but m != NULL", __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * Shove only if this WR takes everything that was queued and
 		 * TF_MORETOCOME is not set.
 		 */
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		/* M_NOMAP mbufs are never copied; they always go out via DSGL. */
 		if (plen <= max_imm && !nomap_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0);
 			/* The payload is copied right behind the WR header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			/*
 			 * WR header + ulptx_sgl (holds the first segment) + 24
 			 * bytes for every further pair of segments, or 16 bytes
 			 * for a trailing unpaired one.
 			 */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/* Zero the 8 bytes of padding up to the 16 byte boundary. */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		/* Also request a completion when credits are running low. */
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		/* ULP_MODE_RDMA gets a completion for every WR. */
 		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		/* Commit the new position in the send buffer. */
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		/* Not enough credits left for even the smallest WR. */
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Remember what this WR carried and cost, in the txsd ring. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Dequeue and free whole packets from q until plen bytes, measured by
  * m_pkthdr.len, have been released.  The asserts require plen to fall on a
  * packet boundary and the queue to hold at least that much.
  */
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *m;
 
 	if (plen <= 0)
 		return;
 
 	do {
 		m = mbufq_dequeue(q);
 
 		/* Too many credits. */
 		MPASS(m != NULL);
 		M_ASSERTPKTHDR(m);
 
 		/* Partial credits. */
 		MPASS(plen >= m->m_pkthdr.len);
 
 		plen -= m->m_pkthdr.len;
 		m_freem(m);
 	} while (plen > 0);
 }
 
 /*
  * Send the PDUs queued on toep->ulp_pduq, and a FIN if one was requested and
  * the queue has been emptied.  Only valid for ULP_MODE_ISCSI (asserted).
  *
  * Every PDU is sent whole, in a single work request.  A PDU that has been
  * sent is moved to toep->ulp_pdu_reclaimq; drop is the number of bytes to
  * release from that reclaim queue before sending anything.
  *
  * The inp must be write-locked by the caller.  Nothing is done if the
  * connection is being aborted (TPF_ABORT_SHUTDOWN) or tx is suspended
  * (TPF_TX_SUSPENDED).  The function sets TPF_TX_SUSPENDED itself when the
  * PDU at the head of the queue does not fit in the available credits or a
  * work request cannot be allocated.
  */
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 	/* Extra sequence space taken up by digests, indexed by ULP submode. */
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	if (drop)
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 
 	/* The PDU stays on the queue until its work request has been built. */
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		M_ASSERTPKTHDR(sndptr);
 
 		/* Limits for this WR, derived from the credits on hand. */
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		/* Measure the whole PDU: payload bytes and SGL segments. */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/*
 			 * This mbuf would send us _over_ the nsegs limit.
 			 * Suspend tx because the PDU can't be sent out.
 			 */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
 		 * is NULL.  A PDU's length is always a multiple of 4.
 		 */
 		MPASS(m == NULL);
 		MPASS((plen & 3) == 0);
 		MPASS(sndptr->m_pkthdr.len == plen);
 
 		shove = !(tp->t_flags & TF_MORETOCOME);
 		ulp_submode = mbuf_ulp_submode(sndptr);
 		MPASS(ulp_submode < nitems(ulp_extra_len));
 
 		/*
 		 * plen doesn't include header and data digests, which are
 		 * generated and inserted in the right places by the TOE, but
 		 * they do occupy TCP sequence space and need to be accounted
 		 * for.
 		 */
 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
 		if (plen <= max_imm) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
 			    shove, ulp_submode);
 			/* The payload is copied right behind the WR header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 			/*
 			 * WR header + ulptx_sgl (holds the first segment) + 24
 			 * bytes for every further pair of segments, or 16 bytes
 			 * for a trailing unpaired one.
 			 */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
 			    shove, ulp_submode);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/* Zero the 8 bytes of padding up to the 16 byte boundary. */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/* The PDU is kept on the reclaim queue until it is dropped. */
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		/* Request a completion when credits are running low. */
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		/* Sequence space advances by the payload plus the digests. */
 		tp->snd_nxt += adjusted_plen;
 		tp->snd_max += adjusted_plen;
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		/* Not enough credits left for even the smallest WR. */
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Remember what this WR carried and cost, in the txsd ring. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Pick the transmit routine that matches the connection: iSCSI PDUs, TLS
  * records (TLSOM mode with a tx key), KTLS, or plain frames.  The first
  * match wins.
  */
 static inline void
 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 {
 
 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 		t4_push_pdus(sc, toep, drop);
 		return;
 	}
 
 	if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM) {
 		t4_push_tls_records(sc, toep, drop);
 		return;
 	}
 
 #ifdef KERN_TLS
 	if (toep->flags & TPF_KTLS) {
 		t4_push_ktls(sc, toep, drop);
 		return;
 	}
 #endif
 
 	t4_push_frames(sc, toep, drop);
 }
 
 /*
  * Push whatever is pending for transmission on tp's offloaded connection.
  * The inp must be write-locked and must not have been dropped.
  *
  * Always returns 0.
  */
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT((tp->t_inpcb->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, tp->t_inpcb));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	t4_push_data(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Note that a FIN is wanted on tp's offloaded connection (TPF_SEND_FIN) and,
  * if the connection has reached at least TCPS_ESTABLISHED, run the transmit
  * path.  The inp must be write-locked and must not have been dropped.
  *
  * Always returns 0.
  */
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT((tp->t_inpcb->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, tp->t_inpcb));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 
 	/* Too early to transmit; the flag stays set for later. */
 	if (tp->t_state < TCPS_ESTABLISHED)
 		return (0);
 
 	t4_push_data(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Reset tp's offloaded connection via send_reset().  The inp must be
  * write-locked and must not have been dropped, and the flowc work request
  * must have been sent for the tid already.
  *
  * Always returns 0.
  */
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT((tp->t_inpcb->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, tp->t_inpcb));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  *
  * Handler for CPL_PEER_CLOSE.  Accounts for the FIN in rcv_nxt, marks the
  * socket as unable to receive more, and moves the TCP state machine along.
  * Nothing is done for a connection that is already being aborted.
  *
  * Always returns 0.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_peer_close and if
 		 * this is still a synqe instead of a toepcb then the connection
 		 * must be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    toep->ddp.flags, inp);
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	tp->rcv_nxt++;	/* FIN */
 
 	so = inp->inp_socket;
 	socantrcvmore(so);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		/* Let DDP deal with the close if a DDP buffer is still active. */
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 
 	/* The sequence number check is skipped for ULP_MODE_RDMA. */
 	if (ulp_mode(toep) != ULP_MODE_RDMA) {
 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    		be32toh(cpl->rcv_nxt)));
 	}
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tcp_state_change(tp, TCPS_CLOSING);
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * The inp is expected to be unlocked once tcp_twstart returns
 		 * (asserted below), so it is locked again before
 		 * final_cpl_received.  Nothing here unlocks it afterwards;
 		 * presumably final_cpl_received does -- confirm.
 		 */
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  *
  * Handler for CPL_CLOSE_CON_RPL.  Updates snd_una from the CPL and moves the
  * TCP state machine along.  Nothing is done for a connection that is already
  * being aborted.
  *
  * Always returns 0.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		tcp_twstart(tp);
 		/*
 		 * Shared exit for TCPS_CLOSING and TCPS_LAST_ACK.  The inp is
 		 * expected to be unlocked on arrival (asserted); it is locked
 		 * again for final_cpl_received.
 		 */
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		/*
 		 * A non-NULL return from tcp_close means the inp lock is still
 		 * held here; drop it so that the assertion at release holds.
 		 */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		/* The receive side has already shut down. */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Queue a CPL_ABORT_RPL for tid on ofld_txq.  rst_status is stored in the
  * CPL's cmd field as-is.  Panics if the work request cannot be allocated.
  */
 void
 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
     int rst_status)
 {
 	struct cpl_abort_rpl *rpl;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(struct cpl_abort_rpl), ofld_txq);
 	if (wr == NULL)
 		panic("%s: allocation failure.", __func__);	/* XXX */
 
 	rpl = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(rpl, CPL_ABORT_RPL, tid);
 	rpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Translate the abort reason reported by the hardware into an errno value.
  * Resets map to EPIPE in TCPS_CLOSE_WAIT and to ECONNRESET in any other
  * state, the timeout reasons map to ETIMEDOUT, everything else to EIO.
  */
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 	int error;
 
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		if (tp->t_state == TCPS_CLOSE_WAIT)
 			error = EPIPE;
 		else
 			error = ECONNRESET;
 		break;
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		error = ETIMEDOUT;
 		break;
 	default:
 		error = EIO;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  *
  * Handler for CPL_ABORT_REQ_RSS.  Synq entries are passed on to
  * do_abort_req_synqe and negative advice is ignored.  Otherwise the
  * connection is torn down here, unless an abort initiated by the driver is
  * already in progress, and a CPL_ABORT_RPL is sent back to the hardware.
  *
  * Always returns 0 (or whatever do_abort_req_synqe returns).
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	/* Saved up front; the reply is sent after final_cpl_received(toep). */
 	struct sge_wrq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		/* Report the reason for the abort on the socket. */
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	/*
 	 * Entered with the inp locked.  Nothing after this point unlocks it,
 	 * so final_cpl_received presumably does -- confirm.
 	 */
 	final_cpl_received(toep);
 done:
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Reply to the CPL_ABORT_REQ (send_reset)
  *
  * Handler for CPL_ABORT_RPL_RSS.  Synq entries are passed on to
  * do_abort_rpl_synqe.  For everything else this is the last CPL for the tid:
  * the inp is locked and final_cpl_received is called.
  *
  * Always returns 0 (or whatever do_abort_rpl_synqe returns).
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_abort_rpl_rss *cpl;
 	struct toepcb *toep;
 	struct inpcb *inp;
 	unsigned int tid;
 
 	cpl = (const void *)(rss + 1);
 	tid = GET_TID(cpl);
 	toep = lookup_tid(iq->adapter, tid);
 	inp = toep->inp;
 
 	KASSERT(G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))) == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__,
 	    G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)))));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	/* A reply only makes sense if an abort is in progress. */
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	/*
 	 * Nothing here unlocks the inp afterwards; presumably
 	 * final_cpl_received does -- confirm.
 	 */
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_DATA: payload has arrived on an offloaded connection.
  * m starts with the CPL header, which is stripped before the remainder is
  * appended to the socket's receive buffer.
  *
  * The payload is freed instead if the tid still belongs to a synq entry, if
  * the inp has been dropped or is in timewait, or if the socket can no longer
  * receive (in which case the connection is dropped with ECONNRESET).
  *
  * Always returns 0.
  */
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct epoch_tracker et;
 	int len, rx_credits;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_rx_data and if this
 		 * is still a synqe instead of a toepcb then the connection must
 		 * be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * A gap between rcv_nxt and the CPL's sequence number is passed to
 	 * insert_ddp_data below if the connection fell out of DDP mode.
 	 */
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	/* Only ULP_MODE_RDMA may receive more than the window allows. */
 	if (tp->rcv_wnd < len) {
 		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	/* Lock order here: inp, then DDP (if in use), then the sockbuf. */
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * Everything was unlocked above so that the inp lock can be
 		 * retaken inside the net epoch for tcp_drop.
 		 */
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		/* Set when DDP_ON and the CPL's ddp_off bit disagree. */
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			/* A pending DDP_SC_REQ is cleared and DDP_ON is toggled. */
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else {
 				KASSERT(cpl->ddp_off == 1,
 				    ("%s: DDP switched on by itself.",
 				    __func__));
 
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
 	sbappendstream_locked(sb, m, 0);
 	/*
 	 * Credits are the buffer space not yet covered by the receive window.
 	 * They are returned right away only while buffered data plus the
 	 * window is below the buffer's low-water mark.
 	 */
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	/* The sockbuf is expected to be unlocked once this returns (asserted). */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Handler for CPL_FW4_ACK: the firmware is returning tx credits for work
  * requests previously sent on an offloaded connection.
  *
  * Walks the tx descriptor ring from txsd_cidx, returning each descriptor's
  * credits to the toep and totalling the payload bytes (plen) they covered.
  * It then either resumes a suspended transmit or drops the acknowledged
  * bytes from the send buffer and wakes up writers.  Always returns 0.
  */
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	/* Connection is being aborted; the credits are of no further use. */
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	/* The CPL carries a valid snd_una only when SEQVAL is set. */
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	/*
 	 * Retire tx descriptors in order until the returned credits are used
 	 * up.  Credits must cover whole descriptors (asserted below).
 	 */
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		if (txsd->iv_buffer) {
 			free(txsd->iv_buffer, M_CXGBE);
 			txsd->iv_buffer = NULL;
 		}
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		/* The descriptor array is a ring; wrap the consumer index. */
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	/* Every credit is back, so nothing is outstanding without completion. */
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	/*
 	 * Resume a suspended transmit once at least a quarter of the total
 	 * credits are available again.
 	 */
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		t4_push_data(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data transmitted before the tid's ULP
 				 * mode changed to ISCSI is still in so_snd.
 				 * Incoming credits should account for so_snd
 				 * first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			/* Keep the TLS send-buffer offset in step with the drop. */
 			if (tls_tx_key(toep) &&
 			    toep->tls.mode == TLS_MODE_TLSOM) {
 				struct tls_ofld_info *tls_ofld = &toep->tls;
 
 				MPASS(tls_ofld->sb_off >= plen);
 				tls_ofld->sb_off -= plen;
 			}
 			/* Space was freed; let queued AIO writes make progress. */
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Build and send a CPL_SET_TCB_FIELD work request that updates the bits
  * selected by 'mask' in TCB word 'word' to 'val' for toep's tid.
  *
  * A reply is requested only when 'reply' is non-zero, in which case
  * 'cookie' must not be CPL_COOKIE_RESERVED.  When the request goes out on
  * an offload queue, a tx descriptor and credits are charged to the toep.
  * Panics if the work request cannot be allocated.
  */
 void
 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct cpl_set_tcb_field *cpl;
 	struct ofld_tx_sdesc *sd;
 	struct wrqe *wr;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	MPASS(!reply || cookie != CPL_COOKIE_RESERVED);
 
 	wr = alloc_wrqe(sizeof(*cpl), wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	cpl = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(cpl, CPL_SET_TCB_FIELD, toep->tid);
 	cpl->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	if (!reply)
 		cpl->reply_ctrl |= htobe16(F_NO_REPLY);
 	cpl->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	cpl->mask = htobe64(mask);
 	cpl->val = htobe64(val);
 
 	/* Only offload queues are subject to per-connection tx credits. */
 	if ((wrq->eq.flags & EQ_TYPEMASK) != EQ_OFLD) {
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	sd = &toep->txsd[toep->txsd_pidx];
 	sd->tx_credits = howmany(sizeof(*cpl), 16);
 	sd->plen = 0;
 	KASSERT(toep->tx_credits >= sd->tx_credits &&
 	    toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__,
 	    toep->tx_credits));
 	toep->tx_credits -= sd->tx_credits;
 	toep->txsd_pidx++;
 	if (__predict_false(toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Register this file's CPL message handlers.  CPL_ABORT_RPL_RSS and
  * CPL_FW4_ACK are registered as shared handlers under CPL_COOKIE_TOM.
  */
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 	    CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 }
 
 /*
  * Undo t4_init_cpl_io_handlers() by registering a NULL handler for each
  * CPL opcode that it claimed.
  */
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 }
 
 /*
  * Use the 'backend1' field in AIO jobs to hold an error that should
  * be reported when the job is completed, the 'backend3' field to
  * store the amount of data sent by the AIO job so far, and the
  * 'backend4' field to hold a reference count on the job.
  *
  * Each unmapped mbuf holds a reference on the job as does the queue
  * so long as the job is queued.
  */
 #define	aio_error	backend1
 #define	aio_sent	backend3
 #define	aio_refs	backend4
 
 /*
  * Fetch the tid of the offloaded connection behind an AIO job by following
  * the job's file to its socket, then to the tcpcb and its toepcb.
  */
 #define	jobtotid(job)							\
 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
 
 /*
  * Release one reference on an AIO transmit job.  Whoever drops the last
  * reference finishes the job: it is cancelled if the recorded error is
  * ECANCELED, failed if any other error is recorded, and otherwise
  * completed with the number of bytes sent.  A job that sent any data is
  * always reported as a success.
  */
 static void
 aiotx_free_job(struct kaiocb *job)
 {
 	long nsent;
 	int err;
 
 	/* Other references remain; the job is not finished yet. */
 	if (!refcount_release(&job->aio_refs))
 		return;
 
 	err = (intptr_t)job->aio_error;
 	nsent = job->aio_sent;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, nsent, err);
 #endif
 	/* A partial send hides the error and reports the byte count. */
 	if (nsent != 0)
 		err = 0;
 
 	if (err == ECANCELED) {
 		aio_cancel(job);
 		return;
 	}
 	if (err != 0) {
 		aio_complete(job, -1, err);
 		return;
 	}
 
 	job->msgsnd = 1;
 	aio_complete(job, nsent, 0);
 }
 
 /*
  * Free routine installed on the unmapped mbufs built by alloc_aiotx_mbuf().
  * Unwires every page the mbuf refers to and then drops the reference the
  * mbuf held on its AIO job (stored in ext_arg1).
  */
 static void
 aiotx_free_pgs(struct mbuf *m)
 {
 	struct kaiocb *job;
 	vm_page_t pg;
 
 	MBUF_EXT_PGS_ASSERT(m);
 	job = m->m_ext.ext_arg1;
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(job));
 #endif
 
-	for (int i = 0; i < m->m_ext_pgs.npgs; i++) {
+	for (int i = 0; i < m->m_epg_npgs; i++) {
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		vm_page_unwire(pg, PQ_ACTIVE);
 	}
 
 	/* May complete the job if this was the last reference. */
 	aiotx_free_job(job);
 }
 
 /*
  * Allocate a chain of unmapped mbufs describing the next 'len' bytes
  * of an AIO job.
  *
  * The bytes start at offset aio_sent within the job's user buffer.  Each
  * mbuf covers at most MBUF_PEXT_MAX_PGS pages and takes its own reference
  * on the job.  If holding the user pages or allocating an mbuf fails part
  * way through, the chain built so far is returned, so the result may be
  * shorter than 'len' or NULL.
  */
 static struct mbuf *
 alloc_aiotx_mbuf(struct kaiocb *job, int len)
 {
 	struct vmspace *vm;
 	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
 	struct mbuf *m, *top, *last;
 	vm_map_t map;
 	vm_offset_t start;
 	int i, mlen, npages, pgoff;
 
 	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
 	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
 	    job, len));
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
 	pgoff = start & PAGE_MASK;
 
 	top = NULL;
 	last = NULL;
 	while (len > 0) {
 		/*
 		 * Only the first mbuf can start mid-page; pgoff is reset to 0
 		 * at the bottom of the loop.
 		 */
 		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
 		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
 		    ("%s: next start (%#jx + %#x) is not page aligned",
 		    __func__, (uintmax_t)start, mlen));
 
 		npages = vm_fault_quick_hold_pages(map, start, mlen,
 		    VM_PROT_WRITE, pgs, nitems(pgs));
 		if (npages < 0)
 			break;
 
 		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
 		if (m == NULL) {
 			vm_page_unhold_pages(pgs, npages);
 			break;
 		}
 
-		m->m_ext_pgs.first_pg_off = pgoff;
-		m->m_ext_pgs.npgs = npages;
+		m->m_epg_1st_off = pgoff;
+		m->m_epg_npgs = npages;
 		if (npages == 1) {
 			KASSERT(mlen + pgoff <= PAGE_SIZE,
 			    ("%s: single page is too large (off %d len %d)",
 			    __func__, pgoff, mlen));
-			m->m_ext_pgs.last_pg_len = mlen;
+			m->m_epg_last_len = mlen;
 		} else {
 			/*
 			 * Last page length is what remains after the partial
 			 * first page and the full middle pages.
 			 */
-			m->m_ext_pgs.last_pg_len = mlen - (PAGE_SIZE - pgoff) -
+			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
 			    (npages - 2) * PAGE_SIZE;
 		}
 		for (i = 0; i < npages; i++)
 			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
 
 		m->m_len = mlen;
 		m->m_ext.ext_size = npages * PAGE_SIZE;
 		m->m_ext.ext_arg1 = job;
 		/* Dropped by aiotx_free_pgs() when this mbuf is freed. */
 		refcount_acquire(&job->aio_refs);
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
 		    __func__, jobtotid(job), m, job, npages);
 #endif
 
 		if (top == NULL)
 			top = m;
 		else
 			last->m_next = m;
 		last = m;
 
 		len -= mlen;
 		start += mlen;
 		pgoff = 0;
 	}
 
 	return (top);
 }
 
 /*
  * Push as much of one AIO write job as the send buffer currently allows.
  *
  * Called with so_snd locked.  The lock is dropped on entry and is held
  * again on every return path.  The job has already been removed from
  * toep->aiotx_jobq and owns the queue's reference.  On return the job has
  * either been requeued at the head of aiotx_jobq (not enough room, or a
  * blocking socket with data still to send) or the queue's reference has
  * been dropped via aiotx_free_job(), with any error recorded in aio_error.
  */
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct sockbuf *sb;
 	struct file *fp;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	int error, len;
 	bool moretocome, sendmore;
 
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	fp = job->fd_file;
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(fp->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* Inline sosend_generic(). */
 
 	error = sblock(sb, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			sbunlock(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than
 	 * a single sndbuf at a time.
 	 */
 	len = sbspace(sb);
 	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
 		len = job->uaiocb.aio_nbytes - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (len > toep->params.sndbuf) {
 		len = toep->params.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	/* Other queued jobs will follow this one. */
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(len != 0);
 
 	/* May return a chain shorter than 'len'; aio_sent uses its real length. */
 	m = alloc_aiotx_mbuf(job, len);
 	if (m == NULL) {
 		sbunlock(sb);
 		error = EFAULT;
 		goto out;
 	}
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		sbunlock(sb);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	job->aio_sent += m_length(m, NULL);
 
 	/* The socket buffer now owns the chain. */
 	sbappendstream(sb, m, 0);
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tp->t_fb->tfb_tcp_output(tp);
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	sbunlock(sb);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If this is a blocking socket and the request has not been
 	 * fully completed, requeue it until the socket is ready
 	 * again.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		/* Return with so_snd still locked, as the caller expects. */
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop the queue's
 	 * reference to the job.  Any mbufs in flight should still
 	 * hold a reference, but this drops the reference that the
 	 * queue owns while it is waiting to queue mbufs to the
 	 * socket.
 	 */
 	aiotx_free_job(job);
 
 out:
 	if (error) {
 		job->aio_error = (void *)(intptr_t)error;
 		aiotx_free_job(job);
 	}
 	/*
 	 * 'm' may be a multi-mbuf chain from alloc_aiotx_mbuf(), so free the
 	 * whole chain.  Freeing only the head would leak the remaining mbufs
 	 * along with their job references and wired pages.  m_freem() accepts
 	 * NULL.
 	 */
 	m_freem(m);
 	SOCKBUF_LOCK(sb);
 }
 
 /*
  * Task handler that drains toep->aiotx_jobq while the socket is writeable.
  *
  * Runs with the socket and toepcb references taken by
  * t4_aiotx_queue_toep() and releases both before returning.  Jobs whose
  * cancel function can no longer be cleared are skipped.
  */
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 	struct socket *so;
 	struct kaiocb *job;
 
 	so = toep->aiotx_so;
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(&so->so_snd);
 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		/* Drops and re-takes the so_snd lock internally. */
 		t4_aiotx_process_job(toep, so, job);
 	}
 	/* Allow t4_aiotx_queue_toep() to schedule the task again. */
 	toep->aiotx_so = NULL;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	CURVNET_RESTORE();
 
 	free_toepcb(toep);
 	SOCK_LOCK(so);
 	sorele(so);
 }
 
 /*
  * Schedule the AIO transmit task for this connection unless it is already
  * pending (aiotx_so is set).  Takes a socket reference and a toepcb hold
  * that t4_aiotx_task() releases.  The send buffer lock must be held.
  */
 static void
 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
 #endif
 	if (toep->aiotx_so == NULL) {
 		soref(so);
 		toep->aiotx_so = so;
 		hold_toepcb(toep);
 		soaio_enqueue(&toep->aiotx_task);
 	}
 }
 
 /*
  * Cancel routine for queued AIO write jobs.  Unlinks the job from the
  * connection's job queue unless the cancel function had already been
  * cleared, then records ECANCELED and drops the queue's reference.
  */
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct sockbuf *snd;
 
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	snd = &so->so_snd;
 
 	SOCKBUF_LOCK(snd);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(snd);
 
 	job->aio_error = (void *)(intptr_t)ECANCELED;
 	aiotx_free_job(job);
 }
 
 /*
  * Queue an AIO write job on an offloaded socket for zero-copy transmit.
  *
  * Returns EOPNOTSUPP when the job is not a write, when tx_zcopy is
  * disabled on the adapter, or when the connection has a TLS transmit key.
  * Otherwise the job is appended to the connection's queue and 0 is
  * returned.
  */
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/* This only handles plain (non-TLS) zero-copy writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE || !sc->tt.tx_zcopy ||
 	    tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(&so->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	/* This initial reference belongs to the queue. */
 	refcount_init(&job->aio_refs, 1);
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(so, toep);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	return (0);
 }
 
 /*
  * Initialize the AIO transmit state of a toepcb: an empty job queue and
  * the task that runs t4_aiotx_task() with the toepcb as its context.
  */
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->aiotx_jobq);
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_tls.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_tls.c	(revision 360578)
+++ head/sys/dev/cxgbe/tom/t4_tls.c	(revision 360579)
@@ -1,2242 +1,2242 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2017-2018 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_kern_tls.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #ifdef KERN_TLS
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 #endif
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "crypto/t4_crypto.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /*
  * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while
  * the mbuf is in the ulp_pdu_reclaimq.
  */
 #define	tls_tcp_seq	PH_loc.thirtytwo[0]
 
 /*
  * Handshake lock used for the handshake timer.  Having a global lock
  * is perhaps not ideal, but it avoids having to use callout_drain()
  * in tls_uninit_toep() which can't block.  Also, the timer shouldn't
  * actually fire for most connections.
  */
 static struct mtx tls_handshake_lock;
 
 /*
  * Update a TCB field for this connection over its offload tx queue,
  * without requesting a reply (reply and cookie are both 0).
  */
 static void
 t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
     uint64_t val)
 {
 
 	t4_set_tcb_field(td_adapter(toep->td), toep->ofld_txq, toep, word,
 	    mask, val, 0, 0);
 }
 
 /* TLS and DTLS common routines */
 
 /*
  * TLS offload is usable when it is enabled in the TOE tunables and the
  * adapter reports the TLS keys crypto capability.
  */
 bool
 can_tls_offload(struct adapter *sc)
 {
 
 	if (!sc->tt.tls)
 		return (false);
 	return ((sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) != 0);
 }
 
 /* Non-zero when a transmit key address is recorded (tx_key_addr >= 0). */
 int
 tls_tx_key(struct toepcb *toep)
 {
 
 	return (toep->tls.tx_key_addr >= 0);
 }
 
 /* Non-zero when a receive key address is recorded (rx_key_addr >= 0). */
 int
 tls_rx_key(struct toepcb *toep)
 {
 
 	return (toep->tls.rx_key_addr >= 0);
 }
 
 /*
  * Size of the key material for this connection: the tx key info size when
  * the key location is immediate, otherwise KEY_IN_DDR_SIZE.
  */
 static int
 key_size(struct toepcb *toep)
 {
 
 	if (toep->tls.key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE)
 		return (toep->tls.k_ctx.tx_key_info_size);
 	return (KEY_IN_DDR_SIZE);
 }
 
 /*
  * Set TLS Key-Id in TCB
  *
  * Writes key_id into the RX TLS key tag field of the TCB.
  *
  * NOTE(review): the mask is built from M_TCB_RX_TLS_BUF_TAG although the
  * field being written is RX_TLS_KEY_TAG; confirm that the two masks have
  * the same width or whether M_TCB_RX_TLS_KEY_TAG was intended.
  */
 static void
 t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
 			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
 			 V_TCB_RX_TLS_KEY_TAG(key_id));
 }
 
 /*
  * Clear TF_RX_QUIESCE to re-enable receive.
  *
  * Done by writing 0 to the TF_RX_QUIESCE bit of the TCB's T_FLAGS word.
  */
 static void
 t4_clear_rx_quiesce(struct toepcb *toep)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
 }
 
 /*
  * Stop the handshake timer, rewrite the TCB's ULP_RAW field so that only
  * TF_TLS_ENABLE is set, and re-enable receive.
  */
 static void
 tls_clr_ofld_mode(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 
 	/* Operate in PDU extraction mode only. */
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 	t4_clear_rx_quiesce(toep);
 }
 
 /* Stop the handshake timer and re-enable receive on the connection. */
 static void
 tls_clr_quiesce(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 	t4_clear_rx_quiesce(toep);
 }
 
 /*
  * Calculate the TLS data expansion size
  *
  * Returns the number of extra bytes (header, IV/explicit data, MAC/tag and
  * padding) needed to carry 'data_len' bytes of payload split into records
  * of at most k_ctx.frag_size bytes.
  *
  * When 'full_pdus_only' is set, '*pdus_per_ulp' is written with the number
  * of whole PDUs that fit, clamped to the range 1..32, and the return value
  * is the expansion for that many PDUs.  'pdus_per_ulp' is dereferenced
  * only in that case.  Returns 0 when the cipher mode is SCMD_CIPH_MODE_NOP.
  */
 static int
 tls_expansion_size(struct toepcb *toep, int data_len, int full_pdus_only,
     unsigned short *pdus_per_ulp)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_scmd *scmd = &tls_ofld->scmd0;
 	int expn_size = 0, frag_count = 0, pad_per_pdu = 0,
 	    pad_last_pdu = 0, last_frag_size = 0, max_frag_size = 0;
 	int exp_per_pdu = 0;
 	int hdr_len = TLS_HEADER_LENGTH;
 
 	/* do { } while (0) is used only so that 'break' can exit early. */
 	do {
 		max_frag_size = tls_ofld->k_ctx.frag_size;
 		if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) ==
 		   SCMD_CIPH_MODE_AES_GCM) {
 			/* GCM: fixed per-record overhead, no padding. */
 			frag_count = (data_len / max_frag_size);
 			exp_per_pdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
 				hdr_len;
 			expn_size =  frag_count * exp_per_pdu;
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if(!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			/* A trailing partial record costs one more overhead. */
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				frag_count += 1;
 				expn_size += exp_per_pdu;
 			}
 			break;
 		} else if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) !=
 			   SCMD_CIPH_MODE_NOP) {
 			/* Calculate the number of fragments we can make */
 			frag_count  = (data_len / max_frag_size);
 			if (frag_count > 0) {
 				/*
 				 * Pad payload + MAC up to a multiple of the
 				 * cipher block size; an already aligned record
 				 * gets a full block of padding.
 				 */
 				pad_per_pdu = (((howmany((max_frag_size +
 						       tls_ofld->mac_length),
 						      CIPHER_BLOCK_SIZE)) *
 						CIPHER_BLOCK_SIZE) -
 					       (max_frag_size +
 						tls_ofld->mac_length));
 				if (!pad_per_pdu)
 					pad_per_pdu = CIPHER_BLOCK_SIZE;
 				exp_per_pdu = pad_per_pdu +
 				       	tls_ofld->mac_length +
 					hdr_len + CIPHER_BLOCK_SIZE;
 				expn_size = frag_count * exp_per_pdu;
 			}
 			/*
 			 * Note: exp_per_pdu is still 0 here when data_len is
 			 * smaller than one full fragment.
 			 */
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if (!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			/* Consider the last fragment */
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				pad_last_pdu = (((howmany((last_frag_size +
 							tls_ofld->mac_length),
 						       CIPHER_BLOCK_SIZE)) *
 						 CIPHER_BLOCK_SIZE) -
 						(last_frag_size +
 						 tls_ofld->mac_length));
 				if (!pad_last_pdu)
 					pad_last_pdu = CIPHER_BLOCK_SIZE;
 				expn_size += (pad_last_pdu +
 					      tls_ofld->mac_length + hdr_len +
 					      CIPHER_BLOCK_SIZE);
 			}
 		}
 	} while (0);
 
 	return (expn_size);
 }
 
 /*
  * Copy Key to WR
  *
  * Emits the transmit key into the work request at 'dst'.  For a key kept
  * in DDR this is a NOOP sub-command followed by a memory-read sub-command
  * that refers to the key's address; for an immediate key the key context
  * itself is copied.  Nothing is written when there is no tx key info.
  */
 static void
 tls_copy_tx_key(struct toepcb *toep, void *dst)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct ulptx_idata *noop;
 	struct ulptx_sc_memrd *memrd;
 
 	if (tls_ofld->k_ctx.tx_key_info_size <= 0)
 		return;
 
 	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) {
 		memcpy(dst, &tls_ofld->k_ctx.tx,
 		    tls_ofld->k_ctx.tx_key_info_size);
 		return;
 	}
 	if (tls_ofld->key_location != TLS_SFO_WR_CONTEXTLOC_DDR)
 		return;
 
 	noop = dst;
 	noop->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	noop->len = htobe32(0);
 
 	memrd = (struct ulptx_sc_memrd *)(noop + 1);
 	memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 	    V_ULP_TX_SC_MORE(1) |
 	    V_ULPTX_LEN16(tls_ofld->k_ctx.tx_key_info_size >> 4));
 	memrd->addr = htobe32(tls_ofld->tx_key_addr >> 5);
 }
 
 /*
  * TLS/DTLS content type for CPL SFO
  *
  * Maps a record content type to the matching CPL_TX_TLS_SFO_TYPE_* value.
  * Anything other than CCS, alert, handshake or heartbeat maps to DATA.
  */
 static inline unsigned char
 tls_content_type(unsigned char content_type)
 {
 	/*
 	 * XXX: Shouldn't this map CONTENT_TYPE_APP_DATA to DATA and
 	 * default to "CUSTOM" for all other types including
 	 * heartbeat?
 	 */
 	if (content_type == CONTENT_TYPE_CCS)
 		return CPL_TX_TLS_SFO_TYPE_CCS;
 	if (content_type == CONTENT_TYPE_ALERT)
 		return CPL_TX_TLS_SFO_TYPE_ALERT;
 	if (content_type == CONTENT_TYPE_HANDSHAKE)
 		return CPL_TX_TLS_SFO_TYPE_HANDSHAKE;
 	if (content_type == CONTENT_TYPE_HEARTBEAT)
 		return CPL_TX_TLS_SFO_TYPE_HEARTBEAT;
 	return CPL_TX_TLS_SFO_TYPE_DATA;
 }
 
 /*
  * Translate a cipher key size selector (AES_*) into the CH_CK_SIZE_* code
  * used in key contexts.  AES_NOP yields 15; unknown selectors are treated
  * as AES-256.
  */
 static unsigned char
 get_cipher_key_size(unsigned int ck_size)
 {
 	if (ck_size == AES_NOP)
 		return 15;
 	if (ck_size == AES_128)
 		return CH_CK_SIZE_128;
 	if (ck_size == AES_192)
 		return CH_CK_SIZE_192;
 	/* AES_256 and anything unrecognized. */
 	return CH_CK_SIZE_256;
 }
 
 /*
  * Translate a MAC algorithm selector (SHA_* / SHA1) into the CH_MK_SIZE_*
  * code used in key contexts.  Unknown selectors are treated like SHA1.
  */
 static unsigned char
 get_mac_key_size(unsigned int mk_size)
 {
 	if (mk_size == SHA_NOP)
 		return CH_MK_SIZE_128;
 	/* GHASH, SHA-512 and SHA-384 all use the 512 code. */
 	if (mk_size == SHA_GHASH || mk_size == SHA_512 || mk_size == SHA_384)
 		return CH_MK_SIZE_512;
 	if (mk_size == SHA_224)
 		return CH_MK_SIZE_192;
 	if (mk_size == SHA_256)
 		return CH_MK_SIZE_256;
 	/* SHA1 and anything unrecognized. */
 	return CH_MK_SIZE_160;
 }
 
 /*
  * Map a TLS1_2/TLS1_1/DTLS1_2 protocol version constant to the matching
  * *_VERSION code; any other version yields TLS_VERSION_MAX.
  */
 static unsigned int
 get_proto_ver(int proto_ver)
 {
 	if (proto_ver == TLS1_2_VERSION)
 		return TLS_1_2_VERSION;
 	if (proto_ver == TLS1_1_VERSION)
 		return TLS_1_1_VERSION;
 	if (proto_ver == DTLS1_2_VERSION)
 		return DTLS_1_2_VERSION;
 	return TLS_VERSION_MAX;
 }
 
 /*
  * Fill in the ivinsert_to_authinsrt word of a receive key context header.
  *
  * The GCM and non-GCM layouts differ in the auth/cipher start and stop
  * offsets and the auth-insert value.  For GCM the RXOPAD_PRESENT and
  * CIPHAUTHSEQCTRL bits are also cleared from the header fields that the
  * caller has already filled in.
  */
 static void
 tls_rxkey_flit1(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		kwr->u.rxhdr.ivinsert_to_authinsrt =
 		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(14ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(16ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(14ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHINSRT(16ULL));
 		kwr->u.rxhdr.ivpresent_to_rxmk_size &=
 			~(V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1));
 		kwr->u.rxhdr.authmode_to_rxvalid &=
 			~(V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1));
 	} else {
 		kwr->u.rxhdr.ivinsert_to_authinsrt =
 		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(22ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(22ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHINSRT(0ULL));
 	}
 }
 
 /*
  * Rx key
  *
  * Build the receive key context 'kwr' from the key material and
  * parameters in 'kctx': header fields first, then the key bytes.
  */
 static void
 prepare_rxkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 	int proto_ver = kctx->proto_ver;
 
 	/* Key info size in 16-byte units, combined with the HMAC control. */
 	kwr->u.rxhdr.flitcnt_hmacctrl =
 		((kctx->rx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;
 
 	kwr->u.rxhdr.protover_ciphmode =
 		V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) |
 		V_TLS_KEYCTX_TX_WR_CIPHMODE(kctx->state.enc_mode);
 
 	kwr->u.rxhdr.authmode_to_rxvalid =
 		V_TLS_KEYCTX_TX_WR_AUTHMODE(kctx->state.auth_mode) |
 		V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1) |
 		V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(3) |
 		V_TLS_KEYCTX_TX_WR_RXVALID(1);
 
 	kwr->u.rxhdr.ivpresent_to_rxmk_size =
 		V_TLS_KEYCTX_TX_WR_IVPRESENT(0) |
 		V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_RXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_RXMK_SIZE(get_mac_key_size(mk_size));
 
 	/* May clear bits set above when the cipher mode is GCM. */
 	tls_rxkey_flit1(kwr, kctx);
 
 	/* No key reversal for GCM */
 	if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) {
 		/* Decryption key first, then the ipad/opad that follow it. */
 		t4_aes_getdeckey(kwr->keys.edkey, kctx->rx.key,
 				 (kctx->cipher_secret_size << 3));
 		memcpy(kwr->keys.edkey + kctx->cipher_secret_size,
 		       kctx->rx.key + kctx->cipher_secret_size,
 		       (IPAD_SIZE + OPAD_SIZE));
 	} else {
 		memcpy(kwr->keys.edkey, kctx->rx.key,
 		       (kctx->rx_key_info_size - SALT_SIZE));
 		memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE);
 	}
 }
 
 /*
  * Tx key
  *
  * Build the transmit key context 'kwr' from 'kctx'.  The
  * dualck_to_txvalid field is assembled in host order and converted with
  * htons() as the last step.
  */
 static void
 prepare_txkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 
 	/* Context length in 16-byte units. */
 	kwr->u.txhdr.ctxlen =
 		(kctx->tx_key_info_size >> 4);
 	kwr->u.txhdr.dualck_to_txvalid =
 		V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_TXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_TXMK_SIZE(get_mac_key_size(mk_size)) |
 		V_TLS_KEYCTX_TX_WR_TXVALID(1);
 
 	memcpy(kwr->keys.edkey, kctx->tx.key, HDR_KCTX_SIZE);
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		/* GCM supplies a salt and has no opad. */
 		memcpy(kwr->u.txhdr.txsalt, kctx->tx.salt, SALT_SIZE);
 		kwr->u.txhdr.dualck_to_txvalid &=
 			~(V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1));
 	}
 	kwr->u.txhdr.dualck_to_txvalid = htons(kwr->u.txhdr.dualck_to_txvalid);
 }
 
 /* TLS Key memory management */
 
 /*
  * Allocate TLS_KEY_CONTEXT_SZ bytes from the adapter's key map without
  * sleeping.  Returns the allocated address, or -1 on failure.
  */
 static int
 get_new_keyid(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	vmem_addr_t key_addr;
 	int rc;
 
 	rc = vmem_alloc(sc->key_map, TLS_KEY_CONTEXT_SZ,
 	    M_NOWAIT | M_FIRSTFIT, &key_addr);
 	if (rc != 0)
 		return (-1);
 
 	return (key_addr);
 }
 
 /* Return a key context previously obtained from the adapter's key map. */
 static void
 free_keyid(struct toepcb *toep, int keyid)
 {
 
 	vmem_free(td_adapter(toep->td)->key_map, keyid, TLS_KEY_CONTEXT_SZ);
 }
 
 /*
  * Free whichever of the receive and transmit key contexts are currently
  * allocated and mark them unallocated (-1).
  */
 static void
 clear_tls_keyid(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls = &toep->tls;
 
 	if (tls_rx_key(toep)) {
 		free_keyid(toep, tls->rx_key_addr);
 		tls->rx_key_addr = -1;
 	}
 	if (tls_tx_key(toep)) {
 		free_keyid(toep, tls->tx_key_addr);
 		tls->tx_key_addr = -1;
 	}
 }
 
 /*
  * Return the stored key address selected by 'ops': the rx key if
  * KEY_WRITE_RX is set, else the tx key if KEY_WRITE_TX is set, else -1.
  */
 static int
 get_keyid(struct tls_ofld_info *tls_ofld, unsigned int ops)
 {
 	if (ops & KEY_WRITE_RX)
 		return (tls_ofld->rx_key_addr);
 	if (ops & KEY_WRITE_TX)
 		return (tls_ofld->tx_key_addr);
 	return (-1);
 }
 
 /*
  * Maximum payload length: for fragment sizes up to 8192 this is the
  * smaller of 3*4096 and TP_TX_PG_SZ rounded down to a multiple of 1448;
  * otherwise it is FC_TP_PLEN_MAX.
  */
 static int
 get_tp_plen_max(struct tls_ofld_info *tls_ofld)
 {
 	int limit;
 
 	if (tls_ofld->k_ctx.frag_size > 8192)
 		return (FC_TP_PLEN_MAX);
 
 	limit = min(3*4096, TP_TX_PG_SZ);
 	return ((limit / 1448) * 1448);
 }
 
 /*
  * Send request to get the key-id
  *
  * Builds and sends a FW_ULPTX_WR that writes the key context described by
  * 'k_ctx' into adapter memory.  A new key slot is allocated unless
  * G_KEY_CLR_LOC() is set on k_ctx->l_p_key (re-negotiation), in which case
  * the slot already recorded for that direction is reused.  The slot is
  * recorded in tx_key_addr or rx_key_addr according to the key location.
  *
  * Returns 0 on success, EAGAIN if no tx descriptor is available, ENOSPC if
  * no key slot could be allocated, or ENOMEM if the work request could not
  * be allocated.
  */
 static int
 tls_program_key_id(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
 	struct ofld_tx_sdesc *txsd;
 	int kwrlen, kctxlen, keyid, len;
 	struct wrqe *wr;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 	bool new_keyid;
 
 	kwrlen = sizeof(*kwr);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = roundup2(kwrlen + kctxlen, 16);
 
 	if (toep->txsd_avail == 0)
 		return (EAGAIN);
 
 	/* Don't initialize key for re-neg */
 	new_keyid = !G_KEY_CLR_LOC(k_ctx->l_p_key);
 	if (new_keyid) {
 		if ((keyid = get_new_keyid(toep)) < 0) {
 			return (ENOSPC);
 		}
 	} else {
 		keyid = get_keyid(tls_ofld, k_ctx->l_p_key);
 	}
 
 	wr = alloc_wrqe(len, toep->ofld_txq);
 	if (wr == NULL) {
 		/*
 		 * Only release a slot allocated above.  A reused slot is
 		 * still referenced by rx_key_addr/tx_key_addr and freeing it
 		 * here would lead to a second free in clear_tls_keyid().
 		 */
 		if (new_keyid)
 			free_keyid(toep, keyid);
 		return (ENOMEM);
 	}
 	kwr = wrtod(wr);
 	memset(kwr, 0, kwrlen);
 
 	kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_COMPL |
 	    F_FW_WR_ATOMIC);
 	kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 	kwr->protocol = get_proto_ver(k_ctx->proto_ver);
 	kwr->mfs = htons(k_ctx->frag_size);
 	kwr->reneg_to_write_rx = k_ctx->l_p_key;
 
 	/* master command */
 	kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 	    V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1));
 	kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5));
 	kwr->len16 = htobe32((toep->tid << 8) |
 	    DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16));
 	kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5));
 
 	/* sub command */
 	kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	kwr->sc_len = htobe32(kctxlen);
 
 	/* The key context immediately follows the request header. */
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memset(kctx, 0, kctxlen);
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		tls_ofld->tx_key_addr = keyid;
 		prepare_txkey_wr(kctx, k_ctx);
 	} else if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		tls_ofld->rx_key_addr = keyid;
 		prepare_rxkey_wr(kctx, k_ctx);
 	}
 
 	/* Charge the work request against the connection's tx credits. */
 	txsd = &toep->txsd[toep->txsd_pidx];
 	txsd->tx_credits = DIV_ROUND_UP(len, 16);
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 
 	return (0);
 }
 
 /*
  * Store a key received from SSL in DDR.
  *
  * Installs the TLSOM key context 'uk_ctx' supplied by userland on the
  * offloaded connection 'toep'.  Everything from 'l_p_key' onward is
  * copied into toep->tls.k_ctx, the per-connection SCMD words are
  * rebuilt, and the key is programmed via tls_program_key_id() for RX
  * keys or when the key location is DDR.
  *
  * Returns 0 on success, ENOENT if the connection is not in
  * TCPS_ESTABLISHED, EOPNOTSUPP for an RX key on a connection whose ULP
  * mode is not ULP_MODE_TLS, or the error from tls_program_key_id().
  */
 static int
 program_key_context(struct tcpcb *tp, struct toepcb *toep,
     struct tls_key_context *uk_ctx)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_key_context *k_ctx;
 	int error, key_offset;
 
 	if (tp->t_state != TCPS_ESTABLISHED) {
 		/*
 		 * XXX: Matches Linux driver, but not sure this is a
 		 * very appropriate error.
 		 */
 		return (ENOENT);
 	}
 
 	/* Stop timer on handshake completion */
 	tls_stop_handshake_timer(toep);
 
 	toep->flags &= ~TPF_FORCE_CREDITS;
 
 	CTR4(KTR_CXGBE, "%s: tid %d %s proto_ver %#x", __func__, toep->tid,
 	    G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX ? "KEY_WRITE_RX" :
 	    "KEY_WRITE_TX", uk_ctx->proto_ver);
 
 	/* RX keys are only accepted on connections in TLS ULP mode. */
 	if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX &&
 	    ulp_mode(toep) != ULP_MODE_TLS)
 		return (EOPNOTSUPP);
 
 	/* Don't copy the 'tx' and 'rx' fields. */
 	k_ctx = &tls_ofld->k_ctx;
 	memcpy(&k_ctx->l_p_key, &uk_ctx->l_p_key,
 	    sizeof(*k_ctx) - offsetof(struct tls_key_context, l_p_key));
 
 	/* TLS version != 1.1 and !1.2 OR DTLS != 1.2 */
 	if (get_proto_ver(k_ctx->proto_ver) > DTLS_1_2_VERSION) {
 		/*
 		 * Unsupported version: mark the key as absent and report
 		 * success without programming anything.
 		 */
 		if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 			tls_ofld->rx_key_addr = -1;
 			t4_clear_rx_quiesce(toep);
 		} else {
 			tls_ofld->tx_key_addr = -1;
 		}
 		return (0);
 	}
 
 	/* Cipher-mode dependent parameters (these override the copied-in values). */
 	if (k_ctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		k_ctx->iv_size = 4;
 		k_ctx->mac_first = 0;
 		k_ctx->hmac_ctrl = 0;
 	} else {
 		k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		k_ctx->mac_first = 1;
 	}
 
 	/* Cache the SCMD words that write_tlstx_cpl() later places in each CPL. */
 	tls_ofld->scmd0.seqno_numivs =
 		(V_SCMD_SEQ_NO_CTRL(3) |
 		 V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) |
 		 V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 		 V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) |
 		 V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) |
 		 V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) |
 		 V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) |
 		 V_SCMD_IV_SIZE(k_ctx->iv_size));
 
 	tls_ofld->scmd0.ivgen_hdrlen =
 		(V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) |
 		 V_SCMD_KEY_CTX_INLINE(0) |
 		 V_SCMD_TLS_FRAG_ENABLE(1));
 
 	tls_ofld->mac_length = k_ctx->mac_secret_size;
 
 	/* Copy only the key material for the direction being programmed. */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		k_ctx->rx = uk_ctx->rx;
 		/* Don't initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->rx_key_addr = -1;
 	} else {
 		k_ctx->tx = uk_ctx->tx;
 		/* Don't initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->tx_key_addr = -1;
 	}
 
 	/* Flush pending data before new Tx key becomes active */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		struct sockbuf *sb;
 
 		/* XXX: This might not drain everything. */
 		t4_push_frames(sc, toep, 0);
 		sb = &toep->inp->inp_socket->so_snd;
 		SOCKBUF_LOCK(sb);
 
 		/* XXX: This asserts that everything has been pushed. */
 		MPASS(sb->sb_sndptr == NULL || sb->sb_sndptr->m_next == NULL);
 		sb->sb_sndptr = NULL;
 		/* TLS records start at the current end of the send buffer. */
 		tls_ofld->sb_off = sbavail(sb);
 		SOCKBUF_UNLOCK(sb);
 		tls_ofld->tx_seq_no = 0;
 	}
 
 	if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) ||
 	    (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) {
 
 		/*
 		 * XXX: The userland library sets tx_key_info_size, not
 		 * rx_key_info_size.
 		 */
 		k_ctx->rx_key_info_size = k_ctx->tx_key_info_size;
 
 		error = tls_program_key_id(toep, k_ctx);
 		if (error) {
 			/* XXX: Only clear quiesce for KEY_WRITE_RX? */
 			t4_clear_rx_quiesce(toep);
 			return (error);
 		}
 	}
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		/*
 		 * RX key tags are an index into the key portion of MA
 		 * memory stored as an offset from the base address in
 		 * units of 64 bytes.
 		 */
 		key_offset = tls_ofld->rx_key_addr - sc->vres.key.start;
 		t4_set_tls_keyid(toep, key_offset / 64);
 		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
 						V_TF_TLS_CONTROL(1) |
 						V_TF_TLS_ACTIVE(1) |
 						V_TF_TLS_ENABLE(1))));
 		/* Restart the receive sequence number at zero for the new key. */
 		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
 				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
 				 V_TCB_TLS_SEQ(0));
 		t4_clear_rx_quiesce(toep);
 	} else {
 		unsigned short pdus_per_ulp;
 
 		if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE)
 			tls_ofld->tx_key_addr = 1;
 
 		/* Precompute the sizing values later written into TX work requests. */
 		tls_ofld->fcplenmax = get_tp_plen_max(tls_ofld);
 		tls_ofld->expn_per_ulp = tls_expansion_size(toep,
 				tls_ofld->fcplenmax, 1, &pdus_per_ulp);
 		tls_ofld->pdus_per_ulp = pdus_per_ulp;
 		tls_ofld->adjusted_plen = tls_ofld->pdus_per_ulp *
 			((tls_ofld->expn_per_ulp/tls_ofld->pdus_per_ulp) +
 			 tls_ofld->k_ctx.frag_size);
 	}
 
 	return (0);
 }
 
 /*
  * Handshake timer callback.
  *
  * A client connection can occasionally stall because the NIC never
  * delivers the ServerHelloDone message to the host.  Each time the
  * timer fires it is re-armed and a dummy RX_DATA_ACK with RX_MODULATE
  * is sent to get the connection moving again.
  */
 static void
 tls_send_handshake_ack(void *arg)
 {
 	struct toepcb *toep;
 	struct adapter *sc;
 
 	toep = arg;
 	sc = td_adapter(toep->td);
 
 	/*
 	 * XXX: Does not have the t4_get_tcb() checks to refine the
 	 * workaround.
 	 */
 	callout_schedule(&toep->tls.handshake_timer,
 	    TLS_SRV_HELLO_RD_TM * hz);
 
 	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
 	send_rx_modulate(sc, toep);
 }
 
 /*
  * Arm the handshake timer so that tls_send_handshake_ack() runs after
  * the initial back-off interval.  The callout is manipulated under
  * tls_handshake_lock.
  */
 static void
 tls_start_handshake_timer(struct toepcb *toep)
 {
 
 	mtx_lock(&tls_handshake_lock);
 	callout_reset(&toep->tls.handshake_timer, TLS_SRV_HELLO_BKOFF_TM * hz,
 	    tls_send_handshake_ack, toep);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 /*
  * Cancel the handshake timer of 'toep'.  The callout is manipulated
  * under tls_handshake_lock.
  */
 void
 tls_stop_handshake_timer(struct toepcb *toep)
 {
 
 	mtx_lock(&tls_handshake_lock);
 	callout_stop(&toep->tls.handshake_timer);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 /*
  * Socket option handler for the TLSOM socket options.
  *
  * SOPT_SET supports TCP_TLSOM_SET_TLS_CONTEXT, TCP_TLSOM_CLR_TLS_TOM and
  * TCP_TLSOM_CLR_QUIES; SOPT_GET supports TCP_TLSOM_GET_TLS_TOM.  The
  * SET options fail with EINVAL once the connection is in KTLS mode.
  * Unknown option names yield EOPNOTSUPP, and a dropped or TIME_WAIT
  * connection yields ECONNRESET.
  */
 int
 t4_ctloutput_tls(struct socket *so, struct sockopt *sopt)
 {
 	struct tls_key_context uk_ctx;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	int error, optval;
 
 	/* Fetch the key context from userland before taking the inpcb lock. */
 	if (sopt->sopt_dir == SOPT_SET &&
 	    sopt->sopt_name == TCP_TLSOM_SET_TLS_CONTEXT) {
 		error = sooptcopyin(sopt, &uk_ctx, sizeof(uk_ctx),
 		    sizeof(uk_ctx));
 		if (error)
 			return (error);
 	}
 	error = 0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	toep = tp->t_toe;
 
 	if (sopt->sopt_dir == SOPT_SET) {
 		/* Every SET option finishes its work under the inpcb lock. */
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_SET_TLS_CONTEXT:
 			if (toep->tls.mode == TLS_MODE_KTLS) {
 				error = EINVAL;
 				break;
 			}
 			error = program_key_context(tp, toep, &uk_ctx);
 			if (error == 0)
 				toep->tls.mode = TLS_MODE_TLSOM;
 			break;
 		case TCP_TLSOM_CLR_TLS_TOM:
 			if (toep->tls.mode == TLS_MODE_KTLS) {
 				error = EINVAL;
 			} else if (ulp_mode(toep) != ULP_MODE_TLS) {
 				error = EOPNOTSUPP;
 			} else {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM",
 				    __func__, toep->tid);
 				tls_clr_ofld_mode(toep);
 			}
 			break;
 		case TCP_TLSOM_CLR_QUIES:
 			if (toep->tls.mode == TLS_MODE_KTLS) {
 				error = EINVAL;
 			} else if (ulp_mode(toep) != ULP_MODE_TLS) {
 				error = EOPNOTSUPP;
 			} else {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES",
 				    __func__, toep->tid);
 				tls_clr_quiesce(toep);
 			}
 			break;
 		default:
 			error = EOPNOTSUPP;
 			break;
 		}
 		INP_WUNLOCK(inp);
 	} else if (sopt->sopt_dir == SOPT_GET) {
 		if (sopt->sopt_name != TCP_TLSOM_GET_TLS_TOM) {
 			INP_WUNLOCK(inp);
 			return (EOPNOTSUPP);
 		}
 
 		/*
 		 * TLS TX is permitted on any TOE socket, but
 		 * TLS RX requires a TLS ULP mode.
 		 */
 		optval = TLS_TOM_NONE;
 		if (can_tls_offload(td_adapter(toep->td)) &&
 		    toep->tls.mode != TLS_MODE_KTLS) {
 			switch (ulp_mode(toep)) {
 			case ULP_MODE_NONE:
 			case ULP_MODE_TCPDDP:
 				optval = TLS_TOM_TXONLY;
 				break;
 			case ULP_MODE_TLS:
 				optval = TLS_TOM_BOTH;
 				break;
 			}
 		}
 		CTR3(KTR_CXGBE, "%s: tid %d GET_TLS_TOM = %d",
 		    __func__, toep->tid, optval);
 
 		/* Drop the lock before copying the result out to userland. */
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(optval));
 	}
 	return (error);
 }
 
 #ifdef KERN_TLS
 static void
 init_ktls_key_context(struct ktls_session *tls, struct tls_key_context *k_ctx,
     int direction)
 {
 	struct auth_hash *axf;
 	u_int key_info_size, mac_key_size;
 	char *hash, *key;
 
 	k_ctx->l_p_key = V_KEY_GET_LOC(direction == KTLS_TX ? KEY_WRITE_TX :
 	    KEY_WRITE_RX);
 	k_ctx->proto_ver = tls->params.tls_vmajor << 8 | tls->params.tls_vminor;
 	k_ctx->cipher_secret_size = tls->params.cipher_key_len;
 	key_info_size = sizeof(struct tx_keyctx_hdr) +
 	    k_ctx->cipher_secret_size;
 	if (direction == KTLS_TX)
 		key = k_ctx->tx.key;
 	else
 		key = k_ctx->rx.key;
 	memcpy(key, tls->params.cipher_key, tls->params.cipher_key_len);
 	hash = key + tls->params.cipher_key_len;
 	if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) {
 		k_ctx->state.auth_mode = SCMD_AUTH_MODE_GHASH;
 		k_ctx->state.enc_mode = SCMD_CIPH_MODE_AES_GCM;
 		k_ctx->iv_size = 4;
 		k_ctx->mac_first = 0;
 		k_ctx->hmac_ctrl = SCMD_HMAC_CTRL_NOP;
 		key_info_size += GMAC_BLOCK_LEN;
 		k_ctx->mac_secret_size = 0;
 		if (direction == KTLS_TX)
 			memcpy(k_ctx->tx.salt, tls->params.iv, SALT_SIZE);
 		else
 			memcpy(k_ctx->rx.salt, tls->params.iv, SALT_SIZE);
 		t4_init_gmac_hash(tls->params.cipher_key,
 		    tls->params.cipher_key_len, hash);
 	} else {
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			axf = &auth_hash_hmac_sha1;
 			mac_key_size = SHA1_HASH_LEN;
 			k_ctx->state.auth_mode = SCMD_AUTH_MODE_SHA1;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			axf = &auth_hash_hmac_sha2_256;
 			mac_key_size = SHA2_256_HASH_LEN;
 			k_ctx->state.auth_mode = SCMD_AUTH_MODE_SHA256;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			axf = &auth_hash_hmac_sha2_384;
 			mac_key_size = SHA2_512_HASH_LEN;
 			k_ctx->state.auth_mode = SCMD_AUTH_MODE_SHA512_384;
 			break;
 		default:
 			panic("bad auth mode");
 		}
 		k_ctx->state.enc_mode = SCMD_CIPH_MODE_AES_CBC;
 		k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		k_ctx->mac_first = 1;
 		k_ctx->hmac_ctrl = SCMD_HMAC_CTRL_NO_TRUNC;
 		key_info_size += roundup2(mac_key_size, 16) * 2;
 		k_ctx->mac_secret_size = mac_key_size;
 		t4_init_hmac_digest(axf, mac_key_size, tls->params.auth_key,
 		    tls->params.auth_key_len, hash);
 	}
 
 	if (direction == KTLS_TX)
 		k_ctx->tx_key_info_size = key_info_size;
 	else
 		k_ctx->rx_key_info_size = key_info_size;
 	k_ctx->frag_size = tls->params.max_frame_len;
 	k_ctx->iv_ctrl = 1;
 }
 
 int
 tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct tls_key_context *k_ctx;
 	int error, key_offset;
 
 	if (toep->tls.mode == TLS_MODE_TLSOM)
 		return (EINVAL);
 	if (!can_tls_offload(td_adapter(toep->td)))
 		return (EINVAL);
 	switch (ulp_mode(toep)) {
 	case ULP_MODE_TLS:
 		break;
 	case ULP_MODE_NONE:
 	case ULP_MODE_TCPDDP:
 		if (direction != KTLS_TX)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
 			return (EPROTONOSUPPORT);
 		}
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		if (tls->params.iv_len != SALT_SIZE)
 			return (EINVAL);
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO)
 		return (EPROTONOSUPPORT);
 
 	/* Bail if we already have a key. */
 	if (direction == KTLS_TX) {
 		if (toep->tls.tx_key_addr != -1)
 			return (EOPNOTSUPP);
 	} else {
 		if (toep->tls.rx_key_addr != -1)
 			return (EOPNOTSUPP);
 	}
 
 	/*
 	 * XXX: This assumes no key renegotation.  If KTLS ever supports
 	 * that we will want to allocate TLS sessions dynamically rather
 	 * than as a static member of toep.
 	 */
 	k_ctx = &toep->tls.k_ctx;
 	init_ktls_key_context(tls, k_ctx, direction);
 
 	error = tls_program_key_id(toep, k_ctx);
 	if (error)
 		return (error);
 
 	if (direction == KTLS_TX) {
 		toep->tls.scmd0.seqno_numivs =
 			(V_SCMD_SEQ_NO_CTRL(3) |
 			 V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) |
 			 V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 			 V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) |
 			 V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) |
 			 V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) |
 			 V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) |
 			 V_SCMD_IV_SIZE(k_ctx->iv_size));
 
 		toep->tls.scmd0.ivgen_hdrlen =
 			(V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) |
 			 V_SCMD_KEY_CTX_INLINE(0) |
 			 V_SCMD_TLS_FRAG_ENABLE(1));
 
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16)
 			toep->tls.iv_len = 8;
 		else
 			toep->tls.iv_len = AES_BLOCK_LEN;
 
 		toep->tls.mac_length = k_ctx->mac_secret_size;
 
 		toep->tls.fcplenmax = get_tp_plen_max(&toep->tls);
 		toep->tls.expn_per_ulp = tls->params.tls_hlen +
 		    tls->params.tls_tlen;
 		toep->tls.pdus_per_ulp = 1;
 		toep->tls.adjusted_plen = toep->tls.expn_per_ulp +
 		    toep->tls.k_ctx.frag_size;
 	} else {
 		/* Stop timer on handshake completion */
 		tls_stop_handshake_timer(toep);
 
 		toep->flags &= ~TPF_FORCE_CREDITS;
 
 		/*
 		 * RX key tags are an index into the key portion of MA
 		 * memory stored as an offset from the base address in
 		 * units of 64 bytes.
 		 */
 		key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
 		t4_set_tls_keyid(toep, key_offset / 64);
 		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
 						V_TF_TLS_CONTROL(1) |
 						V_TF_TLS_ACTIVE(1) |
 						V_TF_TLS_ENABLE(1))));
 		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
 				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
 				 V_TCB_TLS_SEQ(0));
 		t4_clear_rx_quiesce(toep);
 	}
 
 	toep->tls.mode = TLS_MODE_KTLS;
 
 	return (0);
 }
 #endif
 
 /*
  * Reset the TLS offload state of a toepcb: TLS off, keys located in
  * DDR, and no TX or RX key assigned.  The handshake callout is only
  * initialized for connections in ULP_MODE_TLS.
  */
 void
 tls_init_toep(struct toepcb *toep)
 {
 
 	toep->tls.mode = TLS_MODE_OFF;
 	toep->tls.key_location = TLS_SFO_WR_CONTEXTLOC_DDR;
 	toep->tls.rx_key_addr = -1;
 	toep->tls.tx_key_addr = -1;
 
 	if (ulp_mode(toep) == ULP_MODE_TLS) {
 		callout_init_mtx(&toep->tls.handshake_timer,
 		    &tls_handshake_lock, 0);
 	}
 }
 
 /*
  * Prepare a connection for a TLS handshake: set TLS_ENABLE in the TCB,
  * set TPF_FORCE_CREDITS on the toepcb, and start the handshake timer.
  */
 void
 tls_establish(struct toepcb *toep)
 {
 
 	/*
 	 * Enable PDU extraction.
 	 *
 	 * XXX: Supposedly this should be done by the firmware when
 	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
 	 * in practice this seems to be required.
 	 */
 	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 
 	/* Cleared again once a key is programmed for the connection. */
 	toep->flags |= TPF_FORCE_CREDITS;
 
 	tls_start_handshake_timer(toep);
 }
 
 void
 tls_uninit_toep(struct toepcb *toep)
 {
 
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		tls_stop_handshake_timer(toep);
 	clear_tls_keyid(toep);
 }
 
 /* Largest work request, expressed in 16-byte credits. */
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 /*
  * Credits needed for the smallest TLS TX work request: the WR header,
  * the TLS CPL, the key, and CIPHER_BLOCK_SIZE + 1 further bytes.
  */
 #define	MIN_OFLD_TLSTX_CREDITS(toep)					\
 	(howmany(sizeof(struct fw_tlstx_data_wr) +			\
 	    sizeof(struct cpl_tx_tls_sfo) + key_size((toep)) +		\
 	    CIPHER_BLOCK_SIZE + 1, 16))
 
 /*
  * Return the number of bytes available for a work request carrying
  * immediate data given 'tx_credits' 16-byte credits.  The result is
  * capped at two egress queue descriptors.
  */
 static inline u_int
 max_imm_tls_space(int tx_credits)
 {
 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	/* Fewer credits than the cap: every credit is worth 16 bytes. */
 	if (tx_credits < (n * EQ_ESIZE) / 16)
 		return (tx_credits * 16);
 	return (n * EQ_ESIZE);
 }
 
 /*
  * Count the scatter/gather segments needed to describe 'len' bytes of
  * the mbuf chain 'm' starting 'skip' bytes into the chain.  The largest
  * per-mbuf segment count is stored in '*max_nsegs_1mbufp'.
  */
 static int
 count_mbuf_segs(struct mbuf *m, int skip, int len, int *max_nsegs_1mbufp)
 {
 	int chunk, largest, n, total;
 
 	/* Find the mbuf that contains the first byte of interest. */
 	for (; skip >= m->m_len; m = m->m_next)
 		skip -= m->m_len;
 
 	total = 0;
 	largest = 0;
 	for (; len > 0; m = m->m_next) {
 		chunk = m->m_len - skip;
 		n = sglist_count(mtod(m, char *) + skip, chunk);
 		if (largest < n)
 			largest = n;
 		total += n;
 		len -= chunk;
 
 		/* Only the first mbuf is entered part-way through. */
 		skip = 0;
 	}
 	*max_nsegs_1mbufp = largest;
 	return (total);
 }
 
 /*
  * Fill in the FW_TLSTX_DATA_WR header 'txwr' for one TLS transmit work
  * request.  'plen' is the payload length and 'expn' the expansion
  * added to it; their sum is written to the 'plen' field.  'pdus' is
  * written to NUMIVS, and IVDSGL is set when 'imm_ivs' is zero.
  */
 static void
 write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
     unsigned int immdlen, unsigned int plen, unsigned int expn,
     unsigned int pdus, uint8_t credits, int shove, int imm_ivs)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_key_context *k_ctx = &tls_ofld->k_ctx;
 	uint32_t word;
 
 	word = V_WR_OP(FW_TLSTX_DATA_WR) | V_FW_TLSTX_DATA_WR_COMPL(1) |
 	    V_FW_TLSTX_DATA_WR_IMMDLEN(immdlen);
 	txwr->op_to_immdlen = htobe32(word);
 
 	word = V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) |
 	    V_FW_TLSTX_DATA_WR_LEN16(credits);
 	txwr->flowid_len16 = htobe32(word);
 
 	txwr->plen = htobe32(plen + expn);
 
 	word = V_TX_ULP_MODE(ULP_MODE_TLS) | V_TX_URG(0) |
 	    /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove);
 	txwr->lsodisable_to_flags = htobe32(word);
 
 	word = V_FW_TLSTX_DATA_WR_NUMIVS(pdus) |
 	    V_FW_TLSTX_DATA_WR_EXP(expn) |
 	    V_FW_TLSTX_DATA_WR_CTXLOC(tls_ofld->key_location) |
 	    V_FW_TLSTX_DATA_WR_IVDSGL(!imm_ivs) |
 	    V_FW_TLSTX_DATA_WR_KEYSIZE(k_ctx->tx_key_info_size >> 4);
 	txwr->ctxloc_to_exp = htobe32(word);
 
 	/* Sizing values cached when the TX key was installed. */
 	txwr->mfs = htobe16(k_ctx->frag_size);
 	txwr->adjustedplen_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen));
 	txwr->expinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp));
 	txwr->pdusinplenmax_pkd =
 	    V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp);
 }
 
 /*
  * Fill in the CPL_TX_TLS_SFO message 'cpl' for a TLS record with
  * header 'tls_hdr' and 'plen' bytes of payload, then advance the
  * connection's transmit sequence number by 'pdus'.
  */
 static void
 write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
     struct tls_hdr *tls_hdr, unsigned int plen, unsigned int pdus)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	int data_type, seglen;
 
 	/* The segment length is capped at the fragment size. */
 	seglen = (plen >= tls_ofld->k_ctx.frag_size) ?
 	    tls_ofld->k_ctx.frag_size : plen;
 
 	data_type = tls_content_type(tls_hdr->type);
 	cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) |
 	    V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) |
 	    V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen));
 	cpl->pld_len = htobe32(plen);
 
 	/* Only heartbeat records carry the original record type. */
 	if (data_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT) {
 		cpl->type_protover = htobe32(
 		    V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type));
 	}
 
 	cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs |
 	    V_SCMD_NUM_IVS(pdus));
 	cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen);
 
 	/* The CPL carries the sequence number of its first PDU. */
 	cpl->scmd1 = htobe64(tls_ofld->tx_seq_no);
 	tls_ofld->tx_seq_no += pdus;
 }
 
 /*
  * Similar to write_tx_sgl() except that it accepts an optional
  * trailer buffer for IVs.
  *
  * Writes a ULPTX scatter/gather list at 'dst' describing 'plen' bytes
  * of the mbuf chain 'start' beginning 'skip' bytes into the chain,
  * followed by 'iv_len' bytes at 'iv_buffer' when 'iv_buffer' is not
  * NULL.  'nsegs' is the total number of segments (as computed by
  * count_mbuf_segs() plus the IV buffer) and 'n' is the largest number
  * of segments produced by any single buffer.
  */
 static void
 write_tlstx_sgl(void *dst, struct mbuf *start, int skip, int plen,
     void *iv_buffer, int iv_len, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/* Advance to the mbuf that holds the first payload byte. */
 	for (m = start; skip >= m->m_len; m = m->m_next)
 		skip -= m->m_len;
 
 	/*
 	 * Continue from the mbuf found above.  Restarting at 'start'
 	 * would apply the reduced 'skip' to the wrong mbuf whenever the
 	 * payload begins in a later mbuf, and would disagree with the
 	 * segment count computed by count_mbuf_segs().
 	 *
 	 * 'i' indexes the sge[] pairs; -1 selects the len0/addr0 slot.
 	 */
 	i = -1;
 	for (; plen > 0; m = m->m_next) {
 		rc = sglist_append(&sg, mtod(m, char *) + skip,
 		    m->m_len - skip);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 		plen -= m->m_len - skip;
 		skip = 0;
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (iv_buffer != NULL) {
 		rc = sglist_append(&sg, iv_buffer, iv_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 	}
 	/* Zero the unused second length of a half-filled final pair. */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, iv_buffer %p",
 	    __func__, nsegs, start, iv_buffer));
 }
 
 /*
  * Similar to t4_push_frames() but handles TLS sockets when TLS offload
  * is enabled.  Rather than transmitting bulk data, the socket buffer
  * contains TLS records.  The work request requires a full TLS record,
  * so batch mbufs up until a full TLS record is seen.  This requires
  * reading the TLS header out of the start of each record to determine
  * its length.
  *
  * 'drop' is the number of bytes to remove from the front of the send
  * buffer before looking for new records.  The caller must hold the
  * inpcb write lock.  The function returns when it runs out of credits
  * or complete records, after sending a FIN, or when an allocation
  * fails.
  */
 void
 t4_push_tls_records(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct tls_hdr thdr;
 	struct mbuf *sndptr;
 	struct fw_tlstx_data_wr *txwr;
 	struct cpl_tx_tls_sfo *cpl;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, space, max_nsegs_1mbuf, wr_len;
 	u_int expn_size, iv_len, pdus, sndptroff;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tls_size, tx_credits, shove, /* compl,*/ sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool imm_ivs, imm_payload;
 	void *iv_buffer, *iv_dst, *buf;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
 
 #ifdef VERBOSE_TRACES
 	/* Five arguments follow the format, so this must be CTR5. */
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		space = max_imm_tls_space(tx_credits);
 		wr_len = sizeof(struct fw_tlstx_data_wr) +
 		    sizeof(struct cpl_tx_tls_sfo) + key_size(toep);
 		if (wr_len + CIPHER_BLOCK_SIZE + 1 > space) {
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 			    "%s: tid %d tx_credits %d min_wr %d space %d",
 			    __func__, toep->tid, tx_credits, wr_len +
 			    CIPHER_BLOCK_SIZE + 1, space);
 #endif
 			return;
 		}
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			MPASS(tls_ofld->sb_off >= drop);
 			tls_ofld->sb_off -= drop;
 			drop = 0;
 		}
 
 		/*
 		 * Send a FIN if requested, but only if there's no
 		 * more data to send.
 		 */
 		if (sbavail(sb) == tls_ofld->sb_off &&
 		    toep->flags & TPF_SEND_FIN) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			t4_close_conn(sc, toep);
 			return;
 		}
 
 		if (sbavail(sb) < tls_ofld->sb_off + TLS_HEADER_LENGTH) {
 			/*
 			 * A full TLS header is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.  However, if the connection
 			 * has been closed, we will never get the rest
 			 * of the header so just discard the partial
 			 * header and close the connection.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d%s",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    toep->flags & TPF_SEND_FIN ? " SEND_FIN" : "");
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			if (toep->flags & TPF_SEND_FIN)
 				t4_close_conn(sc, toep);
 			return;
 		}
 
 		/* Read the header of the next TLS record. */
 		sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff);
 		m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr);
 		/* The record length is in network byte order. */
 		tls_size = ntohs(thdr.length);
 		plen = TLS_HEADER_LENGTH + tls_size;
 		pdus = howmany(tls_size, tls_ofld->k_ctx.frag_size);
 		iv_len = pdus * CIPHER_BLOCK_SIZE;
 
 		if (sbavail(sb) < tls_ofld->sb_off + plen) {
 			/*
 			 * The full TLS record is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.  However, if the connection
 			 * has been closed, we will never get the rest
 			 * of the record so just discard the partial
 			 * record and close the connection.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR6(KTR_CXGBE,
 			    "%s: tid %d sbavail %d sb_off %d plen %d%s",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    plen, toep->flags & TPF_SEND_FIN ? " SEND_FIN" :
 			    "");
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			if (toep->flags & TPF_SEND_FIN)
 				t4_close_conn(sc, toep);
 			return;
 		}
 
 		/* Shove if there is no additional data pending. */
 		shove = (sbavail(sb) == tls_ofld->sb_off + plen) &&
 		    !(tp->t_flags & TF_MORETOCOME);
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 
 		/* The socket buffer lock is released here on every path. */
 		if (sowwakeup)
 			sowwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/* Determine whether to use immediate vs SGL. */
 		imm_payload = false;
 		imm_ivs = false;
 		if (wr_len + iv_len <= space) {
 			imm_ivs = true;
 			wr_len += iv_len;
 			if (wr_len + tls_size <= space) {
 				wr_len += tls_size;
 				imm_payload = true;
 			}
 		}
 
 		/* Allocate space for IVs if needed. */
 		if (!imm_ivs) {
 			iv_buffer = malloc(iv_len, M_CXGBE, M_NOWAIT);
 			if (iv_buffer == NULL) {
 				/*
 				 * XXX: How to restart this?
 				 *
 				 * The socket buffer lock was already
 				 * dropped above, so it must not be
 				 * unlocked again here.
 				 */
 				CTR3(KTR_CXGBE,
 			    "%s: tid %d failed to alloc IV space len %d",
 				    __func__, toep->tid, iv_len);
 				return;
 			}
 		} else
 			iv_buffer = NULL;
 
 		/* Determine size of SGL. */
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		if (!imm_payload) {
 			nsegs = count_mbuf_segs(sndptr, sndptroff +
 			    TLS_HEADER_LENGTH, tls_size, &max_nsegs_1mbuf);
 			if (!imm_ivs) {
 				int n = sglist_count(iv_buffer, iv_len);
 				nsegs += n;
 				if (n > max_nsegs_1mbuf)
 					max_nsegs_1mbuf = n;
 			}
 
 			/* Account for SGL in work request length. */
 			wr_len += sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 		}
 
 		wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 		if (wr == NULL) {
 			/* XXX: how will we recover from this? */
 
 			/*
 			 * The IV buffer has not been handed to a tx
 			 * descriptor yet, so release it here.  free(9)
 			 * accepts NULL.
 			 */
 			free(iv_buffer, M_CXGBE);
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d TLS record %d len %#x pdus %d",
 		    __func__, toep->tid, thdr.type, tls_size, pdus);
 #endif
 		txwr = wrtod(wr);
 		cpl = (struct cpl_tx_tls_sfo *)(txwr + 1);
 		memset(txwr, 0, roundup2(wr_len, 16));
 		credits = howmany(wr_len, 16);
 		expn_size = tls_expansion_size(toep, tls_size, 0, NULL);
 		write_tlstx_wr(txwr, toep, imm_payload ? tls_size : 0,
 		    tls_size, expn_size, pdus, credits, shove, imm_ivs ? 1 : 0);
 		write_tlstx_cpl(cpl, toep, &thdr, tls_size, pdus);
 		tls_copy_tx_key(toep, cpl + 1);
 
 		/* Generate random IVs */
 		buf = (char *)(cpl + 1) + key_size(toep);
 		if (imm_ivs) {
 			MPASS(iv_buffer == NULL);
 			iv_dst = buf;
 			buf = (char *)iv_dst + iv_len;
 		} else
 			iv_dst = iv_buffer;
 		arc4rand(iv_dst, iv_len, 0);
 
 		/* The payload follows the key (and any immediate IVs). */
 		if (imm_payload) {
 			m_copydata(sndptr, sndptroff + TLS_HEADER_LENGTH,
 			    tls_size, buf);
 		} else {
 			write_tlstx_sgl(buf, sndptr,
 			    sndptroff + TLS_HEADER_LENGTH, tls_size, iv_buffer,
 			    iv_len, nsegs, max_nsegs_1mbuf);
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		sbsndptr_adv(sb, sb->sb_sndptr, plen);
 		tls_ofld->sb_off += plen;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep))
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Record the request in the next tx descriptor slot. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd->iv_buffer = iv_buffer;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		atomic_add_long(&toep->vi->pi->tx_toe_tls_records, 1);
 		atomic_add_long(&toep->vi->pi->tx_toe_tls_octets, plen);
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 }
 
 #ifdef KERN_TLS
 /*
  * Count the scatter/gather segments needed for the pages of the
  * unmapped mbuf 'm'.  A page whose physical address immediately
  * follows the previous page (previous address + PAGE_SIZE) extends the
  * current segment; any other page starts a new one.
  */
 static int
 count_ext_pgs_segs(struct mbuf *m)
 {
 	vm_paddr_t nextpa;
 	u_int i, nsegs;
 
-	MPASS(m->m_ext_pgs.npgs > 0);
+	MPASS(m->m_epg_npgs > 0);
 	nsegs = 1;
 	/* Address the next page must have to continue the current segment. */
 	nextpa = m->m_epg_pa[0] + PAGE_SIZE;
-	for (i = 1; i < m->m_ext_pgs.npgs; i++) {
+	for (i = 1; i < m->m_epg_npgs; i++) {
 		if (nextpa != m->m_epg_pa[i])
 			nsegs++;
 		nextpa = m->m_epg_pa[i] + PAGE_SIZE;
 	}
 	return (nsegs);
 }
 
 /*
  * Write a ULPTX scatter/gather list at 'dst' describing the pages of
  * the unmapped mbuf 'm'.  Physically contiguous pages are merged into
  * a single entry.  'nsegs' is the number of entries to emit and is
  * checked against the number actually written under INVARIANTS.
  */
 static void
 write_ktlstx_sgl(void *dst, struct mbuf *m, int nsegs)
 {
 	struct ulptx_sgl *usgl = dst;
 	vm_paddr_t pa;
 	uint32_t len;
 	int i, j;
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/* Figure out the first S/G length. */
-	pa = m->m_epg_pa[0] + m->m_ext_pgs.first_pg_off;
+	pa = m->m_epg_pa[0] + m->m_epg_1st_off;
 	usgl->addr0 = htobe64(pa);
-	len = m_epg_pagelen(m, 0, m->m_ext_pgs.first_pg_off);
+	len = m_epg_pagelen(m, 0, m->m_epg_1st_off);
 	pa += len;
 	/* Fold following pages into the first entry while they are contiguous. */
-	for (i = 1; i < m->m_ext_pgs.npgs; i++) {
+	for (i = 1; i < m->m_epg_npgs; i++) {
 		if (m->m_epg_pa[i] != pa)
 			break;
 		len += m_epg_pagelen(m, i, 0);
 		pa += m_epg_pagelen(m, i, 0);
 	}
 	usgl->len0 = htobe32(len);
 #ifdef INVARIANTS
 	nsegs--;
 #endif
 
 	/* 'j' indexes the sge[] pairs; -1 means none has been started yet. */
 	j = -1;
-	for (; i < m->m_ext_pgs.npgs; i++) {
+	for (; i < m->m_epg_npgs; i++) {
 		if (j == -1 || m->m_epg_pa[i] != pa) {
 			/* Close out the previous entry before starting a new one. */
 			if (j >= 0)
 				usgl->sge[j / 2].len[j & 1] = htobe32(len);
 			j++;
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 			pa = m->m_epg_pa[i];
 			usgl->sge[j / 2].addr[j & 1] = htobe64(pa);
 			len = m_epg_pagelen(m, i, 0);
 			pa += len;
 		} else {
 			len += m_epg_pagelen(m, i, 0);
 			pa += m_epg_pagelen(m, i, 0);
 		}
 	}
 	if (j >= 0) {
 		/* Store the length of the final entry. */
 		usgl->sge[j / 2].len[j & 1] = htobe32(len);
 
 		/* Zero the unused second length of a half-filled final pair. */
 		if ((j & 1) == 0)
 			usgl->sge[j / 2].len[1] = htobe32(0);
 	}
 	KASSERT(nsegs == 0, ("%s: nsegs %d, m %p", __func__, nsegs, m));
 }
 
 /*
  * Similar to t4_push_frames() but handles sockets that contain TLS
  * record mbufs.  Unlike TLSOM, each mbuf is a complete TLS record and
  * corresponds to a single work request.
  */
 void
 t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct tls_hdr *thdr;
 	struct fw_tlstx_data_wr *txwr;
 	struct cpl_tx_tls_sfo *cpl;
 	struct wrqe *wr;
 	struct mbuf *m;
 	u_int nsegs, credits, wr_len;
 	u_int expn_size;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tls_size, tx_credits, shove, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	char *buf;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
 
 #ifdef VERBOSE_TRACES
-	CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
-	    __func__, toep->tid, toep->flags, tp->t_flags);
+	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
+	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 
 		m = sb->sb_sndptr != NULL ? sb->sb_sndptr->m_next : sb->sb_mb;
 
 		/*
 		 * Send a FIN if requested, but only if there's no
 		 * more data to send.
 		 */
 		if (m == NULL && toep->flags & TPF_SEND_FIN) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			t4_close_conn(sc, toep);
 			return;
 		}
 
 		/*
 		 * If there is no ready data to send, wait until more
 		 * data arrives.
 		 */
 		if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 #ifdef VERBOSE_TRACES
 			CTR2(KTR_CXGBE, "%s: tid %d no ready data to send",
 			    __func__, toep->tid);
 #endif
 			return;
 		}
 
 		KASSERT(m->m_flags & M_NOMAP, ("%s: mbuf %p is not NOMAP",
 		    __func__, m));
-		KASSERT(m->m_ext_pgs.tls != NULL,
+		KASSERT(m->m_epg_tls != NULL,
 		    ("%s: mbuf %p doesn't have TLS session", __func__, m));
 
 		/* Calculate WR length. */
 		wr_len = sizeof(struct fw_tlstx_data_wr) +
 		    sizeof(struct cpl_tx_tls_sfo) + key_size(toep);
 
 		/* Explicit IVs for AES-CBC and AES-GCM are <= 16. */
 		MPASS(toep->tls.iv_len <= AES_BLOCK_LEN);
 		wr_len += AES_BLOCK_LEN;
 
 		/* Account for SGL in work request length. */
 		nsegs = count_ext_pgs_segs(m);
 		wr_len += sizeof(struct ulptx_sgl) +
 		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 
 		/* Not enough credits for this work request. */
 		if (howmany(wr_len, 16) > tx_credits) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 	    "%s: tid %d mbuf %p requires %d credits, but only %d available",
 			    __func__, toep->tid, m, howmany(wr_len, 16),
 			    tx_credits);
 #endif
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 	
 		/* Shove if there is no additional data pending. */
 		shove = ((m->m_next == NULL ||
 		    (m->m_next->m_flags & M_NOTAVAIL) != 0)) &&
 		    (tp->t_flags & TF_MORETOCOME) == 0;
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup)
 			sowwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 		if (wr == NULL) {
 			/* XXX: how will we recover from this? */
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 		thdr = (struct tls_hdr *)&m->m_epg_hdr;
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d TLS record %ju type %d len %#x",
-		    __func__, toep->tid, m->m_ext_pgs.seqno, thdr->type,
+		    __func__, toep->tid, m->m_epg_seqno, thdr->type,
 		    m->m_len);
 #endif
 		txwr = wrtod(wr);
 		cpl = (struct cpl_tx_tls_sfo *)(txwr + 1);
 		memset(txwr, 0, roundup2(wr_len, 16));
 		credits = howmany(wr_len, 16);
-		expn_size = m->m_ext_pgs.hdr_len +
-		    m->m_ext_pgs.trail_len;
+		expn_size = m->m_epg_hdrlen +
+		    m->m_epg_trllen;
 		tls_size = m->m_len - expn_size;
 		write_tlstx_wr(txwr, toep, 0,
 		    tls_size, expn_size, 1, credits, shove, 1);
-		toep->tls.tx_seq_no = m->m_ext_pgs.seqno;
+		toep->tls.tx_seq_no = m->m_epg_seqno;
 		write_tlstx_cpl(cpl, toep, thdr, tls_size, 1);
 		tls_copy_tx_key(toep, cpl + 1);
 
 		/* Copy IV. */
 		buf = (char *)(cpl + 1) + key_size(toep);
 		memcpy(buf, thdr + 1, toep->tls.iv_len);
 		buf += AES_BLOCK_LEN;
 
 		write_ktlstx_sgl(buf, m, nsegs);
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 
 		tp->snd_nxt += m->m_len;
 		tp->snd_max += m->m_len;
 
 		SOCKBUF_LOCK(sb);
 		sb->sb_sndptr = m;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep))
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = m->m_len;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		atomic_add_long(&toep->vi->pi->tx_toe_tls_records, 1);
 		atomic_add_long(&toep->vi->pi->tx_toe_tls_octets, m->m_len);
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 }
 #endif
 
 /*
- * For TLS data we place received mbufs received via CPL_TLS_DATA into
+ * For TLS data we place mbufs received via CPL_TLS_DATA into
  * an mbufq in the TLS offload state.  When CPL_RX_TLS_CMP is
  * received, the completed PDUs are placed into the socket receive
  * buffer.
  *
  * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs.
  */
 static int
 do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_tls_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	int len;
 
 	/* XXX: Should this match do_rx_data instead? */
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_toe_tls_octets, len);
 
 	KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* Save TCP sequence number. */
 	m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq);
 
 	if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) {
 #ifdef INVARIANTS
 		panic("Failed to queue TLS data packet");
 #else
 		printf("%s: Failed to queue TLS data packet\n", __func__);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 #endif
 	}
 
 	tp = intotcpcb(inp);
 	tp->t_rcvtime = ticks;
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
 	    be32toh(cpl->seq));
 #endif
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *);
 	struct tlsrx_hdr_pkt *tls_hdr_pkt;
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
 #ifdef KERN_TLS
 	struct tls_get_record *tgr;
 	struct mbuf *control;
 #endif
 	int len, pdu_length, rx_credits;
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_toe_tls_records, 1);
 
 	KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length));
 
 	so = inp_inpcbtosocket(inp);
 	tp = intotcpcb(inp);
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u",
 	    __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt);
 #endif
 
 	tp->rcv_nxt += pdu_length;
 	if (tp->rcv_wnd < pdu_length) {
 		toep->tls.rcv_over += pdu_length - tp->rcv_wnd;
 		tp->rcv_wnd = 0;
 	} else
 		tp->rcv_wnd -= pdu_length;
 
 	/* XXX: Not sure what to do about urgent data. */
 
 	/*
 	 * The payload of this CPL is the TLS header followed by
 	 * additional fields.
 	 */
 	KASSERT(m->m_len >= sizeof(*tls_hdr_pkt),
 	    ("%s: payload too small", __func__));
 	tls_hdr_pkt = mtod(m, void *);
 
 	tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq);
 	if (tls_data != NULL) {
 		KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq,
 		    ("%s: sequence mismatch", __func__));
 	}
 
 #ifdef KERN_TLS
 	if (toep->tls.mode == TLS_MODE_KTLS) {
 		/* Report decryption errors as EBADMSG. */
 		if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) !=
 		    0) {
 			m_freem(m);
 			m_freem(tls_data);
 
 			CURVNET_SET(toep->vnet);
 			so->so_error = EBADMSG;
 			sorwakeup(so);
 
 			INP_WUNLOCK(inp);
 			CURVNET_RESTORE();
 
 			return (0);
 		}
 
 		/* Allocate the control message mbuf. */
 		control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD,
 		    IPPROTO_TCP);
 		if (control == NULL) {
 			m_freem(m);
 			m_freem(tls_data);
 
 			CURVNET_SET(toep->vnet);
 			so->so_error = ENOBUFS;
 			sorwakeup(so);
 
 			INP_WUNLOCK(inp);
 			CURVNET_RESTORE();
 
 			return (0);
 		}
 
 		tgr = (struct tls_get_record *)
 		    CMSG_DATA(mtod(control, struct cmsghdr *));
 		tgr->tls_type = tls_hdr_pkt->type;
 		tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8;
 		tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff;
 
 		m_freem(m);
 
 		if (tls_data != NULL) {
 			m_last(tls_data)->m_flags |= M_EOR;
 			tgr->tls_length = htobe16(tls_data->m_pkthdr.len);
 		} else
 			tgr->tls_length = 0;
 		m = tls_data;
 	} else
 #endif
 	{
 		/*
 		 * Only the TLS header is sent to OpenSSL, so report
 		 * errors by altering the record type.
 		 */
 		if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) !=
 		    0)
 			tls_hdr_pkt->type = CONTENT_TYPE_ERROR;
 
 		/* Trim this CPL's mbuf to only include the TLS header. */
 		KASSERT(m->m_len == len && m->m_next == NULL,
 		    ("%s: CPL spans multiple mbufs", __func__));
 		m->m_len = TLS_HEADER_LENGTH;
 		m->m_pkthdr.len = TLS_HEADER_LENGTH;
 
 		if (tls_data != NULL) {
 			/*
 			 * Update the TLS header length to be the length of
 			 * the payload data.
 			 */
 			tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len);
 
 			m->m_next = tls_data;
-			m->m_pkthdr.len += tls_data->m_len;
+			m->m_pkthdr.len += tls_data->m_pkthdr.len;
 		}
 
 #ifdef KERN_TLS
 		control = NULL;
 #endif
 	}
 
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		struct epoch_tracker et;
 
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, pdu_length);
 		m_freem(m);
 #ifdef KERN_TLS
 		m_freem(control);
 #endif
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/*
 	 * Not all of the bytes on the wire are included in the socket buffer
 	 * (e.g. the MAC of the TLS record).  However, those bytes are included
 	 * in the TCP sequence space.
 	 */
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
 #ifdef KERN_TLS
 	if (control != NULL)
 		sbappendcontrol_locked(sb, m, control, 0);
 	else
 #endif
 		sbappendstream_locked(sb, m, 0);
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u",
 	    __func__, tid, rx_credits, tp->rcv_wnd);
 #endif
 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 void
 t4_tls_mod_load(void)
 {
 
 	mtx_init(&tls_handshake_lock, "t4tls handshake", NULL, MTX_DEF);
 	t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp);
 }
 
 void
 t4_tls_mod_unload(void)
 {
 
 	t4_register_cpl_handler(CPL_TLS_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL);
 	mtx_destroy(&tls_handshake_lock);
 }
 #endif	/* TCP_OFFLOAD */
Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c
===================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c	(revision 360578)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c	(revision 360579)
@@ -1,832 +1,832 @@
 /*-
  * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_kern_tls.h"
 
 #include "en.h"
 
 #include <dev/mlx5/tls.h>
 
 #include <linux/delay.h>
 #include <sys/ktls.h>
 #include <opencrypto/cryptodev.h>
 
 #ifdef KERN_TLS
 
 MALLOC_DEFINE(M_MLX5E_TLS, "MLX5E_TLS", "MLX5 ethernet HW TLS");
 
 /* software TLS context */
 struct mlx5_ifc_sw_tls_cntx_bits {
 	struct mlx5_ifc_tls_static_params_bits param;
 	struct mlx5_ifc_tls_progress_params_bits progress;
 	struct {
 		uint8_t key_data[8][0x20];
 		uint8_t key_len[0x20];
 	} key;
 };
 
 CTASSERT(MLX5_ST_SZ_BYTES(sw_tls_cntx) <= sizeof(((struct mlx5e_tls_tag *)0)->crypto_params));
 CTASSERT(MLX5_ST_SZ_BYTES(mkc) == sizeof(((struct mlx5e_tx_umr_wqe *)0)->mkc));
 
 static const char *mlx5e_tls_stats_desc[] = {
 	MLX5E_TLS_STATS(MLX5E_STATS_DESC)
 };
 
 static void mlx5e_tls_work(struct work_struct *);
 
 static int
 mlx5e_tls_tag_zinit(void *mem, int size, int flags)
 {
 	struct mlx5e_tls_tag *ptag = mem;
 
 	MPASS(size == sizeof(*ptag));
 
 	memset(ptag, 0, sizeof(*ptag));
 	mtx_init(&ptag->mtx, "mlx5-tls-tag-mtx", NULL, MTX_DEF);
 	INIT_WORK(&ptag->work, mlx5e_tls_work);
 
 	return (0);
 }
 
 static void
 mlx5e_tls_tag_zfini(void *mem, int size)
 {
 	struct mlx5e_tls_tag *ptag = mem;
 	struct mlx5e_priv *priv;
 	struct mlx5e_tls *ptls;
 
 	ptls = ptag->tls;
 	priv = container_of(ptls, struct mlx5e_priv, tls);
 
 	flush_work(&ptag->work);
 
 	if (ptag->tisn != 0) {
 		mlx5_tls_close_tis(priv->mdev, ptag->tisn);
 		atomic_add_32(&ptls->num_resources, -1U);
 	}
 
 	mtx_destroy(&ptag->mtx);
 }
 
 static void
 mlx5e_tls_tag_zfree(struct mlx5e_tls_tag *ptag)
 {
 
 	/* reset some variables */
 	ptag->state = MLX5E_TLS_ST_INIT;
 	ptag->dek_index = 0;
 	ptag->dek_index_ok = 0;
 
 	/* avoid leaking keys */
 	memset(ptag->crypto_params, 0, sizeof(ptag->crypto_params));
 
 	/* update number of TIS contexts */
 	if (ptag->tisn == 0)
 		atomic_add_32(&ptag->tls->num_resources, -1U);
 
 	/* return tag to UMA */
 	uma_zfree(ptag->tls->zone, ptag);
 }
 
 int
 mlx5e_tls_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tls *ptls = &priv->tls;
 	struct sysctl_oid *node;
 	uint32_t x;
 
 	if (MLX5_CAP_GEN(priv->mdev, tls) == 0)
 		return (0);
 
 	ptls->wq = create_singlethread_workqueue("mlx5-tls-wq");
 	if (ptls->wq == NULL)
 		return (ENOMEM);
 
 	sysctl_ctx_init(&ptls->ctx);
 
 	snprintf(ptls->zname, sizeof(ptls->zname),
 	    "mlx5_%u_tls", device_get_unit(priv->mdev->pdev->dev.bsddev));
 
 	ptls->zone = uma_zcreate(ptls->zname, sizeof(struct mlx5e_tls_tag),
 	    NULL, NULL, mlx5e_tls_tag_zinit, mlx5e_tls_tag_zfini, UMA_ALIGN_CACHE, 0);
 
 	ptls->max_resources = 1U << MLX5_CAP_GEN(priv->mdev, log_max_dek);
 
 	for (x = 0; x != MLX5E_TLS_STATS_NUM; x++)
 		ptls->stats.arg[x] = counter_u64_alloc(M_WAITOK);
 
 	ptls->init = 1;
 
 	node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,
 	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
 	    "tls", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hardware TLS offload");
 	if (node == NULL)
 		return (0);
 
 	mlx5e_create_counter_stats(&ptls->ctx,
 	    SYSCTL_CHILDREN(node), "stats",
 	    mlx5e_tls_stats_desc, MLX5E_TLS_STATS_NUM,
 	    ptls->stats.arg);
 
 	return (0);
 }
 
 void
 mlx5e_tls_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tls *ptls = &priv->tls;
 	uint32_t x;
 
 	if (MLX5_CAP_GEN(priv->mdev, tls) == 0)
 		return;
 
 	ptls->init = 0;
 	flush_workqueue(ptls->wq);
 	sysctl_ctx_free(&ptls->ctx);
 	uma_zdestroy(ptls->zone);
 	destroy_workqueue(ptls->wq);
 
 	/* check if all resources are freed */
 	MPASS(priv->tls.num_resources == 0);
 
 	for (x = 0; x != MLX5E_TLS_STATS_NUM; x++)
 		counter_u64_free(ptls->stats.arg[x]);
 }
 
 static void
 mlx5e_tls_work(struct work_struct *work)
 {
 	struct mlx5e_tls_tag *ptag;
 	struct mlx5e_priv *priv;
 	int err;
 
 	ptag = container_of(work, struct mlx5e_tls_tag, work);
 	priv = container_of(ptag->tls, struct mlx5e_priv, tls);
 
 	switch (ptag->state) {
 	case MLX5E_TLS_ST_SETUP:
 		/* try to open TIS, if not present */
 		if (ptag->tisn == 0) {
 			err = mlx5_tls_open_tis(priv->mdev, 0, priv->tdn,
 			    priv->pdn, &ptag->tisn);
 			if (err) {
 				MLX5E_TLS_STAT_INC(ptag, tx_error, 1);
 				break;
 			}
 		}
 		MLX5_SET(sw_tls_cntx, ptag->crypto_params, progress.pd, ptag->tisn);
 
 		/* try to allocate a DEK context ID */
 		err = mlx5_encryption_key_create(priv->mdev, priv->pdn,
 		    MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, key.key_data),
 		    MLX5_GET(sw_tls_cntx, ptag->crypto_params, key.key_len),
 		    &ptag->dek_index);
 		if (err) {
 			MLX5E_TLS_STAT_INC(ptag, tx_error, 1);
 			break;
 		}
 
 		MLX5_SET(sw_tls_cntx, ptag->crypto_params, param.dek_index, ptag->dek_index);
 
 		ptag->dek_index_ok = 1;
 
 		MLX5E_TLS_TAG_LOCK(ptag);
 		if (ptag->state == MLX5E_TLS_ST_SETUP)
 			ptag->state = MLX5E_TLS_ST_TXRDY;
 		MLX5E_TLS_TAG_UNLOCK(ptag);
 		break;
 
 	case MLX5E_TLS_ST_FREED:
 		/* wait for all refs to go away */
 		while (ptag->refs != 0)
 			msleep(1);
 
 		/* try to destroy DEK context by ID */
 		if (ptag->dek_index_ok)
 			err = mlx5_encryption_key_destroy(priv->mdev, ptag->dek_index);
 
 		/* free tag */
 		mlx5e_tls_tag_zfree(ptag);
 		break;
 
 	default:
 		break;
 	}
 }
 
 static int
 mlx5e_tls_set_params(void *ctx, const struct tls_session_params *en)
 {
 
 	MLX5_SET(sw_tls_cntx, ctx, param.const_2, 2);
 	if (en->tls_vminor == TLS_MINOR_VER_TWO)
 		MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 2); /* v1.2 */
 	else
 		MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 3); /* v1.3 */
 	MLX5_SET(sw_tls_cntx, ctx, param.const_1, 1);
 	MLX5_SET(sw_tls_cntx, ctx, param.encryption_standard, 1); /* TLS */
 
 	/* copy the initial vector in place */
 	if (en->iv_len == MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv)) {
 		memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, param.gcm_iv),
 		    en->iv, MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv));
 	} else if (en->iv_len == (MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv) +
 				  MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.implicit_iv))) {
 		memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, param.gcm_iv),
 		    (char *)en->iv + MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.implicit_iv),
 		    MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv));
 		memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, param.implicit_iv),
 		    en->iv,
 		    MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.implicit_iv));
 	} else {
 		return (EINVAL);
 	}
 
 	if (en->cipher_key_len <= MLX5_FLD_SZ_BYTES(sw_tls_cntx, key.key_data)) {
 		memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, key.key_data),
 		    en->cipher_key, en->cipher_key_len);
 		MLX5_SET(sw_tls_cntx, ctx, key.key_len, en->cipher_key_len);
 	} else {
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /* Verify zero default */
 CTASSERT(MLX5E_TLS_ST_INIT == 0);
 
 int
 mlx5e_tls_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct if_snd_tag_alloc_rate_limit rl_params;
 	struct mlx5e_priv *priv;
 	struct mlx5e_tls_tag *ptag;
 	const struct tls_session_params *en;
 	int error;
 
 	priv = ifp->if_softc;
 
 	if (priv->tls.init == 0)
 		return (EOPNOTSUPP);
 
 	/* allocate new tag from zone, if any */
 	ptag = uma_zalloc(priv->tls.zone, M_NOWAIT);
 	if (ptag == NULL)
 		return (ENOMEM);
 
 	/* sanity check default values */
 	MPASS(ptag->state == MLX5E_TLS_ST_INIT);
 	MPASS(ptag->dek_index == 0);
 	MPASS(ptag->dek_index_ok == 0);
 
 	/* setup TLS tag */
 	ptag->tls = &priv->tls;
 	ptag->tag.type = params->hdr.type;
 
 	/* check if there is no TIS context */
 	if (ptag->tisn == 0) {
 		uint32_t value;
 
 		value = atomic_fetchadd_32(&priv->tls.num_resources, 1U);
 
 		/* check resource limits */
 		if (value >= priv->tls.max_resources) {
 			error = ENOMEM;
 			goto failure;
 		}
 	}
 
 	en = &params->tls.tls->params;
 
 	/* only TLS v1.2 and v1.3 is currently supported */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    (en->tls_vminor != TLS_MINOR_VER_TWO
 #ifdef TLS_MINOR_VER_THREE
 	     && en->tls_vminor != TLS_MINOR_VER_THREE
 #endif
 	     )) {
 		error = EPROTONOSUPPORT;
 		goto failure;
 	}
 
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		switch (en->cipher_key_len) {
 		case 128 / 8:
 			if (en->tls_vminor == TLS_MINOR_VER_TWO) {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_128) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			} else {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_128) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			}
 			error = mlx5e_tls_set_params(ptag->crypto_params, en);
 			if (error)
 				goto failure;
 			break;
 
 		case 256 / 8:
 			if (en->tls_vminor == TLS_MINOR_VER_TWO) {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_256) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			} else {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_256) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			}
 			error = mlx5e_tls_set_params(ptag->crypto_params, en);
 			if (error)
 				goto failure;
 			break;
 
 		default:
 			error = EINVAL;
 			goto failure;
 		}
 		break;
 	default:
 		error = EPROTONOSUPPORT;
 		goto failure;
 	}
 
 	switch (ptag->tag.type) {
 #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		memset(&rl_params, 0, sizeof(rl_params));
 		rl_params.hdr = params->tls_rate_limit.hdr;
 		rl_params.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT;
 		rl_params.max_rate = params->tls_rate_limit.max_rate;
 
 		error = mlx5e_rl_snd_tag_alloc(ifp,
 		    container_of(&rl_params, union if_snd_tag_alloc_params, rate_limit),
 		    &ptag->rl_tag);
 		if (error)
 			goto failure;
 		break;
 #endif
 	case IF_SND_TAG_TYPE_TLS:
 		memset(&rl_params, 0, sizeof(rl_params));
 		rl_params.hdr = params->tls.hdr;
 		rl_params.hdr.type = IF_SND_TAG_TYPE_UNLIMITED;
 
 		error = mlx5e_ul_snd_tag_alloc(ifp,
 		    container_of(&rl_params, union if_snd_tag_alloc_params, unlimited),
 		    &ptag->rl_tag);
 		if (error)
 			goto failure;
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto failure;
 	}
 
 	/* store pointer to mbuf tag */
 	MPASS(ptag->tag.m_snd_tag.refcount == 0);
 	m_snd_tag_init(&ptag->tag.m_snd_tag, ifp);
 	*ppmt = &ptag->tag.m_snd_tag;
 	return (0);
 
 failure:
 	mlx5e_tls_tag_zfree(ptag);
 	return (error);
 }
 
 int
 mlx5e_tls_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
 {
 #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	struct if_snd_tag_rate_limit_params rl_params;
 	int error;
 #endif
 	struct mlx5e_tls_tag *ptag =
 	    container_of(pmt, struct mlx5e_tls_tag, tag.m_snd_tag);
 
 	switch (ptag->tag.type) {
 #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		memset(&rl_params, 0, sizeof(rl_params));
 		rl_params.max_rate = params->tls_rate_limit.max_rate;
 		error = mlx5e_rl_snd_tag_modify(ptag->rl_tag,
 		    container_of(&rl_params, union if_snd_tag_modify_params, rate_limit));
 		return (error);
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 int
 mlx5e_tls_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_tls_tag *ptag =
 	    container_of(pmt, struct mlx5e_tls_tag, tag.m_snd_tag);
 	int error;
 
 	switch (ptag->tag.type) {
 #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		error = mlx5e_rl_snd_tag_query(ptag->rl_tag, params);
 		break;
 #endif
 	case IF_SND_TAG_TYPE_TLS:
 		error = mlx5e_ul_snd_tag_query(ptag->rl_tag, params);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 void
 mlx5e_tls_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_tls_tag *ptag =
 	    container_of(pmt, struct mlx5e_tls_tag, tag.m_snd_tag);
 	struct mlx5e_priv *priv;
 
 	switch (ptag->tag.type) {
 #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		mlx5e_rl_snd_tag_free(ptag->rl_tag);
 		break;
 #endif
 	case IF_SND_TAG_TYPE_TLS:
 		mlx5e_ul_snd_tag_free(ptag->rl_tag);
 		break;
 	default:
 		break;
 	}
 
 	MLX5E_TLS_TAG_LOCK(ptag);
 	ptag->state = MLX5E_TLS_ST_FREED;
 	MLX5E_TLS_TAG_UNLOCK(ptag);
 
 	priv = ptag->tag.m_snd_tag.ifp->if_softc;
 	queue_work(priv->tls.wq, &ptag->work);
 }
 
 CTASSERT((MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) % 16) == 0);
 
 static void
 mlx5e_tls_send_static_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag)
 {
 	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_umr_wqe) +
 	    MLX5_FLD_SZ_BYTES(sw_tls_cntx, param), MLX5_SEND_WQE_DS);
 	struct mlx5e_tx_umr_wqe *wqe;
 	u16 pi;
 
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(wqe, 0, sizeof(*wqe));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) |
 	    MLX5_OPCODE_UMR | (MLX5_OPCODE_MOD_UMR_TLS_TIS_STATIC_PARAMS << 24));
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	wqe->ctrl.imm = cpu_to_be32(ptag->tisn << 8);
 
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;
 	else
 		wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL;
 
 	/* fill out UMR control segment */
 	wqe->umr.flags = 0x80;	/* inline data */
 	wqe->umr.bsf_octowords = cpu_to_be16(MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) / 16);
 
 	/* copy in the static crypto parameters */
 	memcpy(wqe + 1, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, param),
 	    MLX5_FLD_SZ_BYTES(sw_tls_cntx, param));
 
 	/* copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = &ptag->refs;
 	atomic_add_int(&ptag->refs, 1);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 CTASSERT(MLX5_FLD_SZ_BYTES(sw_tls_cntx, progress) ==
     sizeof(((struct mlx5e_tx_psv_wqe *)0)->psv));
 
 static void
 mlx5e_tls_send_progress_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag)
 {
 	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_psv_wqe),
 	    MLX5_SEND_WQE_DS);
 	struct mlx5e_tx_psv_wqe *wqe;
 	u16 pi;
 
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(wqe, 0, sizeof(*wqe));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) |
 	    MLX5_OPCODE_SET_PSV | (MLX5_OPCODE_MOD_PSV_TLS_TIS_PROGRESS_PARAMS << 24));
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;
 	else
 		wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL;
 
 	/* copy in the PSV control segment */
 	memcpy(&wqe->psv, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, progress),
 	    sizeof(wqe->psv));
 
 	/* copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = &ptag->refs;
 	atomic_add_int(&ptag->refs, 1);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 /*
  * Post a fenced NOP work request to the given send queue.
  *
  * Only the control segment of the request is cleared and filled in.  As
  * with the other TLS helper requests, the control segment is staged in
  * sq->doorbell.d32, ptag->refs is incremented and recorded in the queue
  * entry, and the producer counter is advanced.
  */
 static void
 mlx5e_tls_send_nop(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag)
 {
 	const u32 ds_cnt = MLX5_SEND_WQEBB_NUM_DS;
 	struct mlx5e_tx_wqe *wqe;
 	u16 pi;
 
 	/* presumably sz_m1 is the ring size minus one - TODO confirm */
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	/* always fence; additionally request a completion when due */
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;
 	else
 		wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL;
 
 	/* Copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	/* no mbuf is attached to this entry, only a reference on the tag */
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = &ptag->refs;
 	atomic_add_int(&ptag->refs, 1);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 #define	SBTLS_MBUF_NO_DATA ((struct mbuf *)1)
 
 /*
  * Build a two-mbuf chain covering the bytes of an unmapped (M_NOMAP)
  * buffer that precede the data carried by 'mb'.
  *
  * mb       - packet whose second mbuf (mb->m_next) must have both
  *            M_NOMAP and M_EXT set
  * wait     - allocation flag handed to m_gethdr() and m_get()
  * tcp_old  - sequence number limiting how far back the chain reaches
  * ptcp_seq - in: sequence number of 'mb'; out: decremented by the
  *            number of bytes left in the duplicated mbuf
  *
  * Returns NULL on a format mismatch or allocation failure,
  * SBTLS_MBUF_NO_DATA when the offset into the unmapped data is zero, and
  * otherwise a new packet header mbuf followed by a duplicate (mb_dupcl)
  * of the unmapped mbuf.
  */
 static struct mbuf *
 sbtls_recover_record(struct mbuf *mb, int wait, uint32_t tcp_old, uint32_t *ptcp_seq)
 {
 	struct mbuf *mr, *top;
 	uint32_t offset;
 	uint32_t delta;
 
 	/* check format of incoming mbuf */
 	if (mb->m_next == NULL ||
 	    (mb->m_next->m_flags & (M_NOMAP | M_EXT)) != (M_NOMAP | M_EXT)) {
 		top = NULL;
 		goto done;
 	}
 
 	/* get unmapped data offset */
 	offset = mtod(mb->m_next, uintptr_t);
 
 	/* check if we don't need to re-transmit anything */
 	if (offset == 0) {
 		top = SBTLS_MBUF_NO_DATA;
 		goto done;
 	}
 
 	/* try to get a new packet header */
 	top = m_gethdr(wait, MT_DATA);
 	if (top == NULL)
 		goto done;
 
 	mr = m_get(wait, MT_DATA);
 	if (mr == NULL) {
 		m_free(top);
 		top = NULL;
 		goto done;
 	}
 
 	top->m_next = mr;
 
 	/* share the external storage of the unmapped mbuf */
 	mb_dupcl(mr, mb->m_next);
 
 	/* the beginning of the TLS record */
 	mr->m_data = NULL;
 
 	/* setup packet header length */
 	top->m_pkthdr.len = mr->m_len = offset;
 
 	/* check for partial re-transmit */
 	delta = *ptcp_seq - tcp_old;
 
 	/*
 	 * NOTE(review): only 'mr' is trimmed here; top->m_pkthdr.len keeps
 	 * the untrimmed offset unless m_adj() updates it - confirm.
 	 */
 	if (delta < offset) {
 		m_adj(mr, offset - delta);
 		offset = delta;
 	}
 
 	/*
 	 * Rewind the TCP sequence number by the amount of data
 	 * retransmitted:
 	 */
 	*ptcp_seq -= offset;
 done:
 	return (top);
 }
 
 /*
  * Walk the mbuf chain and store the sequence number field of the first
  * M_NOMAP mbuf in *pseq.
  *
  * Returns 1 when such an mbuf was found and 0 otherwise, in which case
  * *pseq is left untouched.
  */
 static int
 mlx5e_sq_tls_populate(struct mbuf *mb, uint64_t *pseq)
 {
 
 	for (; mb != NULL; mb = mb->m_next) {
 		/* skip mbufs that are not unmapped */
 		if (!(mb->m_flags & M_NOMAP))
 			continue;
-		*pseq = mb->m_ext_pgs.seqno;
+		*pseq = mb->m_epg_seqno;
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * TLS hook for the transmit path of a send queue.
  *
  * sq    - send queue the packet is about to be posted on
  * parg  - transmit arguments; tisn, ihs and pref are filled in here
  * ppmb  - pointer to the packet; only read by this function
  *
  * Returns:
  *   MLX5E_TLS_CONTINUE - the caller should go on with the packet.  For
  *       packets without a TLS send tag parg is left untouched; for TCP
  *       packets without TLS payload tisn is cleared.
  *   MLX5E_TLS_LOOP - resynchronisation requests were queued and
  *       expected_seq now matches the packet; presumably the caller is
  *       expected to call again for the same packet - TODO confirm.
  *   MLX5E_TLS_FAILURE - headers could not be parsed, the tag is not set
  *       up yet, or recovery data could not be built or sent.
  */
 int
 mlx5e_sq_tls_xmit(struct mlx5e_sq *sq, struct mlx5e_xmit_args *parg, struct mbuf **ppmb)
 {
 	struct mlx5e_tls_tag *ptls_tag;
 	struct mlx5e_snd_tag *ptag;
 	struct tcphdr *th;
 	struct mbuf *mb = *ppmb;
 	u64 rcd_sn;
 	u32 header_size;
 	u32 mb_seq;
 
 	/* packets without a send tag are not handled here */
 	if ((mb->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0)
 		return (MLX5E_TLS_CONTINUE);
 
 	ptag = container_of(mb->m_pkthdr.snd_tag,
 	    struct mlx5e_snd_tag, m_snd_tag);
 
 	/* only TLS (and, when compiled in, TLS rate limit) tags qualify */
 	if (
 #if defined(RATELIMIT) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	    ptag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT &&
 #endif
 	    ptag->type != IF_SND_TAG_TYPE_TLS)
 		return (MLX5E_TLS_CONTINUE);
 
 	ptls_tag = container_of(ptag, struct mlx5e_tls_tag, tag);
 
 	header_size = mlx5e_get_full_header_size(mb, &th);
 	if (unlikely(header_size == 0 || th == NULL))
 		return (MLX5E_TLS_FAILURE);
 
 	/*
 	 * Send non-TLS TCP packets AS-IS:
 	 */
 	if (header_size == mb->m_pkthdr.len ||
 	    mlx5e_sq_tls_populate(mb, &rcd_sn) == 0) {
 		parg->tisn = 0;
 		parg->ihs = header_size;
 		return (MLX5E_TLS_CONTINUE);
 	}
 
 	mb_seq = ntohl(th->th_seq);
 
 	/*
 	 * The first TLS packet on a tag schedules the setup work and is
 	 * failed; packets keep being failed until the state has moved on.
 	 */
 	MLX5E_TLS_TAG_LOCK(ptls_tag);
 	switch (ptls_tag->state) {
 	case MLX5E_TLS_ST_INIT:
 		queue_work(sq->priv->tls.wq, &ptls_tag->work);
 		ptls_tag->state = MLX5E_TLS_ST_SETUP;
 		ptls_tag->expected_seq = ~mb_seq;	/* force setup */
 		MLX5E_TLS_TAG_UNLOCK(ptls_tag);
 		return (MLX5E_TLS_FAILURE);
 
 	case MLX5E_TLS_ST_SETUP:
 		MLX5E_TLS_TAG_UNLOCK(ptls_tag);
 		return (MLX5E_TLS_FAILURE);
 
 	default:
 		MLX5E_TLS_TAG_UNLOCK(ptls_tag);
 		break;
 	}
 
 	/* sequence number differs from the expected one: resynchronise */
 	if (unlikely(ptls_tag->expected_seq != mb_seq)) {
 		struct mbuf *r_mb;
 		uint32_t tcp_seq = mb_seq;
 
 		r_mb = sbtls_recover_record(mb, M_NOWAIT, ptls_tag->expected_seq, &tcp_seq);
 		if (r_mb == NULL) {
 			MLX5E_TLS_STAT_INC(ptls_tag, tx_error, 1);
 			return (MLX5E_TLS_FAILURE);
 		}
 
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_packets_ooo, 1);
 
 		/*
 		 * NOTE(review): sbtls_recover_record() clears m_data on the
 		 * second mbuf of the chain it returns, while the test below
 		 * looks at the head mbuf - confirm which one is intended.
 		 */
 		/* check if this is the first fragment of a TLS record */
 		if (r_mb == SBTLS_MBUF_NO_DATA || r_mb->m_data == NULL) {
 			/* setup TLS static parameters */
 			MLX5_SET64(sw_tls_cntx, ptls_tag->crypto_params,
 			    param.initial_record_number, rcd_sn);
 
 			/* setup TLS progress parameters */
 			MLX5_SET(sw_tls_cntx, ptls_tag->crypto_params,
 			    progress.next_record_tcp_sn, tcp_seq);
 
 			/*
 			 * NOTE: The sendqueue should have enough room to
 			 * carry both the static and the progress parameters
 			 * when we get here!
 			 */
 			mlx5e_tls_send_static_parameters(sq, ptls_tag);
 			mlx5e_tls_send_progress_parameters(sq, ptls_tag);
 
 			/* nothing to replay: post a NOP and resume at mb_seq */
 			if (r_mb == SBTLS_MBUF_NO_DATA) {
 				mlx5e_tls_send_nop(sq, ptls_tag);
 				ptls_tag->expected_seq = mb_seq;
 				return (MLX5E_TLS_LOOP);
 			}
 		}
 
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes_ooo, r_mb->m_pkthdr.len);
 
 		/* setup transmit arguments */
 		parg->tisn = ptls_tag->tisn;
 		parg->pref = &ptls_tag->refs;
 
 		/* try to send DUMP data */
 		if (mlx5e_sq_dump_xmit(sq, parg, &r_mb) != 0) {
 			/* the recovery chain is freed here on failure */
 			m_freem(r_mb);
 			ptls_tag->expected_seq = tcp_seq;
 			return (MLX5E_TLS_FAILURE);
 		} else {
 			ptls_tag->expected_seq = mb_seq;
 			return (MLX5E_TLS_LOOP);
 		}
 	} else {
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_packets, 1);
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes, mb->m_pkthdr.len);
 	}
 	/* advance by the payload length of this packet */
 	ptls_tag->expected_seq += mb->m_pkthdr.len - header_size;
 
 	parg->tisn = ptls_tag->tisn;
 	parg->ihs = header_size;
 	parg->pref = &ptls_tag->refs;
 	return (MLX5E_TLS_CONTINUE);
 }
 
 #else
 
 /*
  * Stub for the branch of the conditional that ends at
  * "#endif KERN_TLS": nothing to set up, always returns 0.
  */
 int
 mlx5e_tls_init(struct mlx5e_priv *priv)
 {
 
 	return (0);
 }
 
 /*
  * Stub counterpart of mlx5e_tls_init() in the same conditional branch:
  * there is nothing to tear down.
  */
 void
 mlx5e_tls_cleanup(struct mlx5e_priv *priv)
 {
 	/* NOP */
 }
 
 #endif		/* KERN_TLS */
Index: head/sys/kern/kern_mbuf.c
===================================================================
--- head/sys/kern/kern_mbuf.c	(revision 360578)
+++ head/sys/kern/kern_mbuf.c	(revision 360579)
@@ -1,1518 +1,1518 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004, 2005,
  *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/domainset.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/uma.h>
 #include <vm/uma_dbg.h>
 
 /*
  * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
  * Zones.
  *
  * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
  * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
  * administrator so desires.
  *
  * Mbufs are allocated from a UMA Master Zone called the Mbuf
  * Zone.
  *
  * Additionally, FreeBSD provides a Packet Zone, which it
  * configures as a Secondary Zone to the Mbuf Master Zone,
  * thus sharing backend Slab kegs with the Mbuf Master Zone.
  *
  * Thus common-case allocations and locking are simplified:
  *
  *  m_clget()                m_getcl()
  *    |                         |
  *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
  *    |   |             [     Packet   ]            |
  *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
  *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
  *        |                       \________         |
  *  [ Cluster Keg   ]                      \       /
  *        |	                         [ Mbuf Keg   ]
  *  [ Cluster Slabs ]                         |
  *        |                              [ Mbuf Slabs ]
  *         \____________(VM)_________________/
  *
  *
  * Whenever an object is allocated with uma_zalloc() out of
  * one of the Zones its _ctor_ function is executed.  The same
  * for any deallocation through uma_zfree() the _dtor_ function
  * is executed.
  *
  * Caches are per-CPU and are filled from the Master Zone.
  *
  * Whenever an object is allocated from the underlying global
  * memory pool it gets pre-initialized with the _zinit_ functions.
  * When the Keg's are overfull objects get decommissioned with
  * _zfini_ functions and free'd back to the global memory pool.
  *
  */
 
 int nmbufs;			/* limits number of mbufs */
 int nmbclusters;		/* limits number of mbuf clusters */
 int nmbjumbop;			/* limits number of page size jumbo clusters */
 int nmbjumbo9;			/* limits number of 9k jumbo clusters */
 int nmbjumbo16;			/* limits number of 16k jumbo clusters */
 
 bool mb_use_ext_pgs;		/* use EXT_PGS mbufs for sendfile & TLS */
 SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN,
     &mb_use_ext_pgs, 0,
     "Use unmapped mbufs for sendfile(2) and TLS offload");
 
 static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */
 
 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
     "Maximum real memory allocatable to various mbuf types");
 
 static counter_u64_t snd_tag_count;
 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
     &snd_tag_count, "# of active mbuf send tags");
 
 /*
  * tunable_mbinit() has to be run before any mbuf allocations are done.
  */
 static void
 tunable_mbinit(void *dummy)
 {
 	quad_t realmem;
 
 	/*
 	 * The default limit for all mbuf related memory is 1/2 of all
 	 * available kernel memory (physical or kmem).
 	 * At most it can be 3/4 of available kernel memory.
 	 */
 	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
 	maxmbufmem = realmem / 2;
 	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
 	if (maxmbufmem > realmem / 4 * 3)
 		maxmbufmem = realmem / 4 * 3;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 	if (nmbclusters == 0)
 		nmbclusters = maxmbufmem / MCLBYTES / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
 	if (nmbjumbop == 0)
 		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
 	if (nmbjumbo9 == 0)
 		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
 	if (nmbjumbo16 == 0)
 		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
 
 	/*
 	 * We need at least as many mbufs as we have clusters of
 	 * the various types added together.
 	 */
 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
 		nmbufs = lmax(maxmbufmem / MSIZE / 5,
 		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
 }
 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
 
 static int
 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbclusters;
 
 	newnmbclusters = nmbclusters;
 	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
 	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
 		if (newnmbclusters > nmbclusters &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbclusters = newnmbclusters;
 			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 			EVENTHANDLER_INVOKE(nmbclusters_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0,
     sysctl_nmbclusters, "IU",
     "Maximum number of mbuf clusters allowed");
 
 static int
 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbop;
 
 	newnmbjumbop = nmbjumbop;
 	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
 		if (newnmbjumbop > nmbjumbop &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbop = newnmbjumbop;
 			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0,
     sysctl_nmbjumbop, "IU",
     "Maximum number of mbuf page size jumbo clusters allowed");
 
 static int
 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo9;
 
 	newnmbjumbo9 = nmbjumbo9;
 	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
 		if (newnmbjumbo9 > nmbjumbo9 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo9 = newnmbjumbo9;
 			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0,
     sysctl_nmbjumbo9, "IU",
     "Maximum number of mbuf 9k jumbo clusters allowed");
 
 static int
 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo16;
 
 	newnmbjumbo16 = nmbjumbo16;
 	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
 		if (newnmbjumbo16 > nmbjumbo16 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo16 = newnmbjumbo16;
 			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0,
     sysctl_nmbjumbo16, "IU",
     "Maximum number of mbuf 16k jumbo clusters allowed");
 
 static int
 sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbufs;
 
 	newnmbufs = nmbufs;
 	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
 	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
 		if (newnmbufs > nmbufs) {
 			nmbufs = newnmbufs;
 			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 			EVENTHANDLER_INVOKE(nmbufs_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &nmbufs, 0, sysctl_nmbufs, "IU",
     "Maximum number of mbufs allowed");
 
 /*
  * Zones from which we allocate.
  */
 uma_zone_t	zone_mbuf;
 uma_zone_t	zone_clust;
 uma_zone_t	zone_pack;
 uma_zone_t	zone_jumbop;
 uma_zone_t	zone_jumbo9;
 uma_zone_t	zone_jumbo16;
 
 /*
  * Local prototypes.
  */
 static int	mb_ctor_mbuf(void *, int, void *, int);
 static int	mb_ctor_clust(void *, int, void *, int);
 static int	mb_ctor_pack(void *, int, void *, int);
 static void	mb_dtor_mbuf(void *, int, void *);
 static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
 
 _Static_assert(sizeof(struct mbuf) <= MSIZE,
     "size of mbuf exceeds MSIZE");
 /*
  * Initialize FreeBSD Network buffer allocation.
  *
  * Creates the mbuf, cluster, packet and jumbo cluster zones, applies the
  * configured limits to them, registers mb_reclaim() as the action taken
  * when a limit is reached and on the vm_lowmem event, and allocates the
  * send tag counter.
  */
 static void
 mbuf_init(void *dummy)
 {
 
 	/*
 	 * Configure UMA zones for Mbufs, Clusters, and Packets.
 	 */
 	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
 	    mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
 	    MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET);
 	/* a limit of zero or less leaves the zone uncapped */
 	if (nmbufs > 0)
 		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
 	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);
 
 	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbclusters > 0)
 		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
 	uma_zone_set_maxaction(zone_clust, mb_reclaim);
 
 	/* the packet zone is layered on top of the mbuf zone created above */
 	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
 	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
 
 	/* Make jumbo frame zone too. Page size, 9k and 16k. */
 	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbop > 0)
 		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
 	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);
 
 	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbo9 > 0)
 		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
 	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);
 
 	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbo16 > 0)
 		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
 	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
 
 	/*
 	 * Hook event handler for low-memory situation, used to
 	 * drain protocols and push data back to the caches (UMA
 	 * later pushes it back to VM).
 	 */
 	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 
 	/* backs the kern.ipc.num_snd_tags counter */
 	snd_tag_count = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
 
 #ifdef DEBUGNET
 /*
  * debugnet makes use of a pre-allocated pool of mbufs and clusters.  When
  * debugnet is configured, we initialize a set of UMA cache zones which return
  * items from this pool.  At panic-time, the regular UMA zone pointers are
  * overwritten with those of the cache zones so that drivers may allocate and
  * free mbufs and clusters without attempting to allocate physical memory.
  *
  * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
  * the purpose of caching clusters, we treat them as mbufs.
  */
 static struct mbufq dn_mbufq =
     { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX };
 static struct mbufq dn_clustq =
     { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX };
 
 static int dn_clsize;
 static uma_zone_t dn_zone_mbuf;
 static uma_zone_t dn_zone_clust;
 static uma_zone_t dn_zone_pack;
 
 static struct debugnet_saved_zones {
 	uma_zone_t dsz_mbuf;
 	uma_zone_t dsz_clust;
 	uma_zone_t dsz_pack;
 	uma_zone_t dsz_jumbop;
 	uma_zone_t dsz_jumbo9;
 	uma_zone_t dsz_jumbo16;
 	bool dsz_debugnet_zones_enabled;
 } dn_saved_zones;
 
 /*
  * Import routine of the debugnet mbuf and cluster cache zones: hand out
  * up to 'count' buffers taken from the queue passed as 'arg'.  Returns
  * the number of entries of 'store' that were filled in.
  */
 static int
 dn_buf_import(void *arg, void **store, int count, int domain __unused,
     int flags)
 {
 	struct mbufq *queue = arg;
 	struct mbuf *buf;
 	int filled;
 
 	filled = 0;
 	while (filled < count) {
 		buf = mbufq_dequeue(queue);
 		if (buf == NULL)
 			break;
 		/* mbufs are MSIZE bytes, anything else is a cluster */
 		trash_init(buf, queue == &dn_mbufq ? MSIZE : dn_clsize, flags);
 		store[filled] = buf;
 		filled++;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || filled == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (filled);
 }
 
 /*
  * Release routine of the debugnet mbuf and cluster cache zones: put the
  * 'count' buffers in 'store' back on the queue passed as 'arg'.
  */
 static void
 dn_buf_release(void *arg, void **store, int count)
 {
 	struct mbufq *queue = arg;
 	struct mbuf *buf;
 	int idx;
 
 	for (idx = 0; idx < count; idx++) {
 		buf = store[idx];
 		(void)mbufq_enqueue(queue, buf);
 	}
 }
 
 /*
  * Import routine of the debugnet packet cache zone: build up to 'count'
  * packets, each an mbuf with a cluster from dn_zone_clust attached, and
  * place them in 'store'.  Returns the number of packets built; an mbuf
  * whose cluster could not be allocated is freed again.
  */
 static int
 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused,
     int flags __unused)
 {
 	struct mbuf *m;
 	void *clust;
 	int i;
 
 	for (i = 0; i < count; i++) {
 		/* m_get() takes the wait flag first and the mbuf type second. */
 		m = m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			break;
 		clust = uma_zalloc(dn_zone_clust, M_NOWAIT);
 		if (clust == NULL) {
 			m_free(m);
 			break;
 		}
 		/* attach the cluster to the mbuf */
 		mb_ctor_clust(clust, dn_clsize, m, 0);
 		store[i] = m;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || i == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (i);
 }
 
 /*
  * Release routine of the debugnet packet cache zone: split each packet
  * in 'store' and return its cluster and its mbuf to their debugnet
  * zones.
  */
 static void
 dn_pack_release(void *arg __unused, void **store, int count)
 {
 	struct mbuf *pkt;
 	int idx;
 
 	for (idx = 0; idx < count; idx++) {
 		pkt = store[idx];
 		/* the cluster goes first, while the mbuf is still valid */
 		uma_zfree(dn_zone_clust, pkt->m_ext.ext_buf);
 		uma_zfree(dn_zone_mbuf, pkt);
 	}
 }
 
 /*
  * Destroy the debugnet UMA cache zones, then free every mbuf and
  * cluster that is left in the pre-allocated debugnet pool.
  */
 void
 debugnet_mbuf_drain(void)
 {
 	struct mbuf *m;
 	void *item;
 
 	/* Zones that were never created are simply skipped. */
 	if (dn_zone_mbuf != NULL) {
 		uma_zdestroy(dn_zone_mbuf);
 		dn_zone_mbuf = NULL;
 	}
 	if (dn_zone_clust != NULL) {
 		uma_zdestroy(dn_zone_clust);
 		dn_zone_clust = NULL;
 	}
 	if (dn_zone_pack != NULL) {
 		uma_zdestroy(dn_zone_pack);
 		dn_zone_pack = NULL;
 	}
 
 	/* Empty the mbuf queue ... */
 	for (;;) {
 		m = mbufq_dequeue(&dn_mbufq);
 		if (m == NULL)
 			break;
 		m_free(m);
 	}
 	/* ... and hand the clusters back to the zone matching their size. */
 	for (;;) {
 		item = mbufq_dequeue(&dn_clustq);
 		if (item == NULL)
 			break;
 		uma_zfree(m_getzone(dn_clsize), item);
 	}
 }
 
 /*
  * Callback invoked immediately prior to starting a debugnet connection.
  * Saves the regular mbuf zone pointers and replaces them with the
  * debugnet cache zones.
  */
 void
 debugnet_mbuf_start(void)
 {
 
 	MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled);
 
 	/* Remember the regular zones so that they can be restored later. */
 	dn_saved_zones.dsz_mbuf = zone_mbuf;
 	dn_saved_zones.dsz_clust = zone_clust;
 	dn_saved_zones.dsz_pack = zone_pack;
 	dn_saved_zones.dsz_jumbop = zone_jumbop;
 	dn_saved_zones.dsz_jumbo9 = zone_jumbo9;
 	dn_saved_zones.dsz_jumbo16 = zone_jumbo16;
 	dn_saved_zones.dsz_debugnet_zones_enabled = true;
 
 	/*
 	 * Every cluster zone hands out buffers of the size the drivers
 	 * asked for.  A driver has to reinitialize the zones itself when
 	 * the MTU of a debugnet-enabled interface changes.
 	 */
 	printf("debugnet: overwriting mbuf zone pointers\n");
 	zone_mbuf = dn_zone_mbuf;
 	zone_pack = dn_zone_pack;
 	zone_clust = dn_zone_clust;
 	zone_jumbop = dn_zone_clust;
 	zone_jumbo9 = dn_zone_clust;
 	zone_jumbo16 = dn_zone_clust;
 }
 
 /*
  * Callback invoked when a debugnet connection is closed/finished.
  */
 /*
  * Put back the zone pointers that debugnet_mbuf_start() saved in
  * dn_saved_zones and clear the saved state, which also resets the
  * "enabled" flag asserted on entry.
  */
 void
 debugnet_mbuf_finish(void)
 {
 
 	MPASS(dn_saved_zones.dsz_debugnet_zones_enabled);
 
 	printf("debugnet: restoring mbuf zone pointers\n");
 	zone_mbuf = dn_saved_zones.dsz_mbuf;
 	zone_clust = dn_saved_zones.dsz_clust;
 	zone_pack = dn_saved_zones.dsz_pack;
 	zone_jumbop = dn_saved_zones.dsz_jumbop;
 	zone_jumbo9 = dn_saved_zones.dsz_jumbo9;
 	zone_jumbo16 = dn_saved_zones.dsz_jumbo16;
 
 	/* forget the saved pointers and mark the debugnet zones inactive */
 	memset(&dn_saved_zones, 0, sizeof(dn_saved_zones));
 }
 
 /*
  * Reinitialize the debugnet mbuf+cluster pool and cache zones.
  *
  * Any existing pool is drained first.  The cache zones are then created
  * anew and primed with 'nmbuf' mbufs and 'nclust' clusters of 'clsize'
  * bytes, which are allocated from the regular zones and freed into the
  * debugnet zones.
  */
 void
 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize)
 {
 	struct mbuf *m;
 	void *item;
 
 	debugnet_mbuf_drain();
 
 	dn_clsize = clsize;
 
 	dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME,
 	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
 	    dn_buf_import, dn_buf_release,
 	    &dn_mbufq, UMA_ZONE_NOBUCKET);
 
 	dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME,
 	    clsize, mb_ctor_clust, NULL, NULL, NULL,
 	    dn_buf_import, dn_buf_release,
 	    &dn_clustq, UMA_ZONE_NOBUCKET);
 
 	dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME,
 	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
 	    dn_pack_import, dn_pack_release,
 	    NULL, UMA_ZONE_NOBUCKET);
 
 	while (nmbuf-- > 0) {
 		/*
 		 * m_get() takes the wait flag first and the mbuf type
 		 * second; M_WAITOK has to be the wait flag so that the
 		 * result can be passed on without a NULL check.
 		 */
 		m = m_get(M_WAITOK, MT_DATA);
 		uma_zfree(dn_zone_mbuf, m);
 	}
 	while (nclust-- > 0) {
 		item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK);
 		uma_zfree(dn_zone_clust, item);
 	}
 }
 #endif /* DEBUGNET */
 
 /*
  * Constructor for Mbuf master zone.
  *
  * 'arg' points to a mb_args structure carrying the call-specific
  * information (type and flags) needed by the mbuf allocation API.
  * See mbuf.h.
  */
 static int
 mb_ctor_mbuf(void *mem, int size, void *arg, int how)
 {
 	struct mb_args *margs = arg;
 	struct mbuf *mb = mem;
 
 	/*
 	 * MT_NOINIT mbufs are initialized later by the caller, which is
 	 * then also responsible for setting up any MAC labels.
 	 */
 	if (margs->type == MT_NOINIT)
 		return (0);
 
 	MPASS((margs->flags & M_NOFREE) == 0);
 
 	return (m_init(mb, how, margs->type, margs->flags));
 }
 
 /*
  * The Mbuf master zone destructor.
  *
  * 'arg' carries destructor flags.  Unless MB_DTOR_SKIP is set, the
  * packet tags of a packet header mbuf are deleted.
  */
 static void
 mb_dtor_mbuf(void *mem, int size, void *arg)
 {
 	struct mbuf *mb = mem;
 	unsigned long dflags = (unsigned long)arg;
 
 	KASSERT((mb->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
 
 	if (dflags & MB_DTOR_SKIP)
 		return;
 	if ((mb->m_flags & M_PKTHDR) == 0)
 		return;
 	if (!SLIST_EMPTY(&mb->m_pkthdr.tags))
 		m_tag_delete_chain(mb, NULL);
 }
 
 /*
  * The Mbuf Packet zone destructor.
  *
  * Deletes the packet tags, checks that the mbuf still carries an
  * untouched packet cluster, and drains the packet zone when the cluster
  * zone has run out.
  */
 static void
 mb_dtor_pack(void *mem, int size, void *arg)
 {
 	struct mbuf *pkt = mem;
 
 	if (pkt->m_flags & M_PKTHDR)
 		m_tag_delete_chain(pkt, NULL);
 
 	/* The cluster has to come back in its pristine state. */
 	KASSERT((pkt->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
 	KASSERT(pkt->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
 	KASSERT(pkt->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
 	KASSERT(pkt->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
 	KASSERT(pkt->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
 	KASSERT(pkt->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
 	KASSERT(pkt->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
 #ifdef INVARIANTS
 	trash_dtor(pkt->m_ext.ext_buf, MCLBYTES, arg);
 #endif
 	/*
 	 * Processes may be blocked on zone_clust waiting for pages to be
 	 * freed up; draining the packet zone wakes them up.  The
 	 * exhaustion check is done without the zone lock and may therefore
 	 * miss the condition.  That race is deliberate: taking the zone
 	 * lock on every mbuf free would be too expensive.
 	 */
 	if (uma_zone_exhausted(zone_clust))
 		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 }
 
 /*
  * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
  *
  * 'arg' is the mbuf the cluster storage is being configured for.  With
  * a NULL 'arg' only the cluster is allocated and no mbuf is set up to
  * point at it.  See mbuf.h.  Always returns 0.
  */
 static int
 mb_ctor_clust(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *owner = arg;
 
 	/* A bare cluster needs no further setup. */
 	if (owner == NULL)
 		return (0);
 
 	owner->m_ext.ext_buf = (char *)mem;
 	owner->m_data = owner->m_ext.ext_buf;
 	owner->m_flags |= M_EXT;
 	owner->m_ext.ext_free = NULL;
 	owner->m_ext.ext_arg1 = NULL;
 	owner->m_ext.ext_arg2 = NULL;
 	owner->m_ext.ext_size = size;
 	owner->m_ext.ext_type = m_gettype(size);
 	/* the reference count lives in the mbuf itself and starts at one */
 	owner->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	owner->m_ext.ext_count = 1;
 
 	return (0);
 }
 
 /*
  * The Packet secondary zone's init routine, executed on the
  * object's transition from mbuf keg slab to zone cache.  Attaches a
  * cluster from zone_clust to the mbuf and returns ENOMEM when that
  * fails.
  */
 static int
 mb_zinit_pack(void *mem, int size, int how)
 {
 	struct mbuf *pkt = mem;		/* not initialized yet */
 
 	if (uma_zalloc_arg(zone_clust, pkt, how) == NULL)
 		return (ENOMEM);
 	if (pkt->m_ext.ext_buf == NULL)
 		return (ENOMEM);
 
 	/* Replace the type that the cluster constructor stored. */
 	pkt->m_ext.ext_type = EXT_PACKET;
 #ifdef INVARIANTS
 	trash_init(pkt->m_ext.ext_buf, MCLBYTES, how);
 #endif
 	return (0);
 }
 
 /*
  * The Packet secondary zone's fini routine, executed on the
  * object's transition from zone cache to keg slab.  Gives the attached
  * cluster back to zone_clust.
  */
 static void
 mb_zfini_pack(void *mem, int size)
 {
 	struct mbuf *pkt = mem;
 
 #ifdef INVARIANTS
 	trash_fini(pkt->m_ext.ext_buf, MCLBYTES);
 #endif
 	uma_zfree_arg(zone_clust, pkt->m_ext.ext_buf, NULL);
 #ifdef INVARIANTS
 	trash_dtor(mem, size, NULL);
 #endif
 }
 
 /*
  * The "packet" keg constructor.
  */
 static int
 mb_ctor_pack(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 	struct mb_args *args;
 	int error, flags;
 	short type;
 
 	m = (struct mbuf *)mem;
 	args = (struct mb_args *)arg;
 	flags = args->flags;
 	type = args->type;
 	MPASS((flags & M_NOFREE) == 0);
 
 #ifdef INVARIANTS
 	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
 #endif
 
 	error = m_init(m, how, type, flags);
 
 	/* m_ext is already initialized. */
 	m->m_data = m->m_ext.ext_buf;
  	m->m_flags = (flags | M_EXT);
 
 	return (error);
 }
 
 /*
  * This is the protocol drain routine.  Called by UMA whenever any of the
  * mbuf zones is close to its limit.
  *
  * No locks should be held when this is called.  The drain routines
  * presently have to acquire some locks, which raises the possibility of
  * a lock order reversal.
  */
 static void
 mb_reclaim(uma_zone_t zone __unused, int pending __unused)
 {
 	struct epoch_tracker et;
 	struct domain *dom;
 	struct protosw *proto;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);
 
 	/* Call the drain routine of every protocol that provides one. */
 	NET_EPOCH_ENTER(et);
 	for (dom = domains; dom != NULL; dom = dom->dom_next) {
 		for (proto = dom->dom_protosw;
 		    proto < dom->dom_protoswNPROTOSW; proto++) {
 			if (proto->pr_drain == NULL)
 				continue;
 			(*proto->pr_drain)();
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Free "count" units of I/O from an mbuf chain.  They could be held
  * in EXT_PGS or just as a normal mbuf.  This code is intended to be
  * called in an error path (I/O error, closed connection, etc).
  */
 /*
  * m     - head of the chain to release units from
  * count - number of units to release; the assertion at the end requires
  *         that the chain held at least this many
  */
 void
 mb_free_notready(struct mbuf *m, int count)
 {
 	int i;
 
 	for (i = 0; i < count && m != NULL; i++) {
 		/*
 		 * An EXT_PGS mbuf accounts for one unit per not-ready
 		 * count; it is only freed once that count drops to zero.
 		 */
 		if ((m->m_flags & M_EXT) != 0 &&
 		    m->m_ext.ext_type == EXT_PGS) {
-			m->m_ext_pgs.nrdy--;
-			if (m->m_ext_pgs.nrdy != 0)
+			m->m_epg_nrdy--;
+			if (m->m_epg_nrdy != 0)
 				continue;
 		}
 		/* m_free() returns the next mbuf of the chain */
 		m = m_free(m);
 	}
 	KASSERT(i == count, ("Removed only %d items from %p", i, m));
 }
 
 /*
  * Compress an unmapped mbuf into a simple mbuf when it holds a small
  * amount of data.  This is used as a DOS defense to avoid having
  * small packets tie up wired pages, an ext_pgs structure, and an
  * mbuf.  The mbuf is converted in place, so this only works when there
  * are no other references to 'm'; EBUSY is returned when there are,
  * and 0 on success.
  */
 int
 mb_unmapped_compress(struct mbuf *m)
 {
 	volatile u_int *refs;
 	char copy[MLEN];
 
 	/*
 	 * 'm' must not have a packet header: such an mbuf could hold
 	 * only MHLEN bytes and its m_data would need a different
 	 * initialization.
 	 */
 	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
 	    m->m_ext.ext_type == EXT_PGS,
             ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
 	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
 
 	/* Locate the reference count, embedded or external. */
 	if ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) == 0) {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refs = m->m_ext.ext_cnt;
 	} else {
 		refs = &m->m_ext.ext_count;
 	}
 
 	if (*refs != 1)
 		return (EBUSY);
 
 	/* Save the payload before the backing pages are released. */
 	m_copydata(m, 0, m->m_len, copy);
 	m->m_ext.ext_free(m);
 
 	/* From here on 'm' is an ordinary mbuf with internal storage. */
 	m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
 	m->m_data = m->m_dat;
 
 	/* Restore the payload. */
 	bcopy(copy, mtod(m, char *), m->m_len);
 
 	return (0);
 }
 
 /*
  * These next few routines are used to permit downgrading an unmapped
  * mbuf to a chain of mapped mbufs.  This is used when an interface
  * doesn't support unmapped mbufs or if checksums need to be
  * computed in software.
  *
  * Each unmapped mbuf is converted to a chain of mbufs.  First, any
  * TLS header data is stored in a regular mbuf.  Second, each page of
  * unmapped data is stored in an mbuf with an EXT_SFBUF external
  * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
  * associated physical page.  They also hold a reference on the
  * original EXT_PGS mbuf to ensure the physical page doesn't go away.
  * Finally, any TLS trailer data is stored in a regular mbuf.
  *
  * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
  * mbufs.  It frees the associated sf_buf and releases its reference
  * on the original EXT_PGS mbuf.
  *
  * _mb_unmapped_to_ext() is a helper function that converts a single
  * unmapped mbuf into a chain of mbufs.
  *
  * mb_unmapped_to_ext() is the public function that walks an mbuf
  * chain converting any unmapped mbufs to mapped mbufs.  It returns
  * the new chain of mapped mbufs on success.  On failure it frees
  * the original mbuf chain and returns NULL.
  */
 static void
 mb_unmapped_free_mext(struct mbuf *m)
 {
 
 	/* ext_arg1 holds the sf_buf that maps the page for this mbuf. */
 	sf_buf_free((struct sf_buf *)m->m_ext.ext_arg1);
 
 	/* ext_arg2 is the backing EXT_PGS mbuf; give its reference back. */
 	mb_free_ext((struct mbuf *)m->m_ext.ext_arg2);
 }
 
 /*
  * Convert the single EXT_PGS mbuf 'm' into a chain of mapped mbufs.
  * 'm' is always released with m_free() before returning.  Returns the
  * head of the new chain, or NULL if an mbuf or sf_buf allocation fails
  * (any partially built chain is freed in that case).
  */
 static struct mbuf *
 _mb_unmapped_to_ext(struct mbuf *m)
 {
 	struct mbuf *m_new, *top, *prev, *mref;
 	struct sf_buf *sf;
 	vm_page_t pg;
 	int i, len, off, pglen, pgoff, seglen, segoff;
 	volatile u_int *refcnt;
 	u_int ref_inc = 0;
 
 	MBUF_EXT_PGS_ASSERT(m);
 	len = m->m_len;
-	KASSERT(m->m_ext_pgs.tls == NULL, ("%s: can't convert TLS mbuf %p",
+	KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p",
 	    __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	top = NULL;
 	/*
 	 * Header: either it lies entirely before 'off' and is skipped, or
 	 * the remaining part of it is copied into a regular mbuf.
 	 */
-	if (m->m_ext_pgs.hdr_len != 0) {
-		if (off >= m->m_ext_pgs.hdr_len) {
-			off -= m->m_ext_pgs.hdr_len;
+	if (m->m_epg_hdrlen != 0) {
+		if (off >= m->m_epg_hdrlen) {
+			off -= m->m_epg_hdrlen;
 		} else {
-			seglen = m->m_ext_pgs.hdr_len - off;
+			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			m_new = m_get(M_NOWAIT, MT_DATA);
 			if (m_new == NULL)
 				goto fail;
 			m_new->m_len = seglen;
 			prev = top = m_new;
 			memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff],
 			    seglen);
 		}
 	}
 	/*
 	 * Pages: skip pages that lie entirely before 'off'; every other
 	 * page gets its own mbuf that maps the page through an sf_buf.
 	 */
-	pgoff = m->m_ext_pgs.first_pg_off;
-	for (i = 0; i < m->m_ext_pgs.npgs && len > 0; i++) {
+	pgoff = m->m_epg_1st_off;
+	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		m_new = m_get(M_NOWAIT, MT_DATA);
 		if (m_new == NULL)
 			goto fail;
 		if (top == NULL) {
 			top = prev = m_new;
 		} else {
 			prev->m_next = m_new;
 			prev = m_new;
 		}
 		/*
 		 * m_new is already linked into the chain, so if the sf_buf
 		 * allocation fails the fail path frees it together with
 		 * the rest of 'top'.
 		 */
 		sf = sf_buf_alloc(pg, SFB_NOWAIT);
 		if (sf == NULL)
 			goto fail;
 
 		/* Only counted here; 'refcnt' is adjusted once, after the loop. */
 		ref_inc++;
 		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
 		    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
 		m_new->m_data += segoff;
 		m_new->m_len = seglen;
 
 		pgoff = 0;
 	};
 	/* Trailer: whatever length remains is copied into a regular mbuf. */
 	if (len != 0) {
-		KASSERT((off + len) <= m->m_ext_pgs.trail_len,
+		KASSERT((off + len) <= m->m_epg_trllen,
 		    ("off + len > trail (%d + %d > %d)", off, len,
-		    m->m_ext_pgs.trail_len));
+		    m->m_epg_trllen));
 		m_new = m_get(M_NOWAIT, MT_DATA);
 		if (m_new == NULL)
 			goto fail;
 		if (top == NULL)
 			top = m_new;
 		else
 			prev->m_next = m_new;
 		m_new->m_len = len;
 		memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len);
 	}
 
 	if (ref_inc != 0) {
 		/*
 		 * Obtain an additional reference on the old mbuf for
 		 * each created EXT_SFBUF mbuf.  They will be dropped
 		 * in mb_unmapped_free_mext().
 		 */
 		/*
 		 * NOTE(review): the plain add is presumably safe because a
 		 * count of 1 means nobody else holds a reference - confirm.
 		 */
 		if (*refcnt == 1)
 			*refcnt += ref_inc;
 		else
 			atomic_add_int(refcnt, ref_inc);
 	}
 	/* Release the original mbuf 'm'. */
 	m_free(m);
 	return (top);
 
 fail:
 	if (ref_inc != 0) {
 		/*
 		 * Obtain an additional reference on the old mbuf for
 		 * each created EXT_SFBUF mbuf.  They will be
 		 * immediately dropped when these mbufs are freed
 		 * below.
 		 */
 		if (*refcnt == 1)
 			*refcnt += ref_inc;
 		else
 			atomic_add_int(refcnt, ref_inc);
 	}
 	/* 'm' is released on failure as well, then the partial chain. */
 	m_free(m);
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Walk the chain 'top' and replace every M_NOMAP mbuf with the chain of
  * mapped mbufs produced by _mb_unmapped_to_ext().
  *
  * Returns the head of the resulting chain.  If a conversion fails the
  * whole chain (converted prefix and unconverted remainder) is freed and
  * NULL is returned.
  */
 struct mbuf *
 mb_unmapped_to_ext(struct mbuf *top)
 {
 	struct mbuf *m, *mapped, *next, *prev;
 
 	prev = NULL;
 	for (m = top; m != NULL; m = next) {
 		/* m might be freed, so cache the next pointer. */
 		next = m->m_next;
 		if ((m->m_flags & M_NOMAP) == 0) {
 			/* Already mapped: keep it and advance the tail. */
 			if (prev != NULL)
 				prev->m_next = m;
 			prev = m;
 			continue;
 		}
 
 		if (prev != NULL) {
 			/*
 			 * Remove 'm' from the new chain so
 			 * that the 'top' chain terminates
 			 * before 'm' in case 'top' is freed
 			 * due to an error.
 			 */
 			prev->m_next = NULL;
 		}
 		mapped = _mb_unmapped_to_ext(m);
 		if (mapped == NULL) {
 			/*
 			 * _mb_unmapped_to_ext() has already freed 'm'.
 			 * When nothing precedes it (prev == NULL), 'top'
 			 * is that same freed mbuf and must not be freed
 			 * a second time; only the remainder is left.
 			 */
 			if (prev != NULL)
 				m_freem(top);
 			m_freem(next);
 			return (NULL);
 		}
 		if (prev == NULL)
 			top = mapped;
 		else
 			prev->m_next = mapped;
 
 		/*
 		 * Replaced one mbuf with a chain, so we must
 		 * find the end of chain.
 		 */
 		prev = m_last(mapped);
 	}
 	return (top);
 }
 
 /*
  * Allocate an empty EXT_PGS mbuf.  The ext_free routine is
  * responsible for freeing any pages backing this mbuf when it is
  * freed.
  */
 struct mbuf *
 mb_alloc_ext_pgs(int how, m_ext_free_t ext_free)
 {
 	struct mbuf *m;
 
 	/* 'how' is handed straight to m_get(); NULL means it failed. */
 	m = m_get(how, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	/* Start with no pages, no header/trailer, no TLS session or socket. */
-	m->m_ext_pgs.npgs = 0;
-	m->m_ext_pgs.nrdy = 0;
-	m->m_ext_pgs.first_pg_off = 0;
-	m->m_ext_pgs.last_pg_len = 0;
-	m->m_ext_pgs.flags = 0;
-	m->m_ext_pgs.hdr_len = 0;
-	m->m_ext_pgs.trail_len = 0;
-	m->m_ext_pgs.tls = NULL;
-	m->m_ext_pgs.so = NULL;
+	m->m_epg_npgs = 0;
+	m->m_epg_nrdy = 0;
+	m->m_epg_1st_off = 0;
+	m->m_epg_last_len = 0;
+	m->m_epg_flags = 0;
+	m->m_epg_hdrlen = 0;
+	m->m_epg_trllen = 0;
+	m->m_epg_tls = NULL;
+	m->m_epg_so = NULL;
 	/*
 	 * m_data starts out as NULL; for these mbufs it is read back as
 	 * an offset (see the mtod() use in _mb_unmapped_to_ext()).
 	 */
 	m->m_data = NULL;
 	m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
 	m->m_ext.ext_type = EXT_PGS;
 	/* The reference count is embedded in this mbuf and starts at one. */
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_count = 1;
 	m->m_ext.ext_size = 0;
 	/* Invoked by mb_free_ext() when the last reference is dropped. */
 	m->m_ext.ext_free = ext_free;
 	return (m);
 }
 
 /*
  * Clean up after mbufs with M_EXT storage attached to them if the
  * reference count hits 1.
  */
 void
 mb_free_ext(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
 	int freembuf;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		/* Recover the mbuf that embeds the shared counter. */
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/*
 	 * Check if the header is embedded in the cluster.  It is
 	 * important that we can't touch any of the mbuf fields
 	 * after we have freed the external storage, since mbuf
 	 * could have been embedded in it.  For now, the mbufs
 	 * embedded into the cluster are always of type EXT_EXTREF,
 	 * and for this type we won't free the mref.
 	 */
 	if (m->m_flags & M_NOFREE) {
 		freembuf = 0;
 		KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
 		    m->m_ext.ext_type == EXT_RXRING,
 		    ("%s: no-free mbuf %p has wrong type", __func__, m));
 	} else
 		freembuf = 1;
 
 	/* Free attached storage if this mbuf is the only reference to it. */
 	/*
 	 * Either the count already reads 1, or our atomic decrement is the
 	 * one that took it from 1 to 0 (fetchadd returns the old value).
 	 */
 	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
 		switch (m->m_ext.ext_type) {
 		case EXT_PACKET:
 			/* The packet zone is special. */
 			/* Put the count back to 1 before returning mref to the zone. */
 			if (*refcnt == 0)
 				*refcnt = 1;
 			uma_zfree(zone_pack, mref);
 			break;
 		case EXT_CLUSTER:
 			uma_zfree(zone_clust, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_JUMBOP:
 			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_JUMBO9:
 			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_PGS: {
 #ifdef KERN_TLS
 			struct ktls_session *tls;
 #endif
 
 			KASSERT(mref->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			mref->m_ext.ext_free(mref);
 #ifdef KERN_TLS
 			/*
 			 * If the TLS session reference could not be dropped
 			 * here, mref is handed to ktls_enqueue_to_free()
 			 * instead of being returned to the mbuf zone.
 			 * NOTE(review): presumably this is the case where
 			 * ours is the last session reference - confirm.
 			 */
-			tls = mref->m_ext_pgs.tls;
+			tls = mref->m_epg_tls;
 			if (tls != NULL &&
 			    !refcount_release_if_not_last(&tls->refcount))
 				ktls_enqueue_to_free(mref);
 			else
 #endif
 				uma_zfree(zone_mbuf, mref);
 			break;
 		}
 		case EXT_SFBUF:
 		case EXT_NET_DRV:
 		case EXT_MOD_TYPE:
 		case EXT_DISPOSABLE:
 			KASSERT(mref->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			mref->m_ext.ext_free(mref);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_EXTREF:
 			/* Only the storage is released; no mbuf is freed here. */
 			KASSERT(m->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			m->m_ext.ext_free(m);
 			break;
 		case EXT_RXRING:
 			KASSERT(m->m_ext.ext_free == NULL,
 			    ("%s: ext_free is set", __func__));
 			break;
 		default:
 			KASSERT(m->m_ext.ext_type == 0,
 			    ("%s: unknown ext_type", __func__));
 		}
 	}
 
 	/*
 	 * 'm' itself is freed here only when it is not the refcount holder
 	 * (that one is dealt with above) and is not marked M_NOFREE.
 	 */
 	if (freembuf && m != mref)
 		uma_zfree(zone_mbuf, m);
 }
 
 /*
  * Official mbuf(9) allocation KPI for stack and drivers:
  *
  * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
  * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
  * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
  * m_clget()	- attach cluster to already allocated mbuf.
  * m_cljget()	- attach jumbo cluster to already allocated mbuf.
  * m_get2()	- allocate minimum mbuf that would fit size argument.
  * m_getm2()	- allocate a chain of mbufs/clusters.
  * m_extadd()	- attach external cluster to mbuf.
  *
  * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
  * m_freem()	- free chain of mbufs.
  */
 
 /*
  * Attach a cluster from zone_clust to 'm', which must not already have
  * external storage.  Returns non-zero (M_EXT) if the mbuf ends up with
  * a cluster and 0 otherwise.
  */
 int
 m_clget(struct mbuf *m, int how)
 {
 
 	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 	    __func__, m));
 
 	/* A still-NULL ext_buf after the allocation call marks a failure. */
 	m->m_ext.ext_buf = (char *)NULL;
 	uma_zalloc_arg(zone_clust, m, how);
 	if (m->m_ext.ext_buf == NULL) {
 		if ((how & M_NOWAIT) != 0) {
 			/*
 			 * Draining the packet zone may loosen up a few
 			 * clusters, so reclaim and retry once.
 			 */
 			uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 			uma_zalloc_arg(zone_clust, m, how);
 		}
 	}
 	MBUF_PROBE2(m__clget, m, how);
 	return (m->m_flags & M_EXT);
 }
 
 /*
  * m_cljget() is different from m_clget() as it can allocate clusters without
  * attaching them to an mbuf.  In that case the return value is the pointer
  * to the cluster of the requested size.  If an mbuf was specified, it gets
  * the cluster attached to it and the return value can be safely ignored.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 void *
 m_cljget(struct mbuf *m, int how, int size)
 {
 	void *cl;
 
 	/* With an mbuf supplied, it must not carry external storage yet. */
 	if (m != NULL) {
 		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 		    __func__, m));
 		m->m_ext.ext_buf = NULL;
 	}
 
 	/* Allocate from the zone that matches the requested size. */
 	cl = uma_zalloc_arg(m_getzone(size), m, how);
 
 	MBUF_PROBE4(m__cljget, m, how, size, cl);
 
 	return (cl);
 }
 
 /*
  * m_get2() allocates minimum mbuf that would fit "size" argument.
  */
 struct mbuf *
 m_get2(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *m;
 
 	args.type = type;
 	args.flags = flags;
 
 	/* Small enough for the mbuf's own storage. */
 	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
 		return (uma_zalloc_arg(zone_mbuf, &args, how));
 
 	/* Fits a standard cluster: take an mbuf+cluster from the packet zone. */
 	if (size <= MCLBYTES)
 		return (uma_zalloc_arg(zone_pack, &args, how));
 
 	/* Anything above a page-sized jumbo cluster is refused. */
 	if (size > MJUMPAGESIZE)
 		return (NULL);
 
 	/* Plain mbuf plus a page-sized jumbo cluster attached to it. */
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m != NULL && uma_zalloc_arg(zone_jumbop, m, how) == NULL) {
 		uma_zfree(zone_mbuf, m);
 		m = NULL;
 	}
 
 	return (m);
 }
 
 /*
  * m_getjcl() returns an mbuf with a cluster of the specified size attached.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 struct mbuf *
 m_getjcl(int how, short type, int flags, int size)
 {
 	struct mb_args args;
 	struct mbuf *m;
 
 	/* Standard clusters are served by m_getcl(). */
 	if (size == MCLBYTES)
 		return m_getcl(how, type, flags);
 
 	args.type = type;
 	args.flags = flags;
 
 	/* Get the mbuf first, then attach a cluster of the requested size. */
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m != NULL && uma_zalloc_arg(m_getzone(size), m, how) == NULL) {
 		/* No cluster available: give the mbuf back as well. */
 		uma_zfree(zone_mbuf, m);
 		m = NULL;
 	}
 	return (m);
 }
 
 /*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
  * to the existing one and return a pointer to the provided mbuf.
  */
 /*
  * Arguments:
  *    m      Optional existing chain to append to (may be NULL).
  *    len    Number of bytes of storage to allocate; must be >= 0.
  *    how    Allocation flags handed to the underlying allocators.
  *    type   mbuf type for the new mbufs.
  *    flags  Only M_PKTHDR and M_EOR are honoured.
  *
  * Returns:
  *    'm' if it was supplied, otherwise the head of the new chain; NULL
  *    if an allocation failed (the new mbufs are freed, 'm' is left
  *    untouched) or if nothing was allocated and no 'm' was supplied.
  */
 struct mbuf *
 m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
 	struct mbuf *mb, *nm = NULL, *mtail = NULL;
 
 	KASSERT(len >= 0, ("%s: len is < 0", __func__));
 
 	/* Validate flags. */
 	flags &= (M_PKTHDR | M_EOR);
 
 	/* Packet header mbuf must be first in chain. */
 	if ((flags & M_PKTHDR) && m != NULL)
 		flags &= ~M_PKTHDR;
 
 	/* Loop and append maximum sized mbufs to the chain tail. */
 	while (len > 0) {
 		if (len > MCLBYTES)
 			mb = m_getjcl(how, type, (flags & M_PKTHDR),
 			    MJUMPAGESIZE);
 		else if (len >= MINCLSIZE)
 			mb = m_getcl(how, type, (flags & M_PKTHDR));
 		else if (flags & M_PKTHDR)
 			mb = m_gethdr(how, type);
 		else
 			mb = m_get(how, type);
 
 		/* Fail the whole operation if one mbuf can't be allocated. */
 		if (mb == NULL) {
 			if (nm != NULL)
 				m_freem(nm);
 			return (NULL);
 		}
 
 		/* Book keeping. */
 		len -= M_SIZE(mb);
 		if (mtail != NULL)
 			mtail->m_next = mb;
 		else
 			nm = mb;
 		mtail = mb;
 		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
 	}
 	/*
 	 * Only valid on the last mbuf.  With len == 0 nothing was
 	 * allocated and mtail is still NULL, so there is no mbuf to mark.
 	 */
 	if ((flags & M_EOR) && mtail != NULL)
 		mtail->m_flags |= M_EOR;
 
 	/* If mbuf was supplied, append new chain to the end of it. */
 	if (m != NULL) {
 		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
 			;
 		mtail->m_next = nm;
 		mtail->m_flags &= ~M_EOR;
 	} else
 		m = nm;
 
 	return (m);
 }
 
 /*-
  * Configure a provided mbuf to refer to the provided external storage
  * buffer and setup a reference count for said buffer.
  *
  * Arguments:
  *    mb     The existing mbuf to which to attach the provided buffer.
  *    buf    The address of the provided external storage buffer.
  *    size   The size of the provided buffer.
  *    freef  A pointer to a routine that is responsible for freeing the
  *           provided external storage buffer.
  *    arg1   First opaque argument (of any type), stored in the mbuf for
  *           use by the provided freef routine (may be NULL).
  *    arg2   Second opaque argument, stored likewise (may be NULL).
  *    flags  Any other flags to be passed to the provided mbuf.
  *    type   The type that the external storage buffer should be
  *           labeled with.
  *
  * Returns:
  *    Nothing.
  */
 void
 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
     void *arg1, void *arg2, int flags, int type)
 {
 
 	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
 	/* Describe the external buffer and how it is to be released. */
 	mb->m_ext.ext_type = type;
 	mb->m_ext.ext_buf = buf;
 	mb->m_ext.ext_size = size;
 	mb->m_ext.ext_free = freef;
 	mb->m_ext.ext_arg1 = arg1;
 	mb->m_ext.ext_arg2 = arg2;
 
 	/* Data now starts at the beginning of the external buffer. */
 	mb->m_data = buf;
 	mb->m_flags |= (M_EXT | flags);
 
 	/*
 	 * Every type except EXT_EXTREF gets a reference count embedded
 	 * in this mbuf, starting at one.
 	 */
 	if (type == EXT_EXTREF)
 		mb->m_ext.ext_flags = 0;
 	else {
 		mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
 		mb->m_ext.ext_count = 1;
 	}
 }
 
 /*
  * Free an entire chain of mbufs and associated external buffers, if
  * applicable.
  */
 void
 m_freem(struct mbuf *mb)
 {
 
 	MBUF_PROBE1(m__freem, mb);
 	/* m_free() hands back the next mbuf, so step until the chain ends. */
 	for (; mb != NULL; mb = m_free(mb))
 		continue;
 }
 
 /*
  * Initialize the send tag 'mst' for interface 'ifp': take a reference
  * on the interface, start the tag's own refcount at one and account
  * for the tag in snd_tag_count.
  */
 void
 m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp)
 {
 
 	if_ref(ifp);
 	refcount_init(&mst->refcount, 1);
 	mst->ifp = ifp;
 	counter_u64_add(snd_tag_count, 1);
 }
 
 /*
  * Tear down the send tag 'mst': hand it to the interface's
  * if_snd_tag_free method, drop the interface reference and update
  * snd_tag_count.
  */
 void
 m_snd_tag_destroy(struct m_snd_tag *mst)
 {
 	struct ifnet *owner;
 
 	/* Read the interface first; 'mst' is not touched after the callback. */
 	owner = mst->ifp;
 	owner->if_snd_tag_free(mst);
 	if_rele(owner);
 	counter_u64_add(snd_tag_count, -1);
 }
Index: head/sys/kern/kern_sendfile.c
===================================================================
--- head/sys/kern/kern_sendfile.c	(revision 360578)
+++ head/sys/kern/kern_sendfile.c	(revision 360579)
@@ -1,1364 +1,1364 @@
 /*-
  * Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/ktls.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory");
 
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
 #define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
 
 /*
  * Structure describing a single sendfile(2) I/O, which may consist of
  * several underlying pager I/Os.
  *
  * The syscall context allocates the structure and initializes 'nios'
  * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
  * paging operations, it increments 'nios'.
  *
  * Every I/O completion calls sendfile_iodone(), which decrements the 'nios',
  * and the syscall also calls sendfile_iodone() after allocating all mbufs,
  * linking them and sending to socket.  Whoever reaches zero 'nios' is
  * responsible for calling pru_ready on the socket, to notify it of the
  * readiness of the data.
  */
 struct sf_io {
 	volatile u_int	nios;		/* outstanding I/Os, plus one for the syscall */
 	u_int		error;		/* error recorded by sendfile_iodone() */
 	int		npages;		/* count handed to pru_ready/mb_free_notready */
 	struct socket	*so;		/* socket the data is sent on */
 	struct mbuf	*m;		/* mbufs of this I/O; NULL if nothing was sent */
 	vm_object_t	obj;		/* VM object the pages belong to */
 	vm_pindex_t	pindex0;	/* page index that pa[0] corresponds to */
 #ifdef KERN_TLS
 	struct ktls_session *tls;	/* TLS session of the mbufs, or NULL */
 #endif
 	vm_page_t	pa[];		/* pages of this I/O, indexed from pindex0 */
 };
 
 /*
  * Structure used to track requests with SF_SYNC flag.
  */
 struct sendfile_sync {
 	struct mtx	mtx;		/* protects 'count' in sendfile_sync_signal() */
 	struct cv	cv;		/* signalled when 'count' reaches zero */
 	unsigned	count;		/* outstanding references; dropped per signal */
 	bool		waiting;	/* false once the waiter was interrupted */
 };
 
 /*
  * Release a sendfile_sync structure.  The structure must no longer be
  * referenced (count == 0).
  */
 static void
 sendfile_sync_destroy(struct sendfile_sync *sfs)
 {
 	KASSERT(sfs->count == 0, ("sendfile sync %p still busy", sfs));
 
 	/* Tear down the synchronization primitives before freeing the memory. */
 	cv_destroy(&sfs->cv);
 	mtx_destroy(&sfs->mtx);
 	free(sfs, M_SENDFILE);
 }
 
 /*
  * Drop one reference on 'sfs'.  When the last one goes away, either
  * wake up the waiter or, if there is none any more, destroy 'sfs'.
  */
 static void
 sendfile_sync_signal(struct sendfile_sync *sfs)
 {
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count > 0, ("sendfile sync %p not busy", sfs));
 	sfs->count--;
 	if (sfs->count != 0) {
 		/* Other references remain; nothing else to do. */
 		mtx_unlock(&sfs->mtx);
 		return;
 	}
 	if (sfs->waiting) {
 		cv_signal(&sfs->cv);
 		mtx_unlock(&sfs->mtx);
 		return;
 	}
 	/*
 	 * The sendfile() waiter was interrupted by a signal.  The mutex
 	 * is destroyed along with 'sfs', so it is not unlocked here.
 	 */
 	sendfile_sync_destroy(sfs);
 }
 
 /* One counter per uint64_t-sized slot of struct sfstat. */
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
 /* Allocate the sfstat counters; registered through SYSINIT below. */
 static void
 sfstat_init(const void *unused)
 {
 
 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 	    M_WAITOK);
 }
 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 
 /*
  * Handler for the kern.ipc.sfstat sysctl: copy the counters out as a
  * struct sfstat.
  */
 static int
 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct sfstat s;
 
 	/* Snapshot first, so the values copied out predate any reset. */
 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 	/* A request that carries new data resets the counters to zero. */
 	if (req->newptr)
 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 	return (SYSCTL_OUT(req, &s, sizeof(s)));
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0,
     sfstat_sysctl, "I",
     "sendfile statistics");
 
 /*
  * ext_free handler for sendfile's EXT_SFBUF mbufs: release the sf_buf
  * and its page, then signal the sync structure if one is attached.
  */
 static void
 sendfile_free_mext(struct mbuf *m)
 {
 	struct sendfile_sync *sync;
 	struct sf_buf *mapping;
 	vm_page_t page;
 	int rflags;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF,
 	    ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m));
 
 	/* Fetch the page before the sf_buf that maps it goes away. */
 	mapping = m->m_ext.ext_arg1;
 	page = sf_buf_page(mapping);
 
 	/* EXT_FLAG_NOCACHE selects VPR_TRYFREE for the page release. */
 	rflags = 0;
 	if ((m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0)
 		rflags = VPR_TRYFREE;
 
 	sf_buf_free(mapping);
 	vm_page_release(page, rflags);
 
 	if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
 		sync = m->m_ext.ext_arg2;
 		sendfile_sync_signal(sync);
 	}
 }
 
 /*
  * ext_free handler for sendfile's EXT_PGS mbufs: release every page the
  * mbuf references, then signal the sync structure if one is attached.
  */
 static void
 sendfile_free_mext_pg(struct mbuf *m)
 {
 	vm_page_t pg;
 	int flags, i;
 	bool cache_last;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS,
 	    ("%s: m %p !M_EXT or !EXT_PGS", __func__, m));
 
 	cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST;
 	flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0;
 
 	/*
 	 * With EXT_FLAG_CACHE_LAST the final page is released with flags
 	 * of 0, i.e. without VPR_TRYFREE, whatever EXT_FLAG_NOCACHE says.
 	 */
-	for (i = 0; i < m->m_ext_pgs.npgs; i++) {
-		if (cache_last && i == m->m_ext_pgs.npgs - 1)
+	for (i = 0; i < m->m_epg_npgs; i++) {
+		if (cache_last && i == m->m_epg_npgs - 1)
 			flags = 0;
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		vm_page_release(pg, flags);
 	}
 
 	/* Here the sync structure is carried in ext_arg1. */
 	if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
 		struct sendfile_sync *sfs = m->m_ext.ext_arg1;
 		sendfile_sync_signal(sfs);
 	}
 }
 
 /*
  * Helper function to calculate how much data to put into page i of n.
  * Only first and last pages are special.
  */
 static inline off_t
 xfsize(int i, int n, off_t off, off_t len)
 {
 	off_t tail;
 
 	/* First page: from 'off' to the end of the page, capped by 'len'. */
 	if (i == 0)
 		return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
 
 	/* Pages in the middle are always full. */
 	if (i != n - 1)
 		return (PAGE_SIZE);
 
 	/* Last page: partial unless the request ends on a page boundary. */
 	tail = (off + len) & PAGE_MASK;
 	if (tail > 0)
 		return (tail);
 
 	return (PAGE_SIZE);
 }
 
 /*
  * Helper function to get offset within object for i page.
  */
 static inline vm_ooffset_t
 vmoff(int i, off_t off)
 {
 
 	/* Pages after the first start on a page boundary. */
 	if (i != 0)
 		return (trunc_page(off + i * PAGE_SIZE));
 
 	/* The first page starts at 'off' itself. */
 	return ((vm_ooffset_t)off);
 }
 
 /*
  * Helper function used when allocation of a page or sf_buf failed.
  * Pretend as if we don't have enough space, subtract xfsize() of
  * all pages that failed.
  */
 static inline void
 fixspace(int old, int new, off_t off, int *space)
 {
 	int first, last;
 
 	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
 
 	first = new;
 	last = old - 1;
 
 	/* Take off the share of the last page. */
 	*space -= xfsize(last, old, off, *space);
 
 	/* Only a single page failed; done. */
 	if (first == last)
 		return;
 
 	/* Take off the share of the first page when it failed too. */
 	if (first == 0) {
 		*space -= xfsize(0, last, off, *space);
 		first = 1;
 	}
 
 	/* Whatever lies in between counts as full pages. */
 	*space -= (last - first) * PAGE_SIZE;
 
 	KASSERT(*space >= 0, ("%s: space went backwards", __func__));
 }
 
 /*
  * Wait for all in-flight ios to complete, we must not unwire pages
  * under them.
  */
 static void
 sendfile_iowait(struct sf_io *sfio, const char *wmesg)
 {
 
 	/* Poll one tick at a time until 'nios' reads exactly 1. */
 	for (;;) {
 		if (atomic_load_int(&sfio->nios) == 1)
 			break;
 		pause(wmesg, 1);
 	}
 }
 
 /*
  * I/O completion callback.
  */
 static void
 sendfile_iodone(void *arg, vm_page_t *pa, int count, int error)
 {
 	struct sf_io *sfio = arg;
 	struct socket *so;
 	int i;
 
 	if (error != 0) {
 		sfio->error = error;
 		/*
 		 * Restore of the pg[] elements is done by
 		 * sendfile_swapin().
 		 */
 	} else {
 		/*
 		 * Restore the valid page pointers.  They are already
 		 * unbusied, but still wired.  For error != 0 case,
 		 * sendfile_swapin() handles unbusy.
 		 *
 		 * XXXKIB since pages are only wired, and we do not
 		 * own the object lock, other users might have
 		 * invalidated them in meantime.  Similarly, after we
 		 * unbusied the swapped-in pages, they can become
 		 * invalid under us.
 		 */
 		MPASS(count == 0 || pa[0] != bogus_page);
 		for (i = 0; i < count; i++) {
 			if (pa[i] == bogus_page) {
 				/*
 				 * Look the real page up again and store it
 				 * both in this run and in sfio->pa[].
 				 */
 				sfio->pa[(pa[0]->pindex - sfio->pindex0) + i] =
 				    pa[i] = vm_page_relookup(sfio->obj,
 				    pa[0]->pindex + i);
 				KASSERT(pa[i] != NULL,
 				    ("%s: page %p[%d] disappeared",
 				    __func__, pa, i));
 			} else {
 				vm_page_xunbusy_unchecked(pa[i]);
 			}
 		}
 	}
 
 	/* Only the caller that drops the last 'nios' reference goes on. */
 	if (!refcount_release(&sfio->nios))
 		return;
 
 #ifdef INVARIANTS
 	/*
 	 * NOTE(review): the loop starts at 1, so pa[0] is never checked
 	 * for being wired and the 'i == 0' test below can never be true -
 	 * confirm whether starting at 0 was intended.
 	 */
 	for (i = 1; i < sfio->npages; i++) {
 		if (sfio->pa[i] == NULL)
 			break;
 		KASSERT(vm_page_wired(sfio->pa[i]),
 		    ("sfio %p page %d %p not wired", sfio, i, sfio->pa[i]));
 		if (i == 0)
 			continue;
 		KASSERT(sfio->pa[0]->object == sfio->pa[i]->object,
 		    ("sfio %p page %d %p wrong owner %p %p", sfio, i,
 		    sfio->pa[i], sfio->pa[0]->object, sfio->pa[i]->object));
 		KASSERT(sfio->pa[0]->pindex + i == sfio->pa[i]->pindex,
 		    ("sfio %p page %d %p wrong index %jx %jx", sfio, i,
 		    sfio->pa[i], (uintmax_t)sfio->pa[0]->pindex,
 		    (uintmax_t)sfio->pa[i]->pindex));
 	}
 #endif
 
 	vm_object_pip_wakeup(sfio->obj);
 
 	if (sfio->m == NULL) {
 		/*
 		 * Either I/O operation failed, or we failed to allocate
 		 * buffers, or we bailed out on first busy page, or we
 		 * succeeded filling the request without any I/Os. Anyway,
 		 * pru_send hadn't been executed - nothing had been sent
 		 * to the socket yet.
 		 */
 		MPASS((curthread->td_pflags & TDP_KTHREAD) == 0);
 		free(sfio, M_SENDFILE);
 		return;
 	}
 
 #if defined(KERN_TLS) && defined(INVARIANTS)
 	if ((sfio->m->m_flags & M_EXT) != 0 &&
 	    sfio->m->m_ext.ext_type == EXT_PGS)
-		KASSERT(sfio->tls == sfio->m->m_ext_pgs.tls,
+		KASSERT(sfio->tls == sfio->m->m_epg_tls,
 		    ("TLS session mismatch"));
 	else
 		KASSERT(sfio->tls == NULL,
 		    ("non-ext_pgs mbuf with TLS session"));
 #endif
 	so = sfio->so;
 	CURVNET_SET(so->so_vnet);
 	if (__predict_false(sfio->error)) {
 		/*
 		 * I/O operation failed.  The state of data in the socket
 		 * is now inconsistent, and all what we can do is to tear
 		 * it down. Protocol abort method would tear down protocol
 		 * state, free all ready mbufs and detach not ready ones.
 		 * We will free the mbufs corresponding to this I/O manually.
 		 *
 		 * The socket would be marked with EIO and made available
 		 * for read, so that application receives EIO on next
 		 * syscall and eventually closes the socket.
 		 */
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 
 		mb_free_notready(sfio->m, sfio->npages);
 #ifdef KERN_TLS
 	} else if (sfio->tls != NULL && sfio->tls->mode == TCP_TLS_MODE_SW) {
 		/*
 		 * I/O operation is complete, but we still need to
 		 * encrypt.  We cannot do this in the interrupt thread
 		 * of the disk controller, so forward the mbufs to a
 		 * different thread.
 		 *
 		 * Donate the socket reference from sfio rather
 		 * than explicitly invoking soref().
 		 */
 		ktls_enqueue(sfio->m, so, sfio->npages);
 		goto out_with_ref;
 #endif
 	} else
 		(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
 		    sfio->npages);
 
 	/* Drop the socket reference; the TLS path above jumps past this. */
 	SOCK_LOCK(so);
 	sorele(so);
 #ifdef KERN_TLS
 out_with_ref:
 #endif
 	CURVNET_RESTORE();
 	free(sfio, M_SENDFILE);
 }
 
 /*
  * Iterate through pages vector and request paging for non-valid pages.
  */
 static int
 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
     off_t len, int npages, int rhpages, int flags)
 {
 	vm_page_t *pa;
 	int a, count, count1, grabbed, i, j, rv;
 
 	pa = sfio->pa;
 	*nios = 0;
 	/* From here on 'flags' only carries the page allocation flag. */
 	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
 	sfio->pindex0 = OFF_TO_IDX(off);
 
 	/*
 	 * First grab all the pages and wire them.  Note that we grab
 	 * only required pages.  Readahead pages are dealt with later.
 	 */
 	grabbed = vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(off),
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages);
 	if (grabbed < npages) {
 		/* Short grab: clear the unused slots and drop readahead. */
 		for (int i = grabbed; i < npages; i++)
 			pa[i] = NULL;
 		npages = grabbed;
 		rhpages = 0;
 	}
 
 	for (i = 0; i < npages;) {
 		/* Skip valid pages. */
 		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
 		    xfsize(i, npages, off, len))) {
 			vm_page_xunbusy(pa[i]);
 			SFSTAT_INC(sf_pages_valid);
 			i++;
 			continue;
 		}
 
 		/*
 		 * Next page is invalid.  Check if it belongs to pager.  It
 		 * may not be there, which is a regular situation for shmem
 		 * pager.  For vnode pager this happens only in case of
 		 * a sparse file.
 		 *
 		 * Important feature of vm_pager_has_page() is the hint
 		 * stored in 'a', about how many pages we can pagein after
 		 * this page in a single I/O.
 		 */
 		VM_OBJECT_RLOCK(obj);
 		if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL,
 		    &a)) {
 			VM_OBJECT_RUNLOCK(obj);
 			/* The pager has no such page: supply a zeroed one. */
 			pmap_zero_page(pa[i]);
 			vm_page_valid(pa[i]);
 			MPASS(pa[i]->dirty == 0);
 			vm_page_xunbusy(pa[i]);
 			i++;
 			continue;
 		}
 		VM_OBJECT_RUNLOCK(obj);
 
 		/*
 		 * We want to pagein as many pages as possible, limited only
 		 * by the 'a' hint and actual request.
 		 */
 		count = min(a + 1, npages - i);
 
 		/*
 		 * We should not pagein into a valid page because
 		 * there might be still unfinished write tracked by
 		 * e.g. a buffer, thus we substitute any valid pages
 		 * with the bogus one.
 		 *
 		 * We must not leave around xbusy pages which are not
 		 * part of the run passed to vm_pager_getpages(),
 		 * otherwise pager might deadlock waiting for the busy
 		 * status of the page, e.g. if it constitutes the
 		 * buffer needed to validate other page.
 		 *
 		 * First trim the end of the run consisting of the
 		 * valid pages, then replace the rest of the valid
 		 * with bogus.
 		 */
 		/* count1 keeps the untrimmed run length for the final advance. */
 		count1 = count;
 		for (j = i + count - 1; j > i; j--) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				count--;
 			} else {
 				break;
 			}
 		}
 
 		/*
 		 * The last page in the run pa[i + count - 1] is
 		 * guaranteed to be invalid by the trim above, so it
 		 * is not replaced with bogus, thus -1 in the loop end
 		 * condition.
 		 */
 		MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL);
 		for (j = i + 1; j < i + count - 1; j++) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				SFSTAT_INC(sf_pages_bogus);
 				pa[j] = bogus_page;
 			}
 		}
 
 		/* Take the I/O reference before the request is issued. */
 		refcount_acquire(&sfio->nios);
 		/* Readahead is requested only with the run that ends the range. */
 		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
 		    i + count == npages ? &rhpages : NULL,
 		    &sendfile_iodone, sfio);
 		if (__predict_false(rv != VM_PAGER_OK)) {
 			/* Wait until sfio->nios is back to 1 before unwiring. */
 			sendfile_iowait(sfio, "sferrio");
 
 			/*
 			 * Perform full pages recovery before returning EIO.
 			 * Pages from 0 to npages are wired.
 			 * Pages from (i + 1) to (i + count - 1) may be
 			 * substituted to bogus page, and not busied.
 			 * Pages from (i + count) to (i + count1 - 1) are
 			 * not busied.
 			 * Rest of the pages from i to npages are busied.
 			 */
 			for (j = 0; j < npages; j++) {
 				if (j >= i + count && j < i + count1)
 					;
 				else if (j > i && j < i + count - 1 &&
 				    pa[j] == bogus_page)
 					pa[j] = vm_page_relookup(obj,
 					    OFF_TO_IDX(vmoff(j, off)));
 				else if (j >= i)
 					vm_page_xunbusy(pa[j]);
 				KASSERT(pa[j] != NULL && pa[j] != bogus_page,
 				    ("%s: page %p[%d] I/O recovery failure",
 				    __func__, pa, j));
 				vm_page_unwire(pa[j], PQ_INACTIVE);
 			}
 			return (EIO);
 		}
 
 		SFSTAT_INC(sf_iocnt);
 		SFSTAT_ADD(sf_pages_read, count);
 		if (i + count == npages)
 			SFSTAT_ADD(sf_rhpages_read, rhpages);
 
 		/* Step over the whole run, including the trimmed valid tail. */
 		i += count1;
 		(*nios)++;
 	}
 
 	/* Every page was satisfied without starting any I/O. */
 	if (*nios == 0 && npages != 0)
 		SFSTAT_INC(sf_noiocnt);
 
 	return (0);
 }
 
 /*
  * Resolve the file 'fp' into the VM object that backs it.
  *
  * Only regular-file vnodes and POSIX shared memory descriptors are
  * accepted; any other descriptor type yields EINVAL.  On success the
  * object is returned in *obj_res with an extra reference held, *vp_res
  * or *shmfd_res identifies the source (the other one stays NULL),
  * *obj_size holds its current size and *bsize the filesystem I/O size
  * (0 for shared memory).  A dead object is reported as EBADF.
  */
 static int
 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
     int *bsize)
 {
 	struct vattr attrs;
 	struct shmfd *shm;
 	struct vnode *vn;
 	vm_object_t object;
 	int error;
 
 	*vp_res = NULL;
 	*shmfd_res = NULL;
 	*bsize = 0;
 	vn = NULL;
 	shm = NULL;
 	object = NULL;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		/* The vnode stays locked until the common exit below. */
 		vn = fp->f_vnode;
 		vn_lock(vn, LK_SHARED | LK_RETRY);
 		if (vn->v_type != VREG) {
 			error = EINVAL;
 			goto unlock;
 		}
 		*bsize = vn->v_mount->mnt_stat.f_iosize;
 		error = VOP_GETATTR(vn, &attrs, td->td_ucred);
 		if (error != 0)
 			goto unlock;
 		*obj_size = attrs.va_size;
 		object = vn->v_object;
 		if (object == NULL) {
 			error = EINVAL;
 			goto unlock;
 		}
 		break;
 	case DTYPE_SHM:
 		shm = fp->f_data;
 		object = shm->shm_object;
 		*obj_size = shm->shm_size;
 		error = 0;
 		break;
 	default:
 		/* Neither a vnode nor shared memory; nothing is locked. */
 		return (EINVAL);
 	}
 
 	VM_OBJECT_WLOCK(object);
 	if ((object->flags & OBJ_DEAD) == 0) {
 		/*
 		 * Take a temporary reference so that a forced reclamation
 		 * of the vnode cannot destroy the object under the caller.
 		 */
 		vm_object_reference_locked(object);
 		VM_OBJECT_WUNLOCK(object);
 		*obj_res = object;
 		*vp_res = vn;
 		*shmfd_res = shm;
 	} else {
 		VM_OBJECT_WUNLOCK(object);
 		error = EBADF;
 	}
 
 unlock:
 	if (vn != NULL)
 		VOP_UNLOCK(vn);
 	return (error);
 }
 
 static int
 sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
     struct socket **so)
 {
 	int error;
 
 	*sock_fp = NULL;
 	*so = NULL;
 
 	/*
 	 * The socket must be a stream socket and connected.
 	 */
 	error = getsock_cap(td, s, &cap_send_rights,
 	    sock_fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	*so = (*sock_fp)->f_data;
 	if ((*so)->so_type != SOCK_STREAM)
 		return (EINVAL);
 	/*
 	 * SCTP one-to-one style sockets currently don't work with
 	 * sendfile(). So indicate EINVAL for now.
 	 */
 	if ((*so)->so_proto->pr_protocol == IPPROTO_SCTP)
 		return (EINVAL);
 	if (SOLISTENING(*so))
 		return (ENOTCONN);
 	return (0);
 }
 
 /*
  * Send the contents of the file behind 'fp' to the socket 'sockfd'.
  *
  * Starting at 'offset', up to 'nbytes' bytes of the backing VM object are
  * queued on the socket's send buffer; nbytes == 0 means "until the end of
  * the object".  Optional headers (hdr_uio) are prepended to the first
  * batch of file data, and optional trailers (trl_uio) are written with
  * kern_writev() after the file data has been handed to the protocol.
  *
  * 'flags' is consulted for SF_SYNC, SF_NOCACHE, SF_USER_READAHEAD and the
  * readahead count extracted by SF_READAHEAD().
  *
  * If 'sent' is not NULL it receives the number of bytes accounted as sent,
  * on the success path as well as on the error paths that reach 'out'.
  * Returns 0 or an errno value; ERESTART is reported as EINTR.
  */
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj;
 	vm_page_t pga;
 	struct socket *so;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
 	struct shmfd *shmfd;
 	struct sendfile_sync *sfs;
 	struct vattr va;
 	off_t off, sbytes, rem, obj_size;
 	int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
 #ifdef KERN_TLS
 	int tls_enq_cnt;
 #endif
 	bool use_ext_pgs;
 
 	obj = NULL;
 	so = NULL;
 	m = mh = NULL;
 	sfs = NULL;
 #ifdef KERN_TLS
 	tls = NULL;
 #endif
 	hdrlen = sbytes = 0;
 	softerr = 0;
 	use_ext_pgs = false;
 
 	/* On success this leaves an extra reference on 'obj'; see 'out'. */
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * sendfile_getsock() may fail after it has already stored the
 	 * socket file in sock_fp; the 'out' path drops it whenever 'so'
 	 * is set.
 	 */
 	error = sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	SFSTAT_INC(sf_syscalls);
 	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
 
 	if (flags & SF_SYNC) {
 		sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
 		sfs->waiting = true;
 	}
 
 	/* Bytes left to send: to the end of the object, capped by nbytes. */
 	rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
 
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
 	 * XXXRW: Historically this has assumed non-interruptibility, so now
 	 * we implement that, but possibly shouldn't.
 	 */
 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
 #ifdef KERN_TLS
 	tls = ktls_hold(so->so_snd.sb_tls_info);
 #endif
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; rem > 0; ) {
 		struct sf_io *sfio;
 		vm_page_t *pa;
 		struct mbuf *m0, *mtail;
 		int nios, space, npages, rhpages;
 
 		mtail = NULL;
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto done;
 		}
 
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(&so->so_snd);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * At the beginning of the first loop check if any headers
 		 * are specified and copy them into mbufs.  Reduce space in
 		 * the socket buffer by the size of the header mbuf chain.
 		 * Clear hdr_uio here and hdrlen at the end of the first loop.
 		 */
 		if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
 			hdr_uio->uio_td = td;
 			hdr_uio->uio_rw = UIO_WRITE;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				mh = m_uiotombuf(hdr_uio, M_WAITOK, space,
 				    tls->params.max_frame_len, M_NOMAP);
 			else
 #endif
 				mh = m_uiotombuf(hdr_uio, M_WAITOK,
 				    space, 0, 0);
 			hdrlen = m_length(mh, &mhtail);
 			space -= hdrlen;
 			/*
 			 * If header consumed all the socket buffer space,
 			 * don't waste CPU cycles and jump to the end.
 			 */
 			if (space == 0) {
 				/*
 				 * No sfio is allocated on this path; the
 				 * code after prepend_header copes with
 				 * sfio == NULL when nios == 0.
 				 */
 				sfio = NULL;
 				nios = 0;
 				goto prepend_header;
 			}
 			hdr_uio = NULL;
 		}
 
 		if (vp != NULL) {
 			error = vn_lock(vp, LK_SHARED);
 			if (error != 0)
 				goto done;
 			/*
 			 * Re-read the file size under the vnode lock and
 			 * recompute 'rem' if the file changed size since
 			 * the previous check.
 			 */
 			error = VOP_GETATTR(vp, &va, td->td_ucred);
 			if (error != 0 || off >= va.va_size) {
 				VOP_UNLOCK(vp);
 				goto done;
 			}
 			if (va.va_size != obj_size) {
 				obj_size = va.va_size;
 				rem = nbytes ?
 				    omin(nbytes + offset, obj_size) : obj_size;
 				rem -= off;
 			}
 		}
 
 		if (space > rem)
 			space = rem;
 		else if (space > PAGE_SIZE) {
 			/*
 			 * Use page boundaries when possible for large
 			 * requests.
 			 */
 			if (off & PAGE_MASK)
 				space -= (PAGE_SIZE - (off & PAGE_MASK));
 			space = trunc_page(space);
 			if (off & PAGE_MASK)
 				space += (PAGE_SIZE - (off & PAGE_MASK));
 		}
 
 		npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
 
 		/*
 		 * Calculate maximum allowed number of pages for readahead
 		 * at this iteration.  If SF_USER_READAHEAD was set, we don't
 		 * do any heuristics and use exactly the value supplied by
 		 * application.  Otherwise, we allow readahead up to "rem".
 		 * If application wants more, let it be, but there is no
 		 * reason to go above MAXPHYS.  Also check against "obj_size",
 		 * since vm_pager_has_page() can hint beyond EOF.
 		 */
 		if (flags & SF_USER_READAHEAD) {
 			rhpages = SF_READAHEAD(flags);
 		} else {
 			rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) -
 			    npages;
 			rhpages += SF_READAHEAD(flags);
 		}
 		rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages);
 		rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
 		    npages, rhpages);
 
 		/*
 		 * The sf_io is followed by an array of npages page
 		 * pointers.  Its I/O count starts at 1; that initial
 		 * reference is dropped by one of the explicit
 		 * sendfile_iodone() calls below.
 		 */
 		sfio = malloc(sizeof(struct sf_io) +
 		    npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK);
 		refcount_init(&sfio->nios, 1);
 		sfio->obj = obj;
 		sfio->error = 0;
 		sfio->m = NULL;
 #ifdef KERN_TLS
 		/*
 		 * This doesn't use ktls_hold() because sfio->m will
 		 * also have a reference on 'tls' that will be valid
 		 * for all of sfio's lifetime.
 		 */
 		sfio->tls = tls;
 #endif
 		vm_object_pip_add(obj, 1);
 		error = sendfile_swapin(obj, sfio, &nios, off, space, npages,
 		    rhpages, flags);
 		if (error != 0) {
 			if (vp != NULL)
 				VOP_UNLOCK(vp);
 			sendfile_iodone(sfio, NULL, 0, error);
 			goto done;
 		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		pa = sfio->pa;
 
 		/*
 		 * Use unmapped mbufs if enabled for TCP.  Unmapped
 		 * bufs are restricted to TCP as that is what has been
 		 * tested.  In particular, unmapped mbufs have not
 		 * been tested with UNIX-domain sockets.
 		 *
 		 * TLS frames always require unmapped mbufs.
 		 */
 		if ((mb_use_ext_pgs &&
 		    so->so_proto->pr_protocol == IPPROTO_TCP)
 #ifdef KERN_TLS
 		    || tls != NULL
 #endif
 		    ) {
 			use_ext_pgs = true;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				max_pgs = num_pages(tls->params.max_frame_len);
 			else
 #endif
 				max_pgs = MBUF_PEXT_MAX_PGS;
 
 			/* Start at last index, to wrap on first use. */
 			ext_pgs_idx = max_pgs - 1;
 		}
 
 		for (int i = 0; i < npages; i++) {
 			/*
 			 * If a page wasn't grabbed successfully, then
 			 * trim the array. Can happen only with SF_NODISKIO.
 			 */
 			if (pa[i] == NULL) {
 				SFSTAT_INC(sf_busy);
 				fixspace(npages, i, off, &space);
 				npages = i;
 				softerr = EBUSY;
 				break;
 			}
 			pga = pa[i];
 			if (pga == bogus_page)
 				pga = vm_page_relookup(obj, sfio->pindex0 + i);
 
 			if (use_ext_pgs) {
 				off_t xfs;
 
 				/*
 				 * Each unmapped mbuf carries up to max_pgs
 				 * pages; start a new mbuf whenever the
 				 * current one is full (or on first use).
 				 */
 				ext_pgs_idx++;
 				if (ext_pgs_idx == max_pgs) {
 					m0 = mb_alloc_ext_pgs(M_WAITOK,
 					    sendfile_free_mext_pg);
 
 					if (flags & SF_NOCACHE) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_NOCACHE;
 
 						/*
 						 * See comment below regarding
 						 * ignoring SF_NOCACHE for the
 						 * last page.
 						 */
 						if ((npages - i <= max_pgs) &&
 						    ((off + space) & PAGE_MASK) &&
 						    (rem > space || rhpages > 0))
 							m0->m_ext.ext_flags |=
 							    EXT_FLAG_CACHE_LAST;
 					}
 					if (sfs != NULL) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_SYNC;
 						if (m0->m_ext.ext_type ==
 						    EXT_PGS)
 							m0->m_ext.ext_arg1 =
 								sfs;
 						else
 							m0->m_ext.ext_arg2 =
 								sfs;
 						mtx_lock(&sfs->mtx);
 						sfs->count++;
 						mtx_unlock(&sfs->mtx);
 					}
 					ext_pgs_idx = 0;
 
 					/* Append to mbuf chain. */
 					if (mtail != NULL)
 						mtail->m_next = m0;
 					else
 						m = m0;
 					mtail = m0;
-					m0->m_ext_pgs.first_pg_off =
+					m0->m_epg_1st_off =
 					    vmoff(i, off) & PAGE_MASK;
 				}
 				if (nios) {
 					mtail->m_flags |= M_NOTREADY;
-					m0->m_ext_pgs.nrdy++;
+					m0->m_epg_nrdy++;
 				}
 
 				m0->m_epg_pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pga);
-				m0->m_ext_pgs.npgs++;
+				m0->m_epg_npgs++;
 				xfs = xfsize(i, npages, off, space);
-				m0->m_ext_pgs.last_pg_len = xfs;
+				m0->m_epg_last_len = xfs;
 				MBUF_EXT_PGS_ASSERT_SANITY(m0);
 				mtail->m_len += xfs;
 				mtail->m_ext.ext_size += PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
 			 * can be interrupted.  For consequent
 			 * buffers, do not sleep, since several
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
 			sf = sf_buf_alloc(pga,
 			    m != NULL ? SFB_NOWAIT : SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
 				sendfile_iowait(sfio, "sfnosf");
 				for (int j = i; j < npages; j++)
 					vm_page_unwire(pa[j], PQ_INACTIVE);
 				if (m == NULL)
 					softerr = ENOBUFS;
 				fixspace(npages, i, off, &space);
 				npages = i;
 				break;
 			}
 
 			m0 = m_get(M_WAITOK, MT_DATA);
 			m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
 			m0->m_ext.ext_size = PAGE_SIZE;
 			m0->m_ext.ext_arg1 = sf;
 			m0->m_ext.ext_type = EXT_SFBUF;
 			m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
 			m0->m_ext.ext_free = sendfile_free_mext;
 			/*
 			 * SF_NOCACHE sets the page as being freed upon send.
 			 * However, we ignore it for the last page in 'space',
 			 * if the page is truncated, and we got more data to
 			 * send (rem > space), or if we have readahead
 			 * configured (rhpages > 0).
 			 */
 			if ((flags & SF_NOCACHE) &&
 			    (i != npages - 1 ||
 			    !((off + space) & PAGE_MASK) ||
 			    !(rem > space || rhpages > 0)))
 				m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE;
 			if (sfs != NULL) {
 				m0->m_ext.ext_flags |= EXT_FLAG_SYNC;
 				/*
 				 * NOTE(review): ext_type was set to
 				 * EXT_SFBUF just above, so the EXT_PGS
 				 * test cannot be true here, and the
 				 * unconditional store to ext_arg2 after
 				 * the if/else makes the whole test
 				 * redundant.
 				 */
 				if (m0->m_ext.ext_type == EXT_PGS)
 					m0->m_ext.ext_arg1 = sfs;
 				else
 					m0->m_ext.ext_arg2 = sfs;
 				m0->m_ext.ext_arg2 = sfs;
 				mtx_lock(&sfs->mtx);
 				sfs->count++;
 				mtx_unlock(&sfs->mtx);
 			}
 			m0->m_ext.ext_count = 1;
 			m0->m_flags |= (M_EXT | M_RDONLY);
 			if (nios)
 				m0->m_flags |= M_NOTREADY;
 			m0->m_data = (char *)sf_buf_kva(sf) +
 			    (vmoff(i, off) & PAGE_MASK);
 			m0->m_len = xfsize(i, npages, off, space);
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 		}
 
 		if (vp != NULL)
 			VOP_UNLOCK(vp);
 
 		/* Keep track of bytes processed. */
 		off += space;
 		rem -= space;
 
 		/*
 		 * Prepend header, if any.  Save pointer to first mbuf
 		 * with a page.
 		 */
 		if (hdrlen) {
 prepend_header:
 			m0 = mhtail->m_next = m;
 			m = mh;
 			mh = NULL;
 		} else
 			m0 = m;
 
 		if (m == NULL) {
 			KASSERT(softerr, ("%s: m NULL, no error", __func__));
 			error = softerr;
 			sendfile_iodone(sfio, NULL, 0, 0);
 			goto done;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		KASSERT(m_length(m, NULL) == space + hdrlen,
 		    ("%s: mlen %u space %d hdrlen %d",
 		    __func__, m_length(m, NULL), space, hdrlen));
 
 		CURVNET_SET(so->so_vnet);
 #ifdef KERN_TLS
 		if (tls != NULL)
 			ktls_frame(m, tls, &tls_enq_cnt, TLS_RLTYPE_APP);
 #endif
 		if (nios == 0) {
 			/*
 			 * If sendfile_swapin() didn't initiate any I/Os,
 			 * which happens if all data is cached in VM, or if
 			 * the header consumed all socket buffer space and
 			 * sfio is NULL, then we can send data right now
 			 * without the PRUS_NOTREADY flag.
 			 */
 			if (sfio != NULL)
 				sendfile_iodone(sfio, NULL, 0, 0);
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
 				error = (*so->so_proto->pr_usrreqs->pru_send)
 				    (so, PRUS_NOTREADY, m, NULL, NULL, td);
 				soref(so);
 				ktls_enqueue(m, so, tls_enq_cnt);
 			} else
 #endif
 				error = (*so->so_proto->pr_usrreqs->pru_send)
 				    (so, 0, m, NULL, NULL, td);
 		} else {
 			/*
 			 * I/O is still in flight: queue the data as not
 			 * ready and record in sfio the socket, first
 			 * file-data mbuf and page count.
 			 */
 			sfio->so = so;
 			sfio->m = m0;
 			sfio->npages = npages;
 			soref(so);
 			error = (*so->so_proto->pr_usrreqs->pru_send)
 			    (so, PRUS_NOTREADY, m, NULL, NULL, td);
 			sendfile_iodone(sfio, NULL, 0, 0);
 		}
 		CURVNET_RESTORE();
 
 		m = NULL;	/* pru_send always consumes */
 		if (error)
 			goto done;
 		sbytes += space + hdrlen;
 		if (hdrlen)
 			hdrlen = 0;
 		if (softerr) {
 			error = softerr;
 			goto done;
 		}
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		/* The send buffer lock is released before calling writev. */
 		sbunlock(&so->so_snd);
 		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 done:
 	sbunlock(&so->so_snd);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (sent != NULL) {
 		(*sent) = sbytes;
 	}
 	/* Drop the object reference taken by sendfile_getobj(). */
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (so)
 		fdrop(sock_fp, td);
 	/* Free any data or header chain that was never handed to pru_send. */
 	if (m)
 		m_freem(m);
 	if (mh)
 		m_freem(mh);
 
 	if (sfs != NULL) {
 		/*
 		 * SF_SYNC: sleep until the mbufs counted in sfs->count are
 		 * gone.  The wait is interruptible; if it ends with mbufs
 		 * still outstanding, sfs is left allocated with 'waiting'
 		 * cleared (presumably so that whoever drops the last count
 		 * destroys it; that code is outside this function).
 		 */
 		mtx_lock(&sfs->mtx);
 		if (sfs->count != 0)
 			error = cv_wait_sig(&sfs->cv, &sfs->mtx);
 		if (sfs->count == 0) {
 			sendfile_sync_destroy(sfs);
 		} else {
 			sfs->waiting = false;
 			mtx_unlock(&sfs->mtx);
 		}
 	}
 #ifdef KERN_TLS
 	if (tls != NULL)
 		ktls_free(tls);
 #endif
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 static int
 sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	off_t sbytes;
 	int error;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if (uap->offset < 0)
 		return (EINVAL);
 
 	sbytes = 0;
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
 			    &hdr_uio);
 			if (error != 0)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
 			    &trl_uio);
 			if (error != 0)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
 	 */
 	if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	if (uap->sbytes != NULL)
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	free(hdr_uio, M_IOV);
 	free(trl_uio, M_IOV);
 	return (error);
 }
 
 /*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Transmit the file referred to by 'fd', beginning at 'offset', over the
  * socket 's'.  'nbytes' limits the amount of file data sent; a value of 0
  * means "up to EOF".  A header and/or trailer may optionally be added to
  * the socket output.  When 'sbytes' is supplied, the total number of
  * bytes sent is stored through it.
  */
 int
 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 {
 	int error;
 
 	/* Native entry point: no FreeBSD 4 compatibility semantics. */
 	error = sendfile(td, uap, 0);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatible sendfile entry point: repackage the old argument
  * structure as a struct sendfile_args and run the common implementation
  * with the compat flag set.
  */
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args args = {
 		.fd = uap->fd,
 		.s = uap->s,
 		.offset = uap->offset,
 		.nbytes = uap->nbytes,
 		.hdtr = uap->hdtr,
 		.sbytes = uap->sbytes,
 		.flags = uap->flags,
 	};
 
 	return (sendfile(td, &args, 1));
 }
 #endif /* COMPAT_FREEBSD4 */
Index: head/sys/kern/subr_bus_dma.c
===================================================================
--- head/sys/kern/subr_bus_dma.c	(revision 360578)
+++ head/sys/kern/subr_bus_dma.c	(revision 360579)
@@ -1,686 +1,686 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 EMC Corp.
  * All rights reserved.
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bus.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/ktr.h>
 #include <sys/mbuf.h>
 #include <sys/memdesc.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/pmap.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 
 #include <opencrypto/cryptodev.h>
 
 #include <machine/bus.h>
 
 /*
  * Load a region described by a list of virtual address ranges.
  *
  * Loading begins 'offset' bytes into the first range and continues
  * through the list until 'length' bytes have been loaded, the list is
  * exhausted, or a load fails.  Returns 0 or the first load error.
  */
 static int
 _bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map,
     bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs,
     int flags, size_t offset, size_t length)
 {
 	bus_dma_segment_t *seg;
 	char *start;
 	size_t chunk;
 	int error, i;
 
 	error = 0;
 	for (i = 0; i < sglist_cnt && length != 0 && error == 0; i++) {
 		seg = &list[i];
 
 		KASSERT((offset < seg->ds_len),
 		    ("Invalid mid-segment offset"));
 		start = (char *)(uintptr_t)seg->ds_addr + offset;
 		chunk = seg->ds_len - offset;
 
 		/* The starting offset applies to the first range only. */
 		offset = 0;
 
 		/* Never load more than what is left of the request. */
 		if (chunk > length)
 			chunk = length;
 		length -= chunk;
 		KASSERT((chunk != 0), ("Segment length is zero"));
 
 		error = _bus_dmamap_load_buffer(dmat, map, start, chunk,
 		    pmap, flags, NULL, nsegs);
 	}
 	return (error);
 }
 
 /*
  * Load every entry of a list of physical address ranges, stopping at
  * the first entry that fails to load.  Returns 0 or that load error.
  */
 static int
 _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
     bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags)
 {
 	int error, i;
 
 	for (error = 0, i = 0; i < sglist_cnt && error == 0; i++) {
 		error = _bus_dmamap_load_phys(dmat, map,
 		    (vm_paddr_t)list[i].ds_addr, list[i].ds_len, flags, NULL,
 		    nsegs);
 	}
 	return (error);
 }
 
 /*
  * Load an unmapped mbuf
  *
  * The m_len bytes to load are taken, in order, from three places: the
  * header bytes stored in the mbuf (m_epg_hdr), the physical pages listed
  * in m_epg_pa[], and the trailer bytes stored in the mbuf (m_epg_trail).
  * The mbuf's data offset (mtod) says how many bytes at the front of that
  * sequence to skip.  The header and trailer are loaded by kernel virtual
  * address, the pages by physical address.  Returns 0 or the first load
  * error.
  */
 static int
 _bus_dmamap_load_mbuf_epg(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
 {
 	int error, i, off, len, pglen, pgoff, seglen, segoff;
 
 	MBUF_EXT_PGS_ASSERT(m);
 
 	len = m->m_len;
 	error = 0;
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	/*
 	 * Header: either it is skipped entirely (off moves past it) or its
 	 * remaining part, limited to len, is loaded.
 	 */
-	if (m->m_ext_pgs.hdr_len != 0) {
-		if (off >= m->m_ext_pgs.hdr_len) {
-			off -= m->m_ext_pgs.hdr_len;
+	if (m->m_epg_hdrlen != 0) {
+		if (off >= m->m_epg_hdrlen) {
+			off -= m->m_epg_hdrlen;
 		} else {
-			seglen = m->m_ext_pgs.hdr_len - off;
+			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			error = _bus_dmamap_load_buffer(dmat, map,
 			    &m->m_epg_hdr[segoff], seglen, kernel_pmap,
 			    flags, segs, nsegs);
 		}
 	}
 	/*
 	 * Pages: only the first page starts at a non-zero offset (pgoff is
 	 * reset to 0 after each page).  Pages lying wholly within the
 	 * skipped prefix are passed over without loading.
 	 */
-	pgoff = m->m_ext_pgs.first_pg_off;
-	for (i = 0; i < m->m_ext_pgs.npgs && error == 0 && len > 0; i++) {
+	pgoff = m->m_epg_1st_off;
+	for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		error = _bus_dmamap_load_phys(dmat, map,
 		    m->m_epg_pa[i] + segoff, seglen, flags, segs, nsegs);
 		pgoff = 0;
 	};
 	/* Trailer: whatever is still left to load must lie within it. */
 	if (len != 0 && error == 0) {
-		KASSERT((off + len) <= m->m_ext_pgs.trail_len,
+		KASSERT((off + len) <= m->m_epg_trllen,
 		    ("off + len > trail (%d + %d > %d)", off, len,
-		    m->m_ext_pgs.trail_len));
+		    m->m_epg_trllen));
 		error = _bus_dmamap_load_buffer(dmat, map,
 		    &m->m_epg_trail[off], len, kernel_pmap, flags, segs,
 		    nsegs);
 	}
 	return (error);
 }
 
 /*
  * Load every non-empty mbuf of the chain starting at m0, stopping at
  * the first load error.  Mbufs flagged M_NOMAP go through the unmapped
  * mbuf loader; all others are loaded by kernel virtual address.
  */
 static int
 _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags)
 {
 	struct mbuf *cur;
 	int error;
 
 	error = 0;
 	for (cur = m0; cur != NULL && error == 0; cur = cur->m_next) {
 		/* Nothing to map for an empty mbuf. */
 		if (cur->m_len <= 0)
 			continue;
 
 		if ((cur->m_flags & M_NOMAP) == 0) {
 			error = _bus_dmamap_load_buffer(dmat, map,
 			    cur->m_data, cur->m_len, kernel_pmap,
 			    flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
 		} else {
 			error = _bus_dmamap_load_mbuf_epg(dmat, map, cur,
 			    segs, nsegs, flags);
 		}
 	}
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, *nsegs);
 	return (error);
 }
 
 /*
  * Load the data of a block I/O request.
  *
  * The bio flags select how the data is described: BIO_VLIST means
  * bio_data holds a list of virtual address ranges, BIO_UNMAPPED means
  * the data is given as a page array (bio_ma), and otherwise bio_data
  * is a plain kernel buffer.  BIO_VLIST is checked first.
  */
 static int
 _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
     int *nsegs, int flags)
 {
 	int error;
 
 	if ((bio->bio_flags & BIO_VLIST) != 0) {
 		error = _bus_dmamap_load_vlist(dmat, map,
 		    (bus_dma_segment_t *)bio->bio_data, bio->bio_ma_n,
 		    kernel_pmap, nsegs, flags, bio->bio_ma_offset,
 		    bio->bio_bcount);
 	} else if ((bio->bio_flags & BIO_UNMAPPED) != 0) {
 		error = _bus_dmamap_load_ma(dmat, map, bio->bio_ma,
 		    bio->bio_bcount, bio->bio_ma_offset, flags, NULL, nsegs);
 	} else {
 		error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
 		    bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
 	}
 	return (error);
 }
 
 /*
  * Trivial page-array loader: load 'tlen' bytes described by the page
  * array 'ma', starting 'ma_offs' bytes into the first page, one page
  * (or partial page) at a time by physical address.
  *
  * Returns 0 on success or the first error reported by
  * _bus_dmamap_load_phys(); loading stops at that point.
  */
 int
 bus_dmamap_load_ma_triv(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
     bus_dma_segment_t *segs, int *segp)
 {
 	vm_paddr_t paddr;
 	bus_size_t len;
 	int error, i;
 
 	error = 0;
 	for (i = 0; tlen > 0; i++, tlen -= len) {
 		/*
 		 * Clamp the chunk to what is left of the current page.
 		 * This is done in bus_size_t arithmetic rather than with
 		 * min(), which takes u_int arguments: a 64-bit tlen would
 		 * be truncated, and a length whose low 32 bits are zero
 		 * would yield len == 0 and a loop that never advances.
 		 */
 		len = PAGE_SIZE - ma_offs;
 		if (len > tlen)
 			len = tlen;
 		paddr = VM_PAGE_TO_PHYS(ma[i]) + ma_offs;
 		error = _bus_dmamap_load_phys(dmat, map, paddr, len,
 		    flags, segs, segp);
 		if (error != 0)
 			break;
 		/* Only the first page is entered at an offset. */
 		ma_offs = 0;
 	}
 	return (error);
 }
 
 /*
  * Load the data buffer of a CAM control block.
  *
  * The buffer pointer, transfer length and scatter/gather count are
  * first taken from the CCB variant selected by the function code; the
  * CAM_DATA_MASK bits of the CCB flags then say how that pointer is to
  * be interpreted.  An unsupported function code or data type panics.
  */
 static int
 _bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
 		    int *nsegs, int flags)
 {
 	void *data_ptr;
 	uint32_t data_kind, dxfer_len;
 	uint16_t sglist_cnt;
 	int error;
 
 	/* Step 1: pull the buffer description out of the CCB. */
 	switch (ccb->ccb_h.func_code) {
 	case XPT_SCSI_IO:
 		data_ptr = ccb->csio.data_ptr;
 		dxfer_len = ccb->csio.dxfer_len;
 		sglist_cnt = ccb->csio.sglist_cnt;
 		break;
 	case XPT_CONT_TARGET_IO:
 		data_ptr = ccb->ctio.data_ptr;
 		dxfer_len = ccb->ctio.dxfer_len;
 		sglist_cnt = ccb->ctio.sglist_cnt;
 		break;
 	case XPT_ATA_IO:
 		/* No scatter/gather count is taken from an ATA request. */
 		data_ptr = ccb->ataio.data_ptr;
 		dxfer_len = ccb->ataio.dxfer_len;
 		sglist_cnt = 0;
 		break;
 	case XPT_NVME_IO:
 	case XPT_NVME_ADMIN:
 		data_ptr = ccb->nvmeio.data_ptr;
 		dxfer_len = ccb->nvmeio.dxfer_len;
 		sglist_cnt = ccb->nvmeio.sglist_cnt;
 		break;
 	default:
 		panic("_bus_dmamap_load_ccb: Unsupported func code %d",
 		    ccb->ccb_h.func_code);
 	}
 
 	/* Step 2: load according to how the data is described. */
 	data_kind = ccb->ccb_h.flags & CAM_DATA_MASK;
 	error = 0;
 	switch (data_kind) {
 	case CAM_DATA_VADDR:
 		error = _bus_dmamap_load_buffer(dmat, map, data_ptr, dxfer_len,
 		    kernel_pmap, flags, NULL, nsegs);
 		break;
 	case CAM_DATA_PADDR:
 		error = _bus_dmamap_load_phys(dmat, map,
 		    (vm_paddr_t)(uintptr_t)data_ptr, dxfer_len, flags, NULL,
 		    nsegs);
 		break;
 	case CAM_DATA_SG:
 		error = _bus_dmamap_load_vlist(dmat, map,
 		    (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap,
 		    nsegs, flags, 0, dxfer_len);
 		break;
 	case CAM_DATA_SG_PADDR:
 		error = _bus_dmamap_load_plist(dmat, map,
 		    (bus_dma_segment_t *)data_ptr, sglist_cnt, nsegs, flags);
 		break;
 	case CAM_DATA_BIO:
 		error = _bus_dmamap_load_bio(dmat, map, (struct bio *)data_ptr,
 		    nsegs, flags);
 		break;
 	default:
 		panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented",
 		    ccb->ccb_h.flags);
 	}
 	return (error);
 }
 
 /*
  * Load the buffers described by a uio.
  *
  * The iovecs are loaded in order until uio_resid bytes have been
  * loaded, the iovecs run out, or a load fails.  Addresses are resolved
  * through the pmap of the uio's thread for UIO_USERSPACE and through
  * the kernel pmap otherwise.
  */
 static int
 _bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
     int *nsegs, int flags)
 {
 	struct iovec *cur;
 	bus_size_t chunk, left;
 	pmap_t pmap;
 	int error, i;
 
 	pmap = kernel_pmap;
 	if (uio->uio_segflg == UIO_USERSPACE) {
 		KASSERT(uio->uio_td != NULL,
 			("bus_dmamap_load_uio: USERSPACE but no proc"));
 		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
 	}
 
 	error = 0;
 	left = uio->uio_resid;
 	for (i = 0; i < uio->uio_iovcnt; i++) {
 		if (left == 0 || error != 0)
 			break;
 
 		/* Take this iovec, but no more than what remains. */
 		cur = &uio->uio_iov[i];
 		chunk = cur->iov_len;
 		if (left < chunk)
 			chunk = left;
 		if (chunk == 0)
 			continue;
 
 		error = _bus_dmamap_load_buffer(dmat, map,
 		    (caddr_t)cur->iov_base, chunk, pmap, flags, NULL, nsegs);
 		left -= chunk;
 	}
 
 	return (error);
 }
 
 /*
  * Map the kernel virtual buffer 'buf' of 'buflen' bytes into bus space
  * using the dmamap 'map'.
  *
  * Unless BUS_DMA_NOWAIT is set, the request is first recorded with
  * _bus_dmamap_waitok() together with the callback.  If the load reports
  * EINPROGRESS that value is returned and the callback is not invoked here.
  * Otherwise the callback is invoked with the completed segment list (zero
  * segments on error) and the load's error code.
  */
 int
 bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
     bus_size_t buflen, bus_dmamap_callback_t *callback,
     void *callback_arg, int flags)
 {
 	struct memdesc desc;
 	bus_dma_segment_t *seglist;
 	int count, rv;
 
 	if ((flags & BUS_DMA_NOWAIT) == 0) {
 		desc = memdesc_vaddr(buf, buflen);
 		_bus_dmamap_waitok(dmat, map, &desc, callback, callback_arg);
 	}
 
 	/* The loader tracks the index of the last segment; start before 0. */
 	count = -1;
 	rv = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap,
 	    flags, NULL, &count);
 	count++;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 
 	if (rv == EINPROGRESS)
 		return (rv);
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	(*callback)(callback_arg, seglist, rv != 0 ? 0 : count, rv);
 
 	/*
 	 * Only ENOMEM is passed back to the caller so that it can be
 	 * propagated up the stack.  It only happens when NOWAIT is set,
 	 * so deferral is disabled.  Every other outcome has already been
 	 * reported through the callback.
 	 */
 	return (rv == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load an mbuf chain (which must carry a packet header) and report the
  * result through a callback2-style callback.  BUS_DMA_NOWAIT is always
  * forced.  On success the callback receives the segment count and the
  * packet length; on failure it receives zero for both.  The load's error
  * code is returned.
  */
 int
 bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
     bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
 {
 	bus_dma_segment_t *seglist;
 	bus_size_t total;
 	int count, rv, used;
 
 	M_ASSERTPKTHDR(m0);
 
 	flags |= BUS_DMA_NOWAIT;
 	count = -1;
 	rv = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &count, flags);
 	count++;
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	if (rv == 0) {
 		used = count;
 		total = m0->m_pkthdr.len;
 	} else {
 		used = 0;
 		total = 0;
 	}
 	(*callback)(callback_arg, seglist, used, total, rv);
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 	return (rv);
 }
 
 /*
  * Load an mbuf chain directly into the caller-supplied segment array
  * 'segs' without a callback.  BUS_DMA_NOWAIT is always forced.  On return
  * '*nsegs' holds the number of segments reported by the loader and the
  * load's error code is returned.
  */
 int
 bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
     bus_dma_segment_t *segs, int *nsegs, int flags)
 {
 	int rv;
 
 	/* The loader tracks the index of the last segment; start before 0. */
 	*nsegs = -1;
 	rv = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs,
 	    flags | BUS_DMA_NOWAIT);
 	*nsegs += 1;
 	_bus_dmamap_complete(dmat, map, segs, *nsegs, rv);
 	return (rv);
 }
 
 /*
  * Load the buffers described by a uio and report the result through a
  * callback2-style callback.  BUS_DMA_NOWAIT is always forced.  On success
  * the callback receives the segment count and uio_resid; on failure it
  * receives zero for both.  The load's error code is returned.
  */
 int
 bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
     bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
 {
 	bus_dma_segment_t *seglist;
 	bus_size_t total;
 	int count, rv, used;
 
 	flags |= BUS_DMA_NOWAIT;
 	count = -1;
 	rv = _bus_dmamap_load_uio(dmat, map, uio, &count, flags);
 	count++;
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	if (rv == 0) {
 		used = count;
 		total = uio->uio_resid;
 	} else {
 		used = 0;
 		total = 0;
 	}
 	(*callback)(callback_arg, seglist, used, total, rv);
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 	return (rv);
 }
 
 /*
  * Load the data buffer described by a CAM CCB.
  *
  * A CCB whose direction is CAM_DIR_NONE has nothing to map: the callback
  * is invoked immediately with no segments and 0 is returned.  Otherwise
  * the request is recorded with _bus_dmamap_waitok() unless BUS_DMA_NOWAIT
  * is set, the CCB is loaded, and, unless the load reports EINPROGRESS,
  * the callback is invoked with the completed segments (zero segments on
  * error) and the load's error code.
  */
 int
 bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
 		    bus_dmamap_callback_t *callback, void *callback_arg,
 		    int flags)
 {
 	struct memdesc desc;
 	struct ccb_hdr *hdr;
 	bus_dma_segment_t *seglist;
 	int count, rv;
 
 	hdr = &ccb->ccb_h;
 	if ((hdr->flags & CAM_DIR_MASK) == CAM_DIR_NONE) {
 		callback(callback_arg, NULL, 0, 0);
 		return (0);
 	}
 
 	if ((flags & BUS_DMA_NOWAIT) == 0) {
 		desc = memdesc_ccb(ccb);
 		_bus_dmamap_waitok(dmat, map, &desc, callback, callback_arg);
 	}
 
 	count = -1;
 	rv = _bus_dmamap_load_ccb(dmat, map, ccb, &count, flags);
 	count++;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 
 	if (rv == EINPROGRESS)
 		return (rv);
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	(*callback)(callback_arg, seglist, rv != 0 ? 0 : count, rv);
 
 	/*
 	 * Only ENOMEM is passed back to the caller so that it can be
 	 * propagated up the stack.  It only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (rv == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load the data described by a bio.
  *
  * Unless BUS_DMA_NOWAIT is set, the request is first recorded with
  * _bus_dmamap_waitok().  If the load reports EINPROGRESS that value is
  * returned and the callback is not invoked here.  Otherwise the callback
  * is invoked with the completed segments (zero segments on error) and
  * the load's error code.
  */
 int
 bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
 		    bus_dmamap_callback_t *callback, void *callback_arg,
 		    int flags)
 {
 	struct memdesc desc;
 	bus_dma_segment_t *seglist;
 	int count, rv;
 
 	if ((flags & BUS_DMA_NOWAIT) == 0) {
 		desc = memdesc_bio(bio);
 		_bus_dmamap_waitok(dmat, map, &desc, callback, callback_arg);
 	}
 
 	count = -1;
 	rv = _bus_dmamap_load_bio(dmat, map, bio, &count, flags);
 	count++;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 
 	if (rv == EINPROGRESS)
 		return (rv);
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	(*callback)(callback_arg, seglist, rv != 0 ? 0 : count, rv);
 
 	/*
 	 * Only ENOMEM is passed back to the caller so that it can be
 	 * propagated up the stack.  It only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (rv == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load the memory described by a memdesc, dispatching on md_type to the
  * matching internal loader.  A descriptor type not listed in the switch
  * loads nothing and is reported as success with zero segments.
  *
  * Unless BUS_DMA_NOWAIT is set, the request is first recorded with
  * _bus_dmamap_waitok().  If the load reports EINPROGRESS that value is
  * returned and the callback is not invoked here.  Otherwise the callback
  * is invoked with the completed segments (zero segments on error) and
  * the load's error code.
  */
 int
 bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct memdesc *mem, bus_dmamap_callback_t *callback,
     void *callback_arg, int flags)
 {
 	bus_dma_segment_t *seglist;
 	int count, rv;
 
 	if ((flags & BUS_DMA_NOWAIT) == 0)
 		_bus_dmamap_waitok(dmat, map, mem, callback, callback_arg);
 
 	rv = 0;
 	count = -1;
 	switch (mem->md_type) {
 	case MEMDESC_VADDR:
 		rv = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr,
 		    mem->md_opaque, kernel_pmap, flags, NULL, &count);
 		break;
 	case MEMDESC_PADDR:
 		rv = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr,
 		    mem->md_opaque, flags, NULL, &count);
 		break;
 	case MEMDESC_VLIST:
 		rv = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list,
 		    mem->md_opaque, kernel_pmap, &count, flags, 0, SIZE_T_MAX);
 		break;
 	case MEMDESC_PLIST:
 		rv = _bus_dmamap_load_plist(dmat, map, mem->u.md_list,
 		    mem->md_opaque, &count, flags);
 		break;
 	case MEMDESC_BIO:
 		rv = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio,
 		    &count, flags);
 		break;
 	case MEMDESC_UIO:
 		rv = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio,
 		    &count, flags);
 		break;
 	case MEMDESC_MBUF:
 		rv = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf,
 		    NULL, &count, flags);
 		break;
 	case MEMDESC_CCB:
 		rv = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &count,
 		    flags);
 		break;
 	}
 	count++;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 
 	if (rv == EINPROGRESS)
 		return (rv);
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	(*callback)(callback_arg, seglist, rv != 0 ? 0 : count, rv);
 
 	/*
 	 * Only ENOMEM is passed back to the caller so that it can be
 	 * propagated up the stack.  It only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (rv == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load the data buffer of a crypto request, dispatching on crp_buf_type
  * to the contiguous-buffer, mbuf or uio loader.  A buffer type not listed
  * in the switch loads nothing and is reported as success with zero
  * segments.  BUS_DMA_NOWAIT is always forced.
  *
  * Unless the load reports EINPROGRESS, the callback is invoked with the
  * completed segments (zero segments on error) and the load's error code.
  */
 int
 bus_dmamap_load_crp(bus_dma_tag_t dmat, bus_dmamap_t map, struct cryptop *crp,
     bus_dmamap_callback_t *callback, void *callback_arg, int flags)
 {
 	bus_dma_segment_t *seglist;
 	int count, rv;
 
 	flags |= BUS_DMA_NOWAIT;
 	rv = 0;
 	count = -1;
 	switch (crp->crp_buf_type) {
 	case CRYPTO_BUF_CONTIG:
 		rv = _bus_dmamap_load_buffer(dmat, map, crp->crp_buf,
 		    crp->crp_ilen, kernel_pmap, flags, NULL, &count);
 		break;
 	case CRYPTO_BUF_MBUF:
 		rv = _bus_dmamap_load_mbuf_sg(dmat, map, crp->crp_mbuf,
 		    NULL, &count, flags);
 		break;
 	case CRYPTO_BUF_UIO:
 		rv = _bus_dmamap_load_uio(dmat, map, crp->crp_uio, &count,
 		    flags);
 		break;
 	}
 	count++;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, rv, count);
 
 	if (rv == EINPROGRESS)
 		return (rv);
 
 	seglist = _bus_dmamap_complete(dmat, map, NULL, count, rv);
 	(*callback)(callback_arg, seglist, rv != 0 ? 0 : count, rv);
 
 	/*
 	 * Only ENOMEM is passed back to the caller so that it can be
 	 * propagated up the stack.  It only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (rv == ENOMEM ? ENOMEM : 0);
 }
Index: head/sys/kern/subr_sglist.c
===================================================================
--- head/sys/kern/subr_sglist.c	(revision 360578)
+++ head/sys/kern/subr_sglist.c	(revision 360579)
@@ -1,971 +1,971 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 Yahoo!, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/sglist.h>
 #include <sys/uio.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 #include <sys/ktr.h>
 
 static MALLOC_DEFINE(M_SGLIST, "sglist", "scatter/gather lists");
 
 /*
  * Convenience macros to save the state of an sglist so it can be restored
  * if an append attempt fails.  Since sglists only grow, we only need to
  * save the current count of segments and the length of the ending segment.
  * Earlier segments will not be changed by an append, and the only change
  * that can occur to the ending segment is that it can be extended.
  */
 struct sgsave {
 	u_short sg_nseg;	/* segment count at the time of the save */
 	size_t ss_len;		/* length of the last segment, 0 if none */
 };
 
 /*
  * Record the segment count of 'sg' and the length of its last segment
  * (0 when the list is empty) in the struct sgsave 'sgsave'.
  */
 #define	SGLIST_SAVE(sg, sgsave) do {					\
 	(sgsave).sg_nseg = (sg)->sg_nseg;				\
 	if ((sgsave).sg_nseg > 0)					\
 		(sgsave).ss_len = (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len; \
 	else								\
 		(sgsave).ss_len = 0;					\
 } while (0)
 
 /*
  * Roll 'sg' back to the state captured by SGLIST_SAVE(): reset the
  * segment count and, if the list was not empty at save time, the length
  * of the segment that was last back then.
  */
 #define	SGLIST_RESTORE(sg, sgsave) do {					\
 	(sg)->sg_nseg = (sgsave).sg_nseg;				\
 	if ((sgsave).sg_nseg > 0)					\
 		(sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len = (sgsave).ss_len; \
 } while (0)
 
 /*
  * Append one physical range (paddr, len) to a sglist.  '*ssp' must point
  * at the current last segment of 'sg'.  A range that starts exactly where
  * that segment ends is merged into it; otherwise a new segment is started
  * and '*ssp' is advanced to it.  EFBIG is returned, with the list left
  * untouched, when a new segment is needed but the list is full.
  */
 static __inline int
 _sglist_append_range(struct sglist *sg, struct sglist_seg **ssp,
     vm_paddr_t paddr, size_t len)
 {
 	struct sglist_seg *cur, *fresh;
 
 	cur = *ssp;
 
 	/* Physically contiguous with the current segment: just grow it. */
 	if (cur->ss_paddr + cur->ss_len == paddr) {
 		cur->ss_len += len;
 		return (0);
 	}
 
 	/* A new segment is required; fail if there is no room for one. */
 	if (sg->sg_nseg == sg->sg_maxseg)
 		return (EFBIG);
 
 	fresh = cur + 1;
 	fresh->ss_paddr = paddr;
 	fresh->ss_len = len;
 	sg->sg_nseg++;
 	*ssp = fresh;
 	return (0);
 }
 
 /*
  * Worker routine to append a virtual address range (either kernel or
  * user) to a scatter/gather list.
  *
  * The range is translated and appended one page at a time.  Addresses
  * are translated with pmap_extract() on 'pmap' when it is non-NULL and
  * with pmap_kextract() otherwise.  If 'donep' is non-NULL it is set to
  * the number of bytes that were appended, which may be less than 'len'
  * when an error is returned.
  *
  * Returns 0 on success or the error from _sglist_append_range().  The
  * list is not rolled back here on failure; callers that need all-or-
  * nothing behavior wrap the call in SGLIST_SAVE()/SGLIST_RESTORE().
  */
 static __inline int
 _sglist_append_buf(struct sglist *sg, void *buf, size_t len, pmap_t pmap,
     size_t *donep)
 {
 	struct sglist_seg *ss;
 	vm_offset_t vaddr, offset;
 	vm_paddr_t paddr;
 	size_t seglen;
 	int error;
 
 	if (donep)
 		*donep = 0;
 	if (len == 0)
 		return (0);
 
 	/* Do the first page.  It may have an offset. */
 	vaddr = (vm_offset_t)buf;
 	offset = vaddr & PAGE_MASK;
 	if (pmap != NULL)
 		paddr = pmap_extract(pmap, vaddr);
 	else
 		paddr = pmap_kextract(vaddr);
 	seglen = MIN(len, PAGE_SIZE - offset);
 	if (sg->sg_nseg == 0) {
 		/* Empty list: the first chunk becomes segment 0 directly. */
 		ss = sg->sg_segs;
 		ss->ss_paddr = paddr;
 		ss->ss_len = seglen;
 		sg->sg_nseg = 1;
 	} else {
 		/* Otherwise try to extend or follow the current last segment. */
 		ss = &sg->sg_segs[sg->sg_nseg - 1];
 		error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error)
 			return (error);
 	}
 	vaddr += seglen;
 	len -= seglen;
 	if (donep)
 		*donep += seglen;
 
 	/* The remaining chunks all start on a page boundary. */
 	while (len > 0) {
 		seglen = MIN(len, PAGE_SIZE);
 		if (pmap != NULL)
 			paddr = pmap_extract(pmap, vaddr);
 		else
 			paddr = pmap_kextract(vaddr);
 		error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error)
 			return (error);
 		vaddr += seglen;
 		len -= seglen;
 		if (donep)
 			*donep += seglen;
 	}
 
 	return (0);
 }
 
 /*
  * Count the scatter/gather list elements needed to describe a kernel
  * virtual address range.  The range is walked page by page and a new
  * element is counted every time a page is not physically contiguous
  * with the one before it.  An empty range needs no elements.
  */
 int
 sglist_count(void *buf, size_t len)
 {
 	vm_offset_t end, va;
 	vm_paddr_t pa, prev;
 	int count;
 
 	if (len == 0)
 		return (0);
 
 	end = (vm_offset_t)buf + len;
 	va = trunc_page((vm_offset_t)buf);
 	prev = pmap_kextract(va);
 	count = 1;
 	for (va += PAGE_SIZE; va < end; va += PAGE_SIZE) {
 		pa = pmap_kextract(va);
 		if (pa != prev + PAGE_SIZE)
 			count++;
 		prev = pa;
 	}
 	return (count);
 }
 
 /*
  * Count the scatter/gather list elements needed to describe a buffer
  * backed by an array of VM pages.  The buffer starts 'pgoff' bytes into
  * m[0] and is 'len' bytes long; a new element is counted every time a
  * page is not physically contiguous with the one before it.
  */
 int
 sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len)
 {
 	vm_paddr_t pa, prev;
 	size_t remaining;
 	int count, idx;
 
 	if (len == 0)
 		return (0);
 
 	/* Measure from the start of the first page. */
 	remaining = len + pgoff;
 	prev = VM_PAGE_TO_PHYS(m[0]);
 	count = 1;
 	idx = 1;
 	while (remaining > PAGE_SIZE) {
 		pa = VM_PAGE_TO_PHYS(m[idx]);
 		if (pa != prev + PAGE_SIZE)
 			count++;
 		prev = pa;
 		idx++;
 		remaining -= PAGE_SIZE;
 	}
 	return (count);
 }
 
 /*
  * Determine the number of scatter/gather list elements needed to
  * describe an EXT_PGS buffer.
  *
  * The buffer is treated as the concatenation of the mbuf's header
  * (m_epg_hdr), its pages (m_epg_pa[]) and its trailer (m_epg_trail).
  * 'off' bytes are skipped from the start of that concatenation and the
  * following 'len' bytes are counted.  The header and trailer portions
  * are counted with sglist_count(); page portions are counted directly
  * from their physical addresses, and a chunk that starts exactly where
  * the previous page chunk ended does not add an element.
  */
 int
 sglist_count_mbuf_epg(struct mbuf *m, size_t off, size_t len)
 {
 	vm_paddr_t nextaddr, paddr;
 	size_t seglen, segoff;
 	int i, nsegs, pglen, pgoff;
 
 	if (len == 0)
 		return (0);
 
 	nsegs = 0;
-	if (m->m_ext_pgs.hdr_len != 0) {
-		if (off >= m->m_ext_pgs.hdr_len) {
-			off -= m->m_ext_pgs.hdr_len;
+	if (m->m_epg_hdrlen != 0) {
+		if (off >= m->m_epg_hdrlen) {
+			off -= m->m_epg_hdrlen;
 		} else {
-			seglen = m->m_ext_pgs.hdr_len - off;
+			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = MIN(seglen, len);
 			off = 0;
 			len -= seglen;
 			nsegs += sglist_count(&m->m_epg_hdr[segoff],
 			    seglen);
 		}
 	}
 	/* nextaddr is the physical address right after the last page chunk. */
 	nextaddr = 0;
-	pgoff = m->m_ext_pgs.first_pg_off;
-	for (i = 0; i < m->m_ext_pgs.npgs && len > 0; i++) {
+	pgoff = m->m_epg_1st_off;
+	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		/* This page lies entirely before the requested offset. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = MIN(seglen, len);
 		len -= seglen;
 		paddr = m->m_epg_pa[i] + segoff;
 		if (paddr != nextaddr)
 			nsegs++;
 		nextaddr = paddr + seglen;
 		pgoff = 0;
 	};
 	/* Whatever is left must come from the trailer. */
 	if (len != 0) {
-		seglen = MIN(len, m->m_ext_pgs.trail_len - off);
+		seglen = MIN(len, m->m_epg_trllen - off);
 		len -= seglen;
 		nsegs += sglist_count(&m->m_epg_trail[off], seglen);
 	}
 	KASSERT(len == 0, ("len != 0"));
 	return (nsegs);
 }
 
 /*
  * Allocate a scatter/gather list together with room for 'nsegs'
  * segments in a single allocation; the segment array sits directly
  * after the list header.  'mflags' is passed through to malloc(9).
  * Returns NULL if the allocation fails.  The caller should release the
  * list with sglist_free().
  */
 struct sglist *
 sglist_alloc(int nsegs, int mflags)
 {
 	struct sglist *sg;
 
 	sg = malloc(sizeof(struct sglist) + nsegs * sizeof(struct sglist_seg),
 	    M_SGLIST, mflags);
 	if (sg != NULL)
 		sglist_init(sg, nsegs, (struct sglist_seg *)(sg + 1));
 	return (sg);
 }
 
 /*
  * Drop a reference on a scatter/gather list allocated via
  * sglist_alloc(); the memory is freed when the last reference goes
  * away.  A NULL list is ignored.
  */
 void
 sglist_free(struct sglist *sg)
 {
 
 	if (sg != NULL && refcount_release(&sg->sg_refs))
 		free(sg, M_SGLIST);
 }
 
 /*
  * Append the segments describing one kernel virtual address range to a
  * scatter/gather list.  A list with no segment storage is rejected with
  * EINVAL.  If the list runs out of segments the append fails with EFBIG
  * and the list is restored to its state from before the call.
  */
 int
 sglist_append(struct sglist *sg, void *buf, size_t len)
 {
 	struct sgsave snapshot;
 	int rv;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	SGLIST_SAVE(sg, snapshot);
 	rv = _sglist_append_buf(sg, buf, len, NULL, NULL);
 	if (rv != 0)
 		SGLIST_RESTORE(sg, snapshot);
 	return (rv);
 }
 
 /*
  * Append the segments describing a bio's data to a scatter/gather list.
  * An unmapped bio (BIO_UNMAPPED) is described through its page array;
  * any other bio through its mapped data pointer.  If there are
  * insufficient segments, then this fails with EFBIG.
  *
  * NOTE: This function expects bio_bcount to be initialized.
  */
 int
 sglist_append_bio(struct sglist *sg, struct bio *bp)
 {
 
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		return (sglist_append_vmpages(sg, bp->bio_ma,
 		    bp->bio_ma_offset, bp->bio_bcount));
 	return (sglist_append(sg, bp->bio_data, bp->bio_bcount));
 }
 
 /*
  * Append one physical address range to a scatter/gather list.  A list
  * with no segment storage is rejected with EINVAL and an empty range is
  * a no-op.  If there are insufficient segments, then this fails with
  * EFBIG and the list is left as it was.
  */
 int
 sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
 {
 	struct sglist_seg *tail;
 	struct sgsave snapshot;
 	int rv;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 	if (len == 0)
 		return (0);
 
 	/* An empty list simply takes the range as its first segment. */
 	if (sg->sg_nseg == 0) {
 		tail = &sg->sg_segs[0];
 		tail->ss_paddr = paddr;
 		tail->ss_len = len;
 		sg->sg_nseg = 1;
 		return (0);
 	}
 
 	SGLIST_SAVE(sg, snapshot);
 	tail = &sg->sg_segs[sg->sg_nseg - 1];
 	rv = _sglist_append_range(sg, &tail, paddr, len);
 	if (rv != 0)
 		SGLIST_RESTORE(sg, snapshot);
 	return (rv);
 }
 
 /*
  * Append the segments of a single multi-page mbuf.
  * If there are insufficient segments, then this fails with EFBIG.
  *
  * The mbuf's data is treated as the concatenation of its header
  * (m_epg_hdr), its pages (m_epg_pa[]) and its trailer (m_epg_trail).
  * 'off' bytes are skipped from the start of that concatenation and the
  * following 'len' bytes are appended.  Header and trailer bytes are
  * appended with sglist_append(); page bytes are appended by physical
  * address with sglist_append_phys().  Processing stops at the first
  * error, which is returned; segments appended before the failure are
  * not removed here.
  */
 int
 sglist_append_mbuf_epg(struct sglist *sg, struct mbuf *m, size_t off,
     size_t len)
 {
 	size_t seglen, segoff;
 	vm_paddr_t paddr;
 	int error, i, pglen, pgoff;
 
 	MBUF_EXT_PGS_ASSERT(m);
 
 	error = 0;
-	if (m->m_ext_pgs.hdr_len != 0) {
-		if (off >= m->m_ext_pgs.hdr_len) {
-			off -= m->m_ext_pgs.hdr_len;
+	if (m->m_epg_hdrlen != 0) {
+		if (off >= m->m_epg_hdrlen) {
+			off -= m->m_epg_hdrlen;
 		} else {
-			seglen = m->m_ext_pgs.hdr_len - off;
+			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = MIN(seglen, len);
 			off = 0;
 			len -= seglen;
 			error = sglist_append(sg,
 			    &m->m_epg_hdr[segoff], seglen);
 		}
 	}
-	pgoff = m->m_ext_pgs.first_pg_off;
-	for (i = 0; i < m->m_ext_pgs.npgs && error == 0 && len > 0; i++) {
+	pgoff = m->m_epg_1st_off;
+	for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		/* This page lies entirely before the requested offset. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = MIN(seglen, len);
 		len -= seglen;
 		paddr = m->m_epg_pa[i] + segoff;
 		error = sglist_append_phys(sg, paddr, seglen);
 		pgoff = 0;
 	};
 	/* Whatever is left must come from the trailer. */
 	if (error == 0 && len > 0) {
-		seglen = MIN(len, m->m_ext_pgs.trail_len - off);
+		seglen = MIN(len, m->m_epg_trllen - off);
 		len -= seglen;
 		error = sglist_append(sg,
 		    &m->m_epg_trail[off], seglen);
 	}
 	if (error == 0)
 		KASSERT(len == 0, ("len != 0"));
 	return (error);
 }
 
 /*
  * Append the segments that describe a single mbuf chain to a
  * scatter/gather list.  If there are insufficient segments, then this
  * fails with EFBIG.
  */
 int
 sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
 {
 	struct sgsave save;
 	struct mbuf *m;
 	int error;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	error = 0;
 	SGLIST_SAVE(sg, save);
 	for (m = m0; m != NULL; m = m->m_next) {
 		if (m->m_len > 0) {
 			if ((m->m_flags & M_NOMAP) != 0)
 				error = sglist_append_mbuf_epg(sg, m,
 				    mtod(m, vm_offset_t), m->m_len);
 			else
 				error = sglist_append(sg, m->m_data,
 				    m->m_len);
 			if (error) {
 				SGLIST_RESTORE(sg, save);
 				return (error);
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Append the segments describing a buffer that spans an array of VM
  * pages.  The buffer begins 'pgoff' bytes into the first page and is
  * 'len' bytes long.  A list with no segment storage is rejected with
  * EINVAL and an empty buffer is a no-op.  If the list runs out of
  * segments it is restored to its state from before the call and the
  * error is returned.
  */
 int
 sglist_append_vmpages(struct sglist *sg, vm_page_t *m, size_t pgoff,
     size_t len)
 {
 	struct sgsave snapshot;
 	struct sglist_seg *tail;
 	vm_paddr_t pa;
 	size_t chunk;
 	int idx, rv;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 	if (len == 0)
 		return (0);
 
 	SGLIST_SAVE(sg, snapshot);
 	idx = 0;
 
 	/* An empty list takes the first page chunk as segment 0 directly. */
 	if (sg->sg_nseg == 0) {
 		chunk = min(PAGE_SIZE - pgoff, len);
 		sg->sg_segs[0].ss_paddr = VM_PAGE_TO_PHYS(m[0]) + pgoff;
 		sg->sg_segs[0].ss_len = chunk;
 		sg->sg_nseg = 1;
 		len -= chunk;
 		pgoff = 0;
 		idx = 1;
 	}
 
 	tail = &sg->sg_segs[sg->sg_nseg - 1];
 	while (len > 0) {
 		chunk = min(PAGE_SIZE - pgoff, len);
 		pa = VM_PAGE_TO_PHYS(m[idx]) + pgoff;
 		rv = _sglist_append_range(sg, &tail, pa, chunk);
 		if (rv != 0) {
 			SGLIST_RESTORE(sg, snapshot);
 			return (rv);
 		}
 		/* Only the first page of the buffer can start mid-page. */
 		pgoff = 0;
 		idx++;
 		len -= chunk;
 	}
 	return (0);
 }
 
 /*
  * Append the segments describing one user address range to a
  * scatter/gather list.  Addresses are translated through the pmap of
  * the process that owns 'td'.  A list with no segment storage is
  * rejected with EINVAL.  If there are insufficient segments, then this
  * fails with EFBIG and the list is restored to its prior state.
  */
 int
 sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td)
 {
 	struct sgsave snapshot;
 	pmap_t pm;
 	int rv;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	pm = vmspace_pmap(td->td_proc->p_vmspace);
 	SGLIST_SAVE(sg, snapshot);
 	rv = _sglist_append_buf(sg, buf, len, pm, NULL);
 	if (rv != 0)
 		SGLIST_RESTORE(sg, snapshot);
 	return (rv);
 }
 
 /*
  * Append a subset of an existing scatter/gather list 'source' to the
  * scatter/gather list 'sg'.  The subset starts 'offset' bytes into
  * 'source' and is 'length' bytes long.
  *
  * Returns EINVAL if 'sg' has no segment storage, if 'length' is zero, or
  * if the requested range is not fully contained in 'source'.  If there
  * are insufficient segments in 'sg', then this fails with EFBIG.  On any
  * failure 'sg' is restored to its state from before the call.
  */
 int
 sglist_append_sglist(struct sglist *sg, struct sglist *source, size_t offset,
     size_t length)
 {
 	struct sgsave save;
 	struct sglist_seg *ss;
 	vm_paddr_t paddr;
 	size_t seglen;
 	int error, i;
 
 	if (sg->sg_maxseg == 0 || length == 0)
 		return (EINVAL);
 	SGLIST_SAVE(sg, save);
 	error = EINVAL;
 
 	/*
 	 * An empty destination has no last segment to extend, so there is
 	 * nothing for _sglist_append_range() to look at yet; the first
 	 * chunk is stored as segment 0 below instead.
 	 */
 	ss = sg->sg_nseg == 0 ? NULL : &sg->sg_segs[sg->sg_nseg - 1];
 	for (i = 0; i < source->sg_nseg; i++) {
 		/* Skip source segments that lie entirely before 'offset'. */
 		if (offset >= source->sg_segs[i].ss_len) {
 			offset -= source->sg_segs[i].ss_len;
 			continue;
 		}
 		seglen = source->sg_segs[i].ss_len - offset;
 		if (seglen > length)
 			seglen = length;
 		paddr = source->sg_segs[i].ss_paddr + offset;
 		if (ss == NULL) {
 			ss = &sg->sg_segs[0];
 			ss->ss_paddr = paddr;
 			ss->ss_len = seglen;
 			sg->sg_nseg = 1;
 			error = 0;
 		} else
 			error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error)
 			break;
 		offset = 0;
 		length -= seglen;
 		if (length == 0)
 			break;
 	}
 
 	/*
 	 * Running off the end of 'source' is EINVAL, but do not let that
 	 * mask an EFBIG reported by the append above.
 	 */
 	if (error == 0 && length != 0)
 		error = EINVAL;
 	if (error)
 		SGLIST_RESTORE(sg, save);
 	return (error);
 }
 
 /*
  * Append the segments that describe a single uio to a scatter/gather
  * list.  If there are insufficient segments, then this fails with
  * EFBIG.
  */
 int
 sglist_append_uio(struct sglist *sg, struct uio *uio)
 {
 	struct iovec *iov;
 	struct sgsave save;
 	size_t resid, minlen;
 	pmap_t pmap;
 	int error, i;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	resid = uio->uio_resid;
 	iov = uio->uio_iov;
 
 	if (uio->uio_segflg == UIO_USERSPACE) {
 		KASSERT(uio->uio_td != NULL,
 		    ("sglist_append_uio: USERSPACE but no thread"));
 		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
 	} else
 		pmap = NULL;
 
 	error = 0;
 	SGLIST_SAVE(sg, save);
 	for (i = 0; i < uio->uio_iovcnt && resid != 0; i++) {
 		/*
 		 * Now at the first iovec to load.  Load each iovec
 		 * until we have exhausted the residual count.
 		 */
 		minlen = MIN(resid, iov[i].iov_len);
 		if (minlen > 0) {
 			error = _sglist_append_buf(sg, iov[i].iov_base, minlen,
 			    pmap, NULL);
 			if (error) {
 				SGLIST_RESTORE(sg, save);
 				return (error);
 			}
 			resid -= minlen;
 		}
 	}
 	return (0);
 }
 
 /*
  * Append the segments that describe at most 'resid' bytes from a
  * single uio to a scatter/gather list.  If there are insufficient
  * segments, then only the amount that fits is appended.
  *
  * Unlike sglist_append_uio(), the uio is consumed: the iovecs, uio_resid
  * and uio_offset are advanced past the bytes that were appended.
  * Returns EINVAL if 'sg' has no segment storage and 0 otherwise; running
  * out of segments only ends the walk early and is not reported.
  */
 int
 sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid)
 {
 	struct iovec *iov;
 	size_t done, len;
 	pmap_t pmap;
 	int error;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	if (uio->uio_segflg == UIO_USERSPACE) {
 		KASSERT(uio->uio_td != NULL,
 		    ("sglist_consume_uio: USERSPACE but no thread"));
 		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
 	} else
 		pmap = NULL;
 
 	error = 0;
 	while (resid > 0 && uio->uio_resid) {
 		iov = uio->uio_iov;
 		/*
 		 * Keep the length in a size_t: iov_len is a size_t, and
 		 * squeezing it into an int could truncate a large iovec
 		 * (even down to 0, which would skip it entirely).
 		 */
 		len = iov->iov_len;
 		if (len == 0) {
 			/* Exhausted iovec; move on to the next one. */
 			uio->uio_iov++;
 			uio->uio_iovcnt--;
 			continue;
 		}
 		if (len > resid)
 			len = resid;
 
 		/*
 		 * Try to append this iovec.  If we run out of room,
 		 * then break out of the loop.  'done' is valid either
 		 * way, so the uio always reflects what was appended.
 		 */
 		error = _sglist_append_buf(sg, iov->iov_base, len, pmap, &done);
 		iov->iov_base = (char *)iov->iov_base + done;
 		iov->iov_len -= done;
 		uio->uio_resid -= done;
 		uio->uio_offset += done;
 		resid -= done;
 		if (error)
 			break;
 	}
 	return (0);
 }
 
 /*
  * Allocate a scatter/gather list sized for, and populated with, a
  * single kernel virtual address range.  'mflags' is passed to the
  * allocator.  Returns NULL for an empty range, if the allocation fails,
  * or if the range cannot be appended.
  */
 struct sglist *
 sglist_build(void *buf, size_t len, int mflags)
 {
 	struct sglist *sg;
 
 	if (len == 0)
 		return (NULL);
 
 	sg = sglist_alloc(sglist_count(buf, len), mflags);
 	if (sg == NULL)
 		return (NULL);
 	if (sglist_append(sg, buf, len) == 0)
 		return (sg);
 
 	sglist_free(sg);
 	return (NULL);
 }
 
 /*
  * Clone a scatter/gather list.  The copy is allocated with the same
  * segment capacity as 'sg' and receives a copy of its in-use segments.
  * Returns NULL if 'sg' is NULL or the allocation fails.
  */
 struct sglist *
 sglist_clone(struct sglist *sg, int mflags)
 {
 	struct sglist *copy;
 
 	if (sg == NULL)
 		return (NULL);
 
 	copy = sglist_alloc(sg->sg_maxseg, mflags);
 	if (copy != NULL) {
 		copy->sg_nseg = sg->sg_nseg;
 		bcopy(sg->sg_segs, copy->sg_segs,
 		    sg->sg_nseg * sizeof(struct sglist_seg));
 	}
 	return (copy);
 }
 
 /*
  * Return the total number of bytes described by a scatter/gather list,
  * i.e. the sum of the lengths of its in-use segments.
  */
 size_t
 sglist_length(struct sglist *sg)
 {
 	size_t total;
 	int idx;
 
 	total = 0;
 	idx = 0;
 	while (idx < sg->sg_nseg) {
 		total += sg->sg_segs[idx].ss_len;
 		idx++;
 	}
 	return (total);
 }
 
 /*
  * Split a scatter/gather list into two lists.  The scatter/gather
  * entries for the first 'length' bytes of the 'original' list are
  * stored in the '*head' list and are removed from 'original'.
  *
  * If '*head' is NULL, then a new list will be allocated using
  * 'mflags'.  If M_NOWAIT is specified and the allocation fails,
  * ENOMEM will be returned.
  *
  * If '*head' is not NULL, it should point to an empty sglist.  If it
  * does not have enough room for the remaining space, then EFBIG will
  * be returned.  If '*head' is not empty, then EINVAL will be
  * returned.
  *
  * If 'original' is shared (refcount > 1), then EDOOFUS will be
  * returned.
  */
 int
 sglist_split(struct sglist *original, struct sglist **head, size_t length,
     int mflags)
 {
 	struct sglist *sg;
 	size_t space, split;
 	int count, i;
 
 	if (original->sg_refs > 1)
 		return (EDOOFUS);
 
 	/* Figure out how big of a sglist '*head' has to hold. */
 	count = 0;
 	space = 0;
 	split = 0;
 	for (i = 0; i < original->sg_nseg; i++) {
 		space += original->sg_segs[i].ss_len;
 		count++;
 		if (space >= length) {
 			/*
 			 * If 'length' falls in the middle of a
 			 * scatter/gather list entry, then 'split'
 			 * holds how much of that entry will remain in
 			 * 'original'.
 			 */
 			split = space - length;
 			break;
 		}
 	}
 
 	/* Nothing to do, so leave head empty. */
 	if (count == 0)
 		return (0);
 
 	if (*head == NULL) {
 		sg = sglist_alloc(count, mflags);
 		if (sg == NULL)
 			return (ENOMEM);
 		*head = sg;
 	} else {
 		sg = *head;
 		if (sg->sg_maxseg < count)
 			return (EFBIG);
 		if (sg->sg_nseg != 0)
 			return (EINVAL);
 	}
 
 	/* Copy 'count' entries to 'sg' from 'original'. */
 	bcopy(original->sg_segs, sg->sg_segs, count *
 	    sizeof(struct sglist_seg));
 	sg->sg_nseg = count;
 
 	/*
 	 * If we had to split a list entry, fixup the last entry in
 	 * 'sg' and the new first entry in 'original'.  We also
 	 * decrement 'count' by 1 since we will only be removing
 	 * 'count - 1' segments from 'original' now.
 	 */
 	if (split != 0) {
 		count--;
 		sg->sg_segs[count].ss_len -= split;
 		original->sg_segs[count].ss_paddr =
 		    sg->sg_segs[count].ss_paddr + split;
 		original->sg_segs[count].ss_len = split;
 	}
 
 	/*
 	 * Trim 'count' entries from the front of 'original' by sliding
 	 * all of the entries that remain down to the start of the array.
 	 * The number of entries to move is the new segment count, not
 	 * 'count'.  The source and destination may overlap, which
 	 * bcopy() permits.
 	 */
 	original->sg_nseg -= count;
 	bcopy(original->sg_segs + count, original->sg_segs,
 	    original->sg_nseg * sizeof(struct sglist_seg));
 	return (0);
 }
 
 /*
  * Append the scatter/gather list elements in 'second' to the
  * scatter/gather list 'first'.  If there is not enough space in
  * 'first', EFBIG is returned.
  */
 int
 sglist_join(struct sglist *first, struct sglist *second)
 {
 	struct sglist_seg *flast, *sfirst;
 	int append;
 
 	/* If 'second' is empty, there is nothing to do. */
 	if (second->sg_nseg == 0)
 		return (0);
 
 	/*
 	 * If the first entry in 'second' can be appended to the last entry
 	 * in 'first' then set append to '1'.
 	 */
 	append = 0;
 	flast = &first->sg_segs[first->sg_nseg - 1];
 	sfirst = &second->sg_segs[0];
 	if (first->sg_nseg != 0 &&
 	    flast->ss_paddr + flast->ss_len == sfirst->ss_paddr)
 		append = 1;
 
 	/* Make sure 'first' has enough room. */
 	if (first->sg_nseg + second->sg_nseg - append > first->sg_maxseg)
 		return (EFBIG);
 
 	/* Merge last in 'first' and first in 'second' if needed. */
 	if (append)
 		flast->ss_len += sfirst->ss_len;
 
 	/* Append new segments from 'second' to 'first'. */
 	bcopy(first->sg_segs + first->sg_nseg, second->sg_segs + append,
 	    (second->sg_nseg - append) * sizeof(struct sglist_seg));
 	first->sg_nseg += second->sg_nseg - append;
 	sglist_reset(second);
 	return (0);
 }
 
 /*
  * Generate a new scatter/gather list from a range of an existing
  * scatter/gather list.  The 'offset' and 'length' parameters specify
  * the logical range of the 'original' list to extract.  If that range
  * is not a subset of the length of 'original', then EINVAL is
  * returned.  The new scatter/gather list is stored in '*slice'.
  * 'original' itself is not modified, and a zero 'length' returns 0
  * without touching '*slice'.
  *
  * If '*slice' is NULL, then a new list will be allocated using
  * 'mflags'.  If M_NOWAIT is specified and the allocation fails,
  * ENOMEM will be returned.
  *
  * If '*slice' is not NULL, it should point to an empty sglist.  If it
  * does not have enough room for the remaining space, then EFBIG will
  * be returned.  If '*slice' is not empty, then EINVAL will be
  * returned.
  */
 int
 sglist_slice(struct sglist *original, struct sglist **slice, size_t offset,
     size_t length, int mflags)
 {
 	struct sglist *sg;
 	size_t space, end, foffs, loffs;
 	int count, i, fseg;
 
 	/* Nothing to do. */
 	if (length == 0)
 		return (0);
 
 	/* Figure out how many segments '*slice' needs to have. */
 	end = offset + length;
 	space = 0;	/* bytes of 'original' covered through segment i */
 	count = 0;	/* segments that overlap the requested range */
 	fseg = 0;
 	foffs = loffs = 0;
 	for (i = 0; i < original->sg_nseg; i++) {
 		space += original->sg_segs[i].ss_len;
 		if (space > offset) {
 			/*
 			 * When we hit the first segment, store its index
 			 * in 'fseg' and the offset into the first segment
 			 * of 'offset' in 'foffs'.
 			 */
 			if (count == 0) {
 				fseg = i;
 				foffs = offset - (space -
 				    original->sg_segs[i].ss_len);
 				CTR1(KTR_DEV, "sglist_slice: foffs = %08lx",
 				    foffs);
 			}
 			count++;
 
 			/*
 			 * When we hit the last segment, break out of
 			 * the loop.  Store the amount of extra space
 			 * at the end of this segment in 'loffs'.
 			 */
 			if (space >= end) {
 				loffs = space - end;
 				CTR1(KTR_DEV, "sglist_slice: loffs = %08lx",
 				    loffs);
 				break;
 			}
 		}
 	}
 
 	/* If we never hit 'end', then 'length' ran off the end, so fail. */
 	if (space < end)
 		return (EINVAL);
 
 	if (*slice == NULL) {
 		sg = sglist_alloc(count, mflags);
 		if (sg == NULL)
 			return (ENOMEM);
 		*slice = sg;
 	} else {
 		sg = *slice;
 		if (sg->sg_maxseg < count)
 			return (EFBIG);
 		if (sg->sg_nseg != 0)
 			return (EINVAL);
 	}
 
 	/*
 	 * Copy over 'count' segments from 'original' starting at
 	 * 'fseg' to 'sg'.
 	 */
 	bcopy(original->sg_segs + fseg, sg->sg_segs,
 	    count * sizeof(struct sglist_seg));
 	sg->sg_nseg = count;
 
 	/*
 	 * Fixup first and last segments if needed: drop the 'foffs' bytes
 	 * before the range from the first segment and the 'loffs' bytes
 	 * after the range from the last one.
 	 */
 	if (foffs != 0) {
 		sg->sg_segs[0].ss_paddr += foffs;
 		sg->sg_segs[0].ss_len -= foffs;
 		CTR2(KTR_DEV, "sglist_slice seg[0]: %08lx:%08lx",
 		    (long)sg->sg_segs[0].ss_paddr, sg->sg_segs[0].ss_len);
 	}
 	if (loffs != 0) {
 		sg->sg_segs[count - 1].ss_len -= loffs;
 		/* NOTE(review): "%08x" is given a size_t here - confirm. */
 		CTR2(KTR_DEV, "sglist_slice seg[%d]: len %08x", count - 1,
 		    sg->sg_segs[count - 1].ss_len);
 	}
 	return (0);
 }
Index: head/sys/kern/uipc_ktls.c
===================================================================
--- head/sys/kern/uipc_ktls.c	(revision 360578)
+++ head/sys/kern/uipc_ktls.c	(revision 360579)
@@ -1,1652 +1,1652 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/kthread.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 #include <machine/pcb.h>
 #endif
 #include <machine/vmparam.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/nhop.h>
 #include <net/rss_config.h>
 #endif
 #include <net/route.h>
 #include <net/route/nhop.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #endif
 #include <netinet/tcp_var.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <opencrypto/xform.h>
 #include <vm/uma_dbg.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 
 /*
  * Work queue serviced by one KTLS worker thread.  ktls_init() sets up
  * one queue (and one thread running ktls_work_thread()) for each CPU.
  * The structure is aligned to a cache line.
  */
 struct ktls_wq {
 	/* Mutex initialized in ktls_init() under the name "ktls work queue". */
 	struct mtx	mtx;
 	/* Tail queue of mbufs. */
 	STAILQ_HEAD(, mbuf) head;
 	/* NOTE(review): presumably set while the worker is active; confirm in ktls_work_thread(). */
 	bool		running;
 } __aligned(CACHE_LINE_SIZE);
 
 /* Array of work queues indexed by CPU id; mp_maxid + 1 entries. */
 static struct ktls_wq *ktls_wq;
 /* Process handle filled in by kproc_kthread_add() for the worker threads. */
 static struct proc *ktls_proc;
 /* Registered software crypto backends, highest priority first. */
 LIST_HEAD(, ktls_crypto_backend) ktls_backends;
 /* Read-mostly lock taken around accesses to ktls_backends. */
 static struct rmlock ktls_backends_lock;
 /* UMA zone that struct ktls_session objects are allocated from. */
 static uma_zone_t ktls_session_zone;
 /* Maps a dense index (0 .. ktls_number_threads - 1) to a CPU id. */
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 
 /* Root of the kern.ipc.tls sysctl tree and its "stats" subtree. */
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload stats");
 
 /*
  * While zero, ktls_crypto_backend_deregister() refuses to remove a
  * registered backend, and ktls_try_sw() skips both the backend lock
  * and the use_count accounting.
  */
 static int ktls_allow_unload;
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, allow_unload, CTLFLAG_RDTUN,
     &ktls_allow_unload, 0, "Allow software crypto modules to unload");
 
 /*
  * Worker thread binding policy consumed by ktls_init(): 0 leaves the
  * threads unbound, 1 binds each thread to its CPU, and a value greater
  * than 1 binds each thread to the NUMA domain of its CPU.
  */
 #ifdef RSS
 static int ktls_bind_threads = 1;
 #else
 static int ktls_bind_threads;
 #endif
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     &ktls_bind_threads, 0,
     "Bind crypto threads to cores or domains at boot");
 
 /* Upper bound applied to params.max_frame_len in ktls_create_session(). */
 static u_int ktls_maxlen = 16384;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN,
     &ktls_maxlen, 0, "Maximum TLS record size");
 
 /* Count of worker threads created by ktls_init(). */
 static int ktls_number_threads;
 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
 /* ktls_enable_rx() and ktls_enable_tx() fail with ENOTSUP while false. */
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RW,
     &ktls_offload_enable, 0,
     "Enable support for kernel TLS offload");
 
 /* Requests for AES-CBC sessions fail with ENOTSUP while false. */
 static bool ktls_cbc_enable = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RW,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
 /*
  * Statistics counters.  Each one is allocated in ktls_init() and
  * exported read-only through the sysctl declared next to it.
  */
 static counter_u64_t ktls_tasks_active;
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
 static counter_u64_t ktls_cnt_on;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, so_inqueue, CTLFLAG_RD,
     &ktls_cnt_on, "Number of TLS records in queue to tasks for SW crypto");
 
 /* Bumped once per successful ktls_enable_rx()/ktls_enable_tx(). */
 static counter_u64_t ktls_offload_total;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
     "Total successful TLS setups (parameters set)");
 
 /* Bumped on every enable call that gets past the ktls_offload_enable check. */
 static counter_u64_t ktls_offload_enable_calls;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
     CTLFLAG_RD, &ktls_offload_enable_calls,
     "Total number of TLS enable calls made");
 
 /* Incremented when a session is allocated, decremented in ktls_cleanup(). */
 static counter_u64_t ktls_offload_active;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
 static counter_u64_t ktls_offload_failed_crypto;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
 
 /* Outcome counters for ktls_set_tx_mode(). */
 static counter_u64_t ktls_switch_to_ifnet;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
 
 static counter_u64_t ktls_switch_to_sw;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
 
 static counter_u64_t ktls_switch_failed;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
 /* Per-mode subtrees: kern.ipc.tls.sw, kern.ipc.tls.ifnet and kern.ipc.tls.toe. */
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Hardware (ifnet) TLS session stats");
 #ifdef TCP_OFFLOAD
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TOE TLS session stats");
 #endif
 
 /*
  * Per-mode, per-cipher session gauges.  Each is incremented by the
  * matching ktls_try_*() routine and decremented in ktls_cleanup().
  */
 static counter_u64_t ktls_sw_cbc;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
     "Active number of software TLS sessions using AES-CBC");
 
 static counter_u64_t ktls_sw_gcm;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
     "Active number of software TLS sessions using AES-GCM");
 
 static counter_u64_t ktls_ifnet_cbc;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_ifnet_cbc,
     "Active number of ifnet TLS sessions using AES-CBC");
 
 static counter_u64_t ktls_ifnet_gcm;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_ifnet_gcm,
     "Active number of ifnet TLS sessions using AES-GCM");
 
 /* Outcome counters for ktls_reset_send_tag(). */
 static counter_u64_t ktls_ifnet_reset;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
 
 static counter_u64_t ktls_ifnet_reset_dropped;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
     &ktls_ifnet_reset_dropped,
     "TLS sessions dropped after failing to update ifnet send tag");
 
 static counter_u64_t ktls_ifnet_reset_failed;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
     &ktls_ifnet_reset_failed,
     "TLS sessions that failed to allocate a new ifnet send tag");
 
 /*
  * Administrative switch checked by ktls_alloc_snd_tag(): while zero,
  * every request that is not 'force'd is refused with ENXIO.
  *
  * NOTE(review): the variable has no C initializer, so it starts at
  * zero, yet the SYSCTL_UINT() below passes 1 as its value argument and
  * the variable is an int rather than a u_int.  Confirm whether the
  * intended default is "permitted".
  */
 static int ktls_ifnet_permitted;
 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
     &ktls_ifnet_permitted, 1,
     "Whether to permit hardware (ifnet) TLS sessions");
 
 #ifdef TCP_OFFLOAD
 static counter_u64_t ktls_toe_cbc;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_toe_cbc,
     "Active number of TOE TLS sessions using AES-CBC");
 
 static counter_u64_t ktls_toe_gcm;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_toe_gcm,
     "Active number of TOE TLS sessions using AES-GCM");
 #endif
 
 /* malloc(9) type used for the work queue array and for copies of session keys. */
 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
 
 /* Forward declarations for routines referenced before their definitions. */
 static void ktls_cleanup(struct ktls_session *tls);
 #if defined(INET) || defined(INET6)
 /* Task handler installed on each session's reset_tag_task. */
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 /* Entry point of the worker threads started by ktls_init(). */
 static void ktls_work_thread(void *ctx);
 
 int
 ktls_crypto_backend_register(struct ktls_crypto_backend *be)
 {
 	struct ktls_crypto_backend *curr_be, *tmp;
 
 	if (be->api_version != KTLS_API_VERSION) {
 		printf("KTLS: API version mismatch (%d vs %d) for %s\n",
 		    be->api_version, KTLS_API_VERSION,
 		    be->name);
 		return (EINVAL);
 	}
 
 	rm_wlock(&ktls_backends_lock);
 	printf("KTLS: Registering crypto method %s with prio %d\n",
 	       be->name, be->prio);
 	if (LIST_EMPTY(&ktls_backends)) {
 		LIST_INSERT_HEAD(&ktls_backends, be, next);
 	} else {
 		LIST_FOREACH_SAFE(curr_be, &ktls_backends, next, tmp) {
 			if (curr_be->prio < be->prio) {
 				LIST_INSERT_BEFORE(curr_be, be, next);
 				break;
 			}
 			if (LIST_NEXT(curr_be, next) == NULL) {
 				LIST_INSERT_AFTER(curr_be, be, next);
 				break;
 			}
 		}
 	}
 	rm_wunlock(&ktls_backends_lock);
 	return (0);
 }
 
 int
 ktls_crypto_backend_deregister(struct ktls_crypto_backend *be)
 {
 	struct ktls_crypto_backend *tmp;
 
 	/*
 	 * Don't error if the backend isn't registered.  This permits
 	 * MOD_UNLOAD handlers to use this function unconditionally.
 	 */
 	rm_wlock(&ktls_backends_lock);
 	LIST_FOREACH(tmp, &ktls_backends, next) {
 		if (tmp == be)
 			break;
 	}
 	if (tmp == NULL) {
 		rm_wunlock(&ktls_backends_lock);
 		return (0);
 	}
 
 	if (!ktls_allow_unload) {
 		rm_wunlock(&ktls_backends_lock);
 		printf(
 		    "KTLS: Deregistering crypto method %s is not supported\n",
 		    be->name);
 		return (EBUSY);
 	}
 
 	if (be->use_count) {
 		rm_wunlock(&ktls_backends_lock);
 		return (EBUSY);
 	}
 
 	LIST_REMOVE(be, next);
 	rm_wunlock(&ktls_backends_lock);
 	return (0);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Pick the CPU (work queue index) that will service a socket's TLS
  * records.
  */
 static u_int
 ktls_get_cpu(struct socket *so)
 {
 	struct inpcb *inp;
 #ifdef RSS
 	u_int rss_cpu;
 #endif
 
 	inp = sotoinpcb(so);
 #ifdef RSS
 	/* Prefer the CPU RSS maps this flow to, when it has one. */
 	rss_cpu = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (rss_cpu != NETISR_CPUID_NONE)
 		return (rss_cpu);
 #endif
 
 	/*
 	 * Otherwise shard on the flowid so a given connection always
 	 * lands on the same queue.  Some crypto backends depend on the
 	 * serialization that this provides.
 	 */
 	return (ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]);
 }
 #endif
 
 /*
  * One-time KTLS setup, run through SYSINIT: allocate the statistics
  * counters, the backend list and its lock, the session zone, and one
  * work queue plus worker thread for every CPU.  Failure to create or
  * bind a worker thread panics.
  */
 static void
 ktls_init(void *dummy __unused)
 {
 	struct ktls_wq *wq;
 	struct thread *td;
 	struct pcpu *pc;
 	cpuset_t mask;
 	int cpu, error;
 
 	ktls_tasks_active = counter_u64_alloc(M_WAITOK);
 	ktls_cnt_on = counter_u64_alloc(M_WAITOK);
 	ktls_offload_total = counter_u64_alloc(M_WAITOK);
 	ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK);
 	ktls_offload_active = counter_u64_alloc(M_WAITOK);
 	ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK);
 	ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK);
 	ktls_switch_to_sw = counter_u64_alloc(M_WAITOK);
 	ktls_switch_failed = counter_u64_alloc(M_WAITOK);
 	ktls_sw_cbc = counter_u64_alloc(M_WAITOK);
 	ktls_sw_gcm = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_cbc = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_gcm = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_reset = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_reset_dropped = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_reset_failed = counter_u64_alloc(M_WAITOK);
 #ifdef TCP_OFFLOAD
 	ktls_toe_cbc = counter_u64_alloc(M_WAITOK);
 	ktls_toe_gcm = counter_u64_alloc(M_WAITOK);
 #endif
 
 	rm_init(&ktls_backends_lock, "ktls backends");
 	LIST_INIT(&ktls_backends);
 
 	/* The queue array is indexed by CPU id, so size it by mp_maxid. */
 	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
 	    M_WAITOK | M_ZERO);
 
 	ktls_session_zone = uma_zcreate("ktls_session",
 	    sizeof(struct ktls_session),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	/* Create a work queue and a worker thread for each CPU. */
 	CPU_FOREACH(cpu) {
 		wq = &ktls_wq[cpu];
 		STAILQ_INIT(&wq->head);
 		mtx_init(&wq->mtx, "ktls work queue", NULL, MTX_DEF);
 		error = kproc_kthread_add(ktls_work_thread, wq,
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", cpu);
 		if (error != 0)
 			panic("Can't add KTLS thread %d error %d", cpu, error);
 
 		/*
 		 * Optionally pin the thread: to its own core when
 		 * ktls_bind_threads is 1, or to the whole NUMA domain
 		 * of that core when it is greater than 1.
 		 */
 		if (ktls_bind_threads != 0) {
 			if (ktls_bind_threads <= 1) {
 				CPU_SETOF(cpu, &mask);
 			} else {
 				pc = pcpu_find(cpu);
 				CPU_COPY(&cpuset_domain[pc->pc_domain], &mask);
 			}
 			error = cpuset_setthread(td->td_tid, &mask);
 			if (error != 0)
 				panic(
 			    "Unable to bind KTLS thread for CPU %d error %d",
 				     cpu, error);
 		}
 
 		/* Record the CPU under the next dense thread index. */
 		ktls_cpuid_lookup[ktls_number_threads++] = cpu;
 	}
 	printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 }
 SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
 
 #if defined(INET) || defined(INET6)
 /*
  * Validate a userland tls_enable request and build a new session from
  * it.
  *
  * so    - socket the session is for; used to select the work queue.
  * en    - request parameters.  en->iv_len may be reset to 0 when the
  *         chosen cipher suite ignores a caller-supplied IV.
  * tlsp  - receives the new session, holding one reference, on success.
  *
  * Returns 0 on success, EINVAL for unsupported or inconsistent
  * parameters, or the error from copyin() when a key or IV cannot be
  * read from userland.
  */
 static int
 ktls_create_session(struct socket *so, struct tls_enable *en,
     struct ktls_session **tlsp)
 {
 	struct ktls_session *tls;
 	int error;
 
 	/* Only TLS 1.0 - 1.3 are supported. */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
 		return (EINVAL);
 	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
 	    en->tls_vminor > TLS_MINOR_VER_THREE)
 		return (EINVAL);
 
 	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
 		return (EINVAL);
 
 	/* All supported algorithms require a cipher key. */
 	if (en->cipher_key_len == 0)
 		return (EINVAL);
 
 	/* No flags are currently supported. */
 	if (en->flags != 0)
 		return (EINVAL);
 
 	/* Common checks for supported algorithms. */
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * auth_algorithm isn't used, but permit GMAC values
 		 * for compatibility.
 		 */
 		switch (en->auth_algorithm) {
 		case 0:
 #ifdef COMPAT_FREEBSD12
 		/* XXX: Really 13.0-current COMPAT. */
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
 #endif
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len != 0)
 			return (EINVAL);
 		if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
 			en->iv_len != TLS_AEAD_GCM_LEN) ||
 		    (en->tls_vminor == TLS_MINOR_VER_THREE &&
 			en->iv_len != TLS_1_3_GCM_IV_LEN))
 			return (EINVAL);
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			/*
 			 * TLS 1.0 requires an implicit IV.  TLS 1.1+
 			 * all use explicit IVs.
 			 */
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
 					return (EINVAL);
 				break;
 			}
 
 			/* FALLTHROUGH */
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			/* Ignore any supplied IV. */
 			en->iv_len = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len == 0)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
 	TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
 
 	tls->wq_index = ktls_get_cpu(so);
 
 	tls->params.cipher_algorithm = en->cipher_algorithm;
 	tls->params.auth_algorithm = en->auth_algorithm;
 	tls->params.tls_vmajor = en->tls_vmajor;
 	tls->params.tls_vminor = en->tls_vminor;
 	tls->params.flags = en->flags;
 	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
 
 	/* Set the header and trailer lengths. */
 	tls->params.tls_hlen = sizeof(struct tls_record_layer);
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
 		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_THREE)
 			tls->params.tls_hlen += sizeof(uint64_t);
 		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
 
 		/*
 		 * TLS 1.3 includes optional padding which we
 		 * do not support, and also puts the "real" record
 		 * type at the end of the encrypted data.
 		 */
 		if (en->tls_vminor == TLS_MINOR_VER_THREE)
 			tls->params.tls_tlen += sizeof(uint8_t);
 
 		tls->params.tls_bs = 1;
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA1_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_256_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_384_HASH_LEN;
 			break;
 		default:
 			/* Unreachable: rejected by the validation above. */
 			panic("invalid hmac");
 		}
 		tls->params.tls_bs = AES_BLOCK_LEN;
 		break;
 	default:
 		/* Unreachable: rejected by the validation above. */
 		panic("invalid cipher");
 	}
 
 	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
 	    ("TLS header length too long: %d", tls->params.tls_hlen));
 	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
 	    ("TLS trailer length too long: %d", tls->params.tls_tlen));
 
 	if (en->auth_key_len != 0) {
 		tls->params.auth_key_len = en->auth_key_len;
 		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
 		    M_WAITOK);
 		error = copyin(en->auth_key, tls->params.auth_key,
 		    en->auth_key_len);
 		if (error)
 			goto out;
 	}
 
 	tls->params.cipher_key_len = en->cipher_key_len;
 	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
 	error = copyin(en->cipher_key, tls->params.cipher_key,
 	    en->cipher_key_len);
 	if (error)
 		goto out;
 
 	/*
 	 * This holds the implicit portion of the nonce for GCM and
 	 * the initial implicit IV for TLS 1.0.  The explicit portions
 	 * of the IV are generated in ktls_frame().
 	 */
 	if (en->iv_len != 0) {
 		tls->params.iv_len = en->iv_len;
 		error = copyin(en->iv, tls->params.iv, en->iv_len);
 		if (error)
 			goto out;
 
 		/*
 		 * For TLS 1.2, generate an 8-byte nonce as a counter
 		 * to generate unique explicit IVs.
 		 *
 		 * Store this counter in the last 8 bytes of the IV
 		 * array so that it is 8-byte aligned.
 		 */
 		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    en->tls_vminor == TLS_MINOR_VER_TWO)
 			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
 	}
 
 	*tlsp = tls;
 	return (0);
 
 out:
 	/*
 	 * Drop the only reference rather than calling ktls_cleanup()
 	 * directly: ktls_cleanup() scrubs the keys but does not hand
 	 * the session back to ktls_session_zone, so calling it alone
 	 * would leak the session.
 	 */
 	ktls_free(tls);
 	return (error);
 }
 
 static struct ktls_session *
 ktls_clone_session(struct ktls_session *tls)
 {
 	struct ktls_session *tls_new;
 
 	tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls_new->refcount, 1);
 
 	/* Copy fields from existing session. */
 	tls_new->params = tls->params;
 	tls_new->wq_index = tls->wq_index;
 
 	/* Deep copy keys. */
 	if (tls_new->params.auth_key != NULL) {
 		tls_new->params.auth_key = malloc(tls->params.auth_key_len,
 		    M_KTLS, M_WAITOK);
 		memcpy(tls_new->params.auth_key, tls->params.auth_key,
 		    tls->params.auth_key_len);
 	}
 
 	tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
 	    M_WAITOK);
 	memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
 	    tls->params.cipher_key_len);
 
 	return (tls_new);
 }
 #endif
 
 /*
  * Undo the accounting and mode-specific state of a session and scrub
  * its key material.  The session structure itself is not released
  * here.
  */
 static void
 ktls_cleanup(struct ktls_session *tls)
 {
 
 	counter_u64_add(ktls_offload_active, -1);
 
 	/* Per-mode gauge and resource release. */
 	switch (tls->mode) {
 	case TCP_TLS_MODE_SW:
 		MPASS(tls->be != NULL);
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			counter_u64_add(ktls_sw_cbc, -1);
 		} else if (tls->params.cipher_algorithm ==
 		    CRYPTO_AES_NIST_GCM_16) {
 			counter_u64_add(ktls_sw_gcm, -1);
 		}
 		/* Let the backend release its per-session state. */
 		tls->free(tls);
 		break;
 	case TCP_TLS_MODE_IFNET:
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			counter_u64_add(ktls_ifnet_cbc, -1);
 		} else if (tls->params.cipher_algorithm ==
 		    CRYPTO_AES_NIST_GCM_16) {
 			counter_u64_add(ktls_ifnet_gcm, -1);
 		}
 		m_snd_tag_rele(tls->snd_tag);
 		break;
 #ifdef TCP_OFFLOAD
 	case TCP_TLS_MODE_TOE:
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			counter_u64_add(ktls_toe_cbc, -1);
 		} else if (tls->params.cipher_algorithm ==
 		    CRYPTO_AES_NIST_GCM_16) {
 			counter_u64_add(ktls_toe_gcm, -1);
 		}
 		break;
 #endif
 	}
 
 	/* Zero each key before freeing it so no secrets linger. */
 	if (tls->params.auth_key != NULL) {
 		explicit_bzero(tls->params.auth_key, tls->params.auth_key_len);
 		free(tls->params.auth_key, M_KTLS);
 		tls->params.auth_key_len = 0;
 		tls->params.auth_key = NULL;
 	}
 	if (tls->params.cipher_key != NULL) {
 		explicit_bzero(tls->params.cipher_key,
 		    tls->params.cipher_key_len);
 		free(tls->params.cipher_key, M_KTLS);
 		tls->params.cipher_key_len = 0;
 		tls->params.cipher_key = NULL;
 	}
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 }
 
 #if defined(INET) || defined(INET6)
 
 #ifdef TCP_OFFLOAD
 static int
 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (tp->tod == NULL) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = tcp_offload_alloc_tls_session(tp, tls, direction);
 	INP_WUNLOCK(inp);
 	if (error == 0) {
 		tls->mode = TCP_TLS_MODE_TOE;
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, 1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, 1);
 			break;
 		}
 	}
 	return (error);
 }
 #endif
 
 /*
  * Common code used when first enabling ifnet TLS on a connection or
  * when allocating a new ifnet TLS session due to a routing change.
  * This function allocates a new TLS send tag on whatever interface
  * the connection is currently routed over.
  *
  * inp   - connection to allocate the tag for.
  * tls   - session passed to the driver in the allocation request.
  * force - when true, bypass the ktls_ifnet_permitted check.
  * mstp  - receives the new send tag on success.
  *
  * Returns 0 on success; ECONNRESET if the connection is gone; ENXIO
  * if ifnet TLS is administratively denied or no cached route exists;
  * EOPNOTSUPP if the interface lacks the required capabilities; or the
  * error returned by the interface's if_snd_tag_alloc method.
  */
 static int
 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	struct tcpcb *tp;
 	int error;
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	/* NOTE(review): 'tp' is assigned here but not read again below. */
 	tp = intotcpcb(inp);
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 *
 	 * - Always permit 'force' requests.
 	 * - ktls_ifnet_permitted == 0: always deny.
 	 */
 	if (!force && ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: Use the cached route in the inpcb to find the
 	 * interface.  This should perhaps instead use
 	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
 	 * enabled after a connection has completed key negotiation in
 	 * userland, the cached route will be present in practice.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	/* Hold the interface so it stays valid after the inp lock is dropped. */
 	if_ref(ifp);
 
 	/* Snapshot the flow parameters while the inp is still locked. */
 	params.hdr.type = IF_SND_TAG_TYPE_TLS;
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	params.tls.inp = inp;
 	params.tls.tls = tls;
 	INP_RUNLOCK(inp);
 
 	/* The interface must support send tags, unmapped mbufs and TX TLS. */
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {	
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	/* NOTE(review): inp_vflag is read after the inp lock was released. */
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = ifp->if_snd_tag_alloc(ifp, &params, mstp);
 out:
 	/* Drop the reference taken by if_ref() above. */
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Try to attach a session to NIC (ifnet) TLS offload by allocating a
  * send tag for the connection.  On success the session takes
  * ownership of the tag and is marked as ifnet mode; otherwise the
  * error from ktls_alloc_snd_tag() is returned and the session is left
  * untouched.
  */
 static int
 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
 {
 	struct m_snd_tag *tag;
 	int error;
 
 	error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &tag);
 	if (error != 0)
 		return (error);
 
 	tls->mode = TCP_TLS_MODE_IFNET;
 	tls->snd_tag = tag;
 	if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 		counter_u64_add(ktls_ifnet_cbc, 1);
 	} else if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) {
 		counter_u64_add(ktls_ifnet_gcm, 1);
 	}
 	return (0);
 }
 
 static int
 ktls_try_sw(struct socket *so, struct ktls_session *tls)
 {
 	struct rm_priotracker prio;
 	struct ktls_crypto_backend *be;
 
 	/*
 	 * Choose the best software crypto backend.  Backends are
 	 * stored in sorted priority order (larget value == most
 	 * important at the head of the list), so this just stops on
 	 * the first backend that claims the session by returning
 	 * success.
 	 */
 	if (ktls_allow_unload)
 		rm_rlock(&ktls_backends_lock, &prio);
 	LIST_FOREACH(be, &ktls_backends, next) {
 		if (be->try(so, tls) == 0)
 			break;
 		KASSERT(tls->cipher == NULL,
 		    ("ktls backend leaked a cipher pointer"));
 	}
 	if (be != NULL) {
 		if (ktls_allow_unload)
 			be->use_count++;
 		tls->be = be;
 	}
 	if (ktls_allow_unload)
 		rm_runlock(&ktls_backends_lock, &prio);
 	if (be == NULL)
 		return (EOPNOTSUPP);
 	tls->mode = TCP_TLS_MODE_SW;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_sw_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_sw_gcm, 1);
 		break;
 	}
 	return (0);
 }
 
 /*
  * Enable kernel TLS on the receive side of a socket.
  *
  * Returns 0 on success; ENOTSUP when KTLS (or AES-CBC, if requested)
  * is disabled; EINVAL for a non-TCP socket or invalid parameters;
  * EALREADY when a receive session is already installed; EOPNOTSUPP
  * when no receive offload is available; or an error from session
  * creation or the offload driver.
  */
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_rcv.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/* TLS RX offload is only supported on TOE currently. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
 #else
 	error = EOPNOTSUPP;
 #endif
 
 	if (error) {
 		/*
 		 * Drop the session's only reference.  ktls_cleanup()
 		 * alone would scrub the keys but never return the
 		 * session to its zone, leaking it on every failure.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_tls_info = tls;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Enable kernel TLS on the send side of a socket.
  *
  * The session is attached to the first mode that accepts it, trying
  * TOE (when compiled in), then ifnet TLS, then software TLS.
  *
  * Returns 0 on success; ENOTSUP when KTLS (or AES-CBC, if requested)
  * is disabled; EINVAL for a non-TCP socket or invalid parameters;
  * EALREADY when a send session is already installed; ENXIO when
  * mb_use_ext_pgs is zero; or an error from session creation, from
  * the last mode tried, or from sblock().
  */
 int
 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_snd.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS requires ext pgs */
 	if (mb_use_ext_pgs == 0)
 		return (ENXIO);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_TX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, false);
 	if (error)
 		error = ktls_try_sw(so, tls);
 
 	if (error) {
 		/*
 		 * Drop the session's only reference.  ktls_cleanup()
 		 * alone would scrub the keys but never return the
 		 * session to its zone, leaking it on every failure.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	error = sblock(&so->so_snd, SBL_WAIT);
 	if (error) {
 		/* As above: release the whole session, not just its keys. */
 		ktls_free(tls);
 		return (error);
 	}
 
 	/* Install the session and seed the record sequence number. */
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
 	if (tls->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	sbunlock(&so->so_snd);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Report the TLS mode of the socket's receive side, or
  * TCP_TLS_MODE_NONE when no receive session is installed.  The caller
  * must hold the inpcb write lock.
  */
 int
 ktls_get_rx_mode(struct socket *so)
 {
 	struct ktls_session *session;
 	struct inpcb *inp;
 	int mode;
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	session = so->so_rcv.sb_tls_info;
 	mode = (session != NULL) ? session->mode : TCP_TLS_MODE_NONE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	return (mode);
 }
 
 /*
  * Report the TLS mode of the socket's send side, or
  * TCP_TLS_MODE_NONE when no send session is installed.  The caller
  * must hold the inpcb write lock.
  */
 int
 ktls_get_tx_mode(struct socket *so)
 {
 	struct ktls_session *session;
 	struct inpcb *inp;
 	int mode;
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK(&so->so_snd);
 	session = so->so_snd.sb_tls_info;
 	mode = (session != NULL) ? session->mode : TCP_TLS_MODE_NONE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	return (mode);
 }
 
 /*
  * Switch between SW and ifnet TLS sessions as requested.
  *
  * The caller must hold the inpcb write lock.  The lock is dropped
  * while the replacement session is built and is re-acquired before
  * returning on every path that dropped it.
  *
  * Returns 0 on success, and also when the socket has no send session
  * or is already in the requested mode.  Returns EINVAL for a mode
  * other than SW or IFNET, EBUSY if the installed session changed
  * while the lock was dropped, or the error from the mode attach
  * routine or from sblock().
  */
 int
 ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
 	int error;
 
 	switch (mode) {
 	case TCP_TLS_MODE_SW:
 	case TCP_TLS_MODE_IFNET:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	if (tls->mode == mode) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	/* Keep the current session alive while the locks are dropped. */
 	tls = ktls_hold(tls);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 
 	/* Build a replacement with the same parameters and keys. */
 	tls_new = ktls_clone_session(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, true);
 	else
 		error = ktls_try_sw(so, tls_new);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	error = sblock(&so->so_snd, SBL_WAIT);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	/*
 	 * If we raced with another session change, keep the existing
 	 * session.
 	 */
 	if (tls != so->so_snd.sb_tls_info) {
 		counter_u64_add(ktls_switch_failed, 1);
 		sbunlock(&so->so_snd);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (EBUSY);
 	}
 
 	/*
 	 * Install the replacement.  Note that SB_TLS_IFNET is only
 	 * ever set here, never cleared.
 	 */
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
 	if (tls_new->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	sbunlock(&so->so_snd);
 
 	/*
 	 * Drop two references on 'tls'.  The first is for the
 	 * ktls_hold() above.  The second drops the reference from the
 	 * socket buffer.
 	 */
 	KASSERT(tls->refcount >= 2, ("too few references on old session"));
 	ktls_free(tls);
 	ktls_free(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		counter_u64_add(ktls_switch_to_ifnet, 1);
 	else
 		counter_u64_add(ktls_switch_to_sw, 1);
 
 	/* Return with the inpcb write-locked, as on entry. */
 	INP_WLOCK(inp);
 	return (0);
 }
 
 /*
  * Try to allocate a new TLS send tag.  This task is scheduled when
  * ip_output detects a route change while trying to transmit a packet
  * holding a TLS record.  If a new tag is allocated, replace the tag
  * in the TLS session.  Subsequent packets on the connection will use
  * the new tag.  If a new tag cannot be allocated, drop the
  * connection.
  *
  * context - the struct ktls_session whose send tag must be replaced.
  * pending - task run count; asserted to be exactly 1.
  *
  * Both exits release one inpcb reference (in_pcbrele_wlocked()) and
  * one session reference (ktls_free()).
  * NOTE(review): presumably these were taken by whoever scheduled the
  * task; confirm against ktls_output_eagain().
  */
 static void
 ktls_reset_send_tag(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct ktls_session *tls;
 	struct m_snd_tag *old, *new;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	inp = tls->inp;
 
 	/*
 	 * Free the old tag first before allocating a new one.
 	 * ip[6]_output_send() will treat a NULL send tag the same as
 	 * an ifp mismatch and drop packets until a new tag is
 	 * allocated.
 	 *
 	 * Write-lock the INP when changing tls->snd_tag since
 	 * ip[6]_output_send() holds a read-lock when reading the
 	 * pointer.
 	 */
 	INP_WLOCK(inp);
 	old = tls->snd_tag;
 	tls->snd_tag = NULL;
 	INP_WUNLOCK(inp);
 	if (old != NULL)
 		m_snd_tag_rele(old);
 
 	/* 'force' is true: a reset is never subject to the admin knob. */
 	error = ktls_alloc_snd_tag(inp, tls, true, &new);
 
 	if (error == 0) {
 		INP_WLOCK(inp);
 		tls->snd_tag = new;
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		/* Unlock only if in_pcbrele_wlocked() returned false. */
 		if (!in_pcbrele_wlocked(inp))
 			INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
 		/*
 		 * XXX: Should we kick tcp_output explicitly now that
 		 * the send tag is fixed or just rely on timers?
 		 */
 	} else {
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		if (!in_pcbrele_wlocked(inp)) {
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED)) {
 				tp = intotcpcb(inp);
 				CURVNET_SET(tp->t_vnet);
 				tp = tcp_drop(tp, ECONNABORTED);
 				CURVNET_RESTORE();
 				/* Unlock only if tcp_drop() returned a tcpcb. */
 				if (tp != NULL)
 					INP_WUNLOCK(inp);
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
 			} else
 				INP_WUNLOCK(inp);
 		}
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 
 		/*
 		 * Leave reset_pending true to avoid future tasks while
 		 * the socket goes away.
 		 */
 	}
 
 	/* Release one session reference. */
 	ktls_free(tls);
 }
 
 /*
  * Request a send tag reset for 'tls' and tell the caller to drop the
  * packet.
  *
  * Always returns ENOBUFS.  With a NULL 'inp' nothing else is done.
  * Otherwise the inpcb lock must be held; if no reset is already
  * pending, a reference is taken on both the session and the inpcb,
  * the inpcb is recorded in the session, reset_pending is set and
  * tls->reset_tag_task is enqueued on taskqueue_thread.
  */
 int
 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 {
 
 	if (inp == NULL)
 		return (ENOBUFS);
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * See if we should schedule a task to update the send tag for
 	 * this session.
 	 */
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (!tls->reset_pending) {
 		/* References are taken before the task can run. */
 		(void) ktls_hold(tls);
 		in_pcbref(inp);
 		tls->inp = inp;
 		tls->reset_pending = true;
 		taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	}
 	mtx_pool_unlock(mtxpool_sleep, tls);
 	return (ENOBUFS);
 }
 #endif
 
 /*
  * Release a TLS session.
  *
  * Runs ktls_cleanup() on the session, drops this session's count on
  * its backend (only when a backend is attached and ktls_allow_unload
  * is set), and returns the session memory to ktls_session_zone.
  *
  * NOTE(review): use_count is decremented while holding only the read
  * side of ktls_backends_lock -- confirm that this is sufficient.
  */
 void
 ktls_destroy(struct ktls_session *tls)
 {
 	struct rm_priotracker prio;
 
 	ktls_cleanup(tls);
 	if (tls->be != NULL && ktls_allow_unload) {
 		rm_rlock(&ktls_backends_lock, &prio);
 		tls->be->use_count--;
 		rm_runlock(&ktls_backends_lock, &prio);
 	}
 	uma_zfree(ktls_session_zone, tls);
 }
 
 /*
  * Stamp every mbuf in the chain starting at 'm' with the next TLS
  * sequence number from 'sb', advancing sb->sb_tls_seqno by one per
  * mbuf.  Every mbuf in the chain must be unmapped (M_NOMAP).
  */
 void
 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 {
 
 	for (; m != NULL; m = m->m_next) {
 		KASSERT((m->m_flags & M_NOMAP) != 0,
 		    ("ktls_seq: mapped mbuf %p", m));
 
-		m->m_ext_pgs.seqno = sb->sb_tls_seqno;
+		m->m_epg_seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 	}
 }
 
 /*
  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
  * mbuf in the chain must be an unmapped mbuf.  The payload of the
  * mbuf must be populated with the payload of each TLS record.
  *
  * The record_type argument specifies the TLS record type used when
  * populating the TLS header.
  *
  * The enq_cnt argument on return is set to the number of pages of
  * payload data for this entire chain that need to be encrypted via SW
  * encryption.  The returned value should be passed to ktls_enqueue
  * when scheduling encryption of this chain of mbufs.  It stays zero
  * when the session is not in SW mode.
  *
  * Each mbuf takes its own reference on the session via ktls_hold().
  */
 void
 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
     uint8_t record_type)
 {
 	struct tls_record_layer *tlshdr;
 	struct mbuf *m;
 	uint64_t *noncep;
 	uint16_t tls_len;
 	int maxlen;
 
 	maxlen = tls->params.max_frame_len;
 	*enq_cnt = 0;
 	for (m = top; m != NULL; m = m->m_next) {
 		/*
 		 * All mbufs in the chain should be non-empty TLS
 		 * records whose payload does not exceed the maximum
 		 * frame length.
 		 */
 		KASSERT(m->m_len <= maxlen && m->m_len > 0,
 		    ("ktls_frame: m %p len %d\n", m, m->m_len));
 		/*
 		 * TLS frames require unmapped mbufs to store session
 		 * info.
 		 */
 		KASSERT((m->m_flags & M_NOMAP) != 0,
 		    ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));
 
 		/* Payload length before header and trailer are added. */
 		tls_len = m->m_len;
 
 		/* Save a reference to the session. */
-		m->m_ext_pgs.tls = ktls_hold(tls);
+		m->m_epg_tls = ktls_hold(tls);
 
-		m->m_ext_pgs.hdr_len = tls->params.tls_hlen;
-		m->m_ext_pgs.trail_len = tls->params.tls_tlen;
+		m->m_epg_hdrlen = tls->params.tls_hlen;
+		m->m_epg_trllen = tls->params.tls_tlen;
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			int bs, delta;
 
 			/*
 			 * AES-CBC pads messages to a multiple of the
 			 * block size.  Note that the padding is
 			 * applied after the digest and the encryption
 			 * is done on the "plaintext || mac || padding".
 			 * At least one byte of padding is always
 			 * present.
 			 *
 			 * Compute the final trailer length assuming
 			 * at most one block of padding.
 			 * tls->params.tls_tlen is the maximum
 			 * possible trailer length (padding + digest).
 			 * delta holds the number of excess padding
 			 * bytes if the maximum were used.  Those
 			 * extra bytes are removed.
 			 */
 			bs = tls->params.tls_bs;
 			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
-			m->m_ext_pgs.trail_len -= delta;
+			m->m_epg_trllen -= delta;
 		}
-		m->m_len += m->m_ext_pgs.hdr_len + m->m_ext_pgs.trail_len;
+		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 
 		/* Populate the TLS header. */
 		tlshdr = (void *)m->m_epg_hdr;
 		tlshdr->tls_vmajor = tls->params.tls_vmajor;
 
 		/*
 		 * TLS 1.3 masquerades as TLS 1.2 with a record type
 		 * of TLS_RLTYPE_APP.
 		 */
 		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 			tlshdr->tls_type = TLS_RLTYPE_APP;
 			/* save the real record type for later */
-			m->m_ext_pgs.record_type = record_type;
+			m->m_epg_record_type = record_type;
 			m->m_epg_trail[0] = record_type;
 		} else {
 			tlshdr->tls_vminor = tls->params.tls_vminor;
 			tlshdr->tls_type = record_type;
 		}
 		/* The length field excludes the record header itself. */
 		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
 		/*
 		 * Store nonces / explicit IVs after the end of the
 		 * TLS header.
 		 *
 		 * For GCM with TLS 1.2, an 8 byte nonce is copied
 		 * from the end of the IV.  The nonce is then
 		 * incremented for use by the next record.
 		 *
 		 * For CBC, a random nonce is inserted for TLS 1.1+.
 		 */
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 			noncep = (uint64_t *)(tls->params.iv + 8);
 			be64enc(tlshdr + 1, *noncep);
 			(*noncep)++;
 		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 
 		/*
 		 * When using SW encryption, mark the mbuf not ready.
 		 * It will be marked ready via sbready() after the
 		 * record has been encrypted.
 		 *
 		 * When using ifnet TLS, unencrypted TLS records are
 		 * sent down the stack to the NIC.
 		 */
 		if (tls->mode == TCP_TLS_MODE_SW) {
 			m->m_flags |= M_NOTREADY;
-			m->m_ext_pgs.nrdy = m->m_ext_pgs.npgs;
-			*enq_cnt += m->m_ext_pgs.npgs;
+			m->m_epg_nrdy = m->m_epg_npgs;
+			*enq_cnt += m->m_epg_npgs;
 		}
 	}
 }
 
 /*
  * Hand an mbuf to the TLS work queue for deferred freeing.
  *
  * The mbuf is flagged EPG_FLAG_2FREE and appended to the work queue
  * selected by its session's wq_index; the worker is woken if it is
  * not already running.  ktls_work_thread() releases the session
  * reference and frees mbufs carrying this flag.
  */
 void
 ktls_enqueue_to_free(struct mbuf *m)
 {
 	struct ktls_wq *wq;
 	bool running;
 
 	/* Mark it for freeing. */
-	m->m_ext_pgs.flags |= EPG_FLAG_2FREE;
-	wq = &ktls_wq[m->m_ext_pgs.tls->wq_index];
+	m->m_epg_flags |= EPG_FLAG_2FREE;
+	wq = &ktls_wq[m->m_epg_tls->wq_index];
 	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->head, m, m_ext_pgs.stailq);
+	STAILQ_INSERT_TAIL(&wq->head, m, m_epg_stailq);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	/* The wakeup is issued after the queue mutex is dropped. */
 	if (!running)
 		wakeup(wq);
 }
 
 /*
  * Queue the TLS record chain starting at 'm' for SW encryption.
  *
  * 'm' must be an unmapped, not-ready mbuf belonging to a SW-mode
  * session.  'page_count' (non-zero) is stored in the mbuf as the
  * number of pages to encrypt; ktls_encrypt() later uses it to decide
  * when the chain is finished.  The mbuf is appended to the work queue
  * selected by the session's wq_index and the worker is woken if it is
  * not already running.  ktls_cnt_on is incremented for each call.
  */
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
 	struct ktls_wq *wq;
 	bool running;
 
 	KASSERT(((m->m_flags & (M_NOMAP | M_NOTREADY)) ==
 	    (M_NOMAP | M_NOTREADY)),
 	    ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 	KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 
-	KASSERT(m->m_ext_pgs.tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
+	KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 
-	m->m_ext_pgs.enc_cnt = page_count;
+	m->m_epg_enc_cnt = page_count;
 
 	/*
 	 * Save a pointer to the socket.  The caller is responsible
 	 * for taking an additional reference via soref().
 	 */
-	m->m_ext_pgs.so = so;
+	m->m_epg_so = so;
 
-	wq = &ktls_wq[m->m_ext_pgs.tls->wq_index];
+	wq = &ktls_wq[m->m_epg_tls->wq_index];
 	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->head, m, m_ext_pgs.stailq);
+	STAILQ_INSERT_TAIL(&wq->head, m, m_epg_stailq);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_on, 1);
 }
 
 /*
  * Encrypt the TLS records queued with 'top' using the session's SW
  * backend.
  *
  * The socket and session are read from 'top'.  Records are processed
  * one mbuf at a time until the page count saved by ktls_enqueue() has
  * been consumed.  On success the records are reported ready through
  * the protocol's pru_ready method; on failure the connection is
  * aborted, so_error is set to EIO and the not-ready mbufs are freed.
  * In both cases the socket reference is released before returning.
  */
 static __noinline void
 ktls_encrypt(struct mbuf *top)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	vm_paddr_t parray[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
 	struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
 	struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
 	vm_page_t pg;
 	int error, i, len, npages, off, total_pages;
 	bool is_anon;
 
-	so = top->m_ext_pgs.so;
-	tls = top->m_ext_pgs.tls;
+	so = top->m_epg_so;
+	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
-	top->m_ext_pgs.so = NULL;
+	top->m_epg_so = NULL;
 #endif
-	total_pages = top->m_ext_pgs.enc_cnt;
+	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	/*
 	 * Encrypt the TLS records in the chain of mbufs starting with
 	 * 'top'.  'total_pages' gives us a total count of pages and is
 	 * used to know when we have finished encrypting the TLS
 	 * records originally queued with 'top'.
 	 *
 	 * NB: These mbufs are queued in the socket buffer and
 	 * 'm_next' is traversing the mbufs in the socket buffer.  The
 	 * socket buffer lock is not held while traversing this chain.
 	 * Since the mbufs are all marked M_NOTREADY their 'm_next'
 	 * pointers should be stable.  However, the 'm_next' of the
 	 * last mbuf encrypted is not necessarily NULL.  It can point
 	 * to other mbufs appended while 'top' was on the TLS work
 	 * queue.
 	 *
 	 * Each mbuf holds an entire TLS record.
 	 */
 	error = 0;
 	for (m = top; npages != total_pages; m = m->m_next) {
-		KASSERT(m->m_ext_pgs.tls == tls,
+		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
-		    tls, m->m_ext_pgs.tls));
+		    tls, m->m_epg_tls));
 		KASSERT((m->m_flags & (M_NOMAP | M_NOTREADY)) ==
 		    (M_NOMAP | M_NOTREADY),
 		    ("%p not unready & nomap mbuf (top = %p)\n", m, top));
-		KASSERT(npages + m->m_ext_pgs.npgs <= total_pages,
+		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		/*
 		 * Generate source and destination iovecs to pass to
 		 * the SW encryption backend.  For writable mbufs, the
 		 * destination iovec is a copy of the source and
 		 * encryption is done in place.  For file-backed mbufs
 		 * (from sendfile), anonymous wired pages are
 		 * allocated and assigned to the destination iovec.
 		 */
-		is_anon = (m->m_ext_pgs.flags & EPG_FLAG_ANON) != 0;
+		is_anon = (m->m_epg_flags & EPG_FLAG_ANON) != 0;
 
 		/* Only the first page starts at a non-zero offset. */
-		off = m->m_ext_pgs.first_pg_off;
-		for (i = 0; i < m->m_ext_pgs.npgs; i++, off = 0) {
+		off = m->m_epg_1st_off;
+		for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 			len = m_epg_pagelen(m, i, off);
 			src_iov[i].iov_len = len;
 			src_iov[i].iov_base =
 			    (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) +
 				off;
 
 			if (is_anon) {
 				dst_iov[i].iov_base = src_iov[i].iov_base;
 				dst_iov[i].iov_len = src_iov[i].iov_len;
 				continue;
 			}
 			/* Keep retrying until a replacement page is obtained. */
 retry_page:
 			pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED);
 			if (pg == NULL) {
 				vm_wait(NULL);
 				goto retry_page;
 			}
 			parray[i] = VM_PAGE_TO_PHYS(pg);
 			dst_iov[i].iov_base =
 			    (char *)(void *)PHYS_TO_DMAP(parray[i]) + off;
 			dst_iov[i].iov_len = len;
 		}
 
 		npages += i;
 
 		error = (*tls->sw_encrypt)(tls,
 		    (const struct tls_record_layer *)m->m_epg_hdr,
-		    m->m_epg_trail, src_iov, dst_iov, i, m->m_ext_pgs.seqno,
-		    m->m_ext_pgs.record_type);
+		    m->m_epg_trail, src_iov, dst_iov, i, m->m_epg_seqno,
+		    m->m_epg_record_type);
 		/*
 		 * NOTE(review): when this record is file-backed, the
 		 * pages allocated into parray[] above have not yet been
 		 * attached to the mbuf at this point, and no code visible
 		 * here releases them on this error path -- confirm.
 		 */
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			break;
 		}
 
 		/*
 		 * For file-backed mbufs, release the file-backed
 		 * pages and replace them in the ext_pgs array with
 		 * the anonymous wired pages allocated above.
 		 */
 		if (!is_anon) {
 			/* Free the old pages. */
 			m->m_ext.ext_free(m);
 
 			/* Replace them with the new pages. */
-			for (i = 0; i < m->m_ext_pgs.npgs; i++)
+			for (i = 0; i < m->m_epg_npgs; i++)
 				m->m_epg_pa[i] = parray[i];
 
 			/* Use the basic free routine. */
 			m->m_ext.ext_free = mb_free_mext_pgs;
 
 			/* Pages are now writable. */
-			m->m_ext_pgs.flags |= EPG_FLAG_ANON;
+			m->m_epg_flags |= EPG_FLAG_ANON;
 		}
 
 		/*
 		 * Drop a reference to the session now that it is no
 		 * longer needed.  Existing code depends on encrypted
 		 * records having no associated session vs
 		 * yet-to-be-encrypted records having an associated
 		 * session.
 		 */
-		m->m_ext_pgs.tls = NULL;
+		m->m_epg_tls = NULL;
 		ktls_free(tls);
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error == 0) {
 		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
 	} else {
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(top, total_pages);
 	}
 
 	/*
 	 * Release the socket reference that the ktls_enqueue() caller
 	 * took.  No SOCK_UNLOCK follows here, so sorele() presumably
 	 * drops the socket lock itself -- confirm.
 	 */
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Main loop of a TLS work queue thread; 'ctx' is the struct ktls_wq
  * it services.  The loop never exits.
  *
  * The thread sleeps on the queue while it is empty, recording that
  * state in wq->running so that enqueuers know to issue a wakeup.
  * When work arrives the whole queue is moved to a private list under
  * the queue mutex and then processed with the mutex dropped: mbufs
  * flagged EPG_FLAG_2FREE are freed, all others are encrypted.
  */
 static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
 	struct mbuf *m, *n;
 	STAILQ_HEAD(, mbuf) local_head;
 
 	/*
 	 * Presumably lets this kernel thread use the FPU/SIMD unit for
 	 * the crypto backends on these architectures -- confirm.
 	 */
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
 		while (STAILQ_EMPTY(&wq->head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
 		/* Take everything queued so far in one step. */
 		STAILQ_INIT(&local_head);
 		STAILQ_CONCAT(&local_head, &wq->head);
 		mtx_unlock(&wq->mtx);
 
-		STAILQ_FOREACH_SAFE(m, &local_head, m_ext_pgs.stailq, n) {
-			if (m->m_ext_pgs.flags & EPG_FLAG_2FREE) {
-				ktls_free(m->m_ext_pgs.tls);
+		STAILQ_FOREACH_SAFE(m, &local_head, m_epg_stailq, n) {
+			if (m->m_epg_flags & EPG_FLAG_2FREE) {
+				ktls_free(m->m_epg_tls);
 				uma_zfree(zone_mbuf, m);
 			} else {
 				ktls_encrypt(m);
 				counter_u64_add(ktls_cnt_on, -1);
 			}
 		}
 	}
 }
Index: head/sys/kern/uipc_mbuf.c
===================================================================
--- head/sys/kern/uipc_mbuf.c	(revision 360578)
+++ head/sys/kern/uipc_mbuf.c	(revision 360579)
@@ -1,2134 +1,2134 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mbuf_profiling.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #include <sys/sdt.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 
 /*
  * Statically-defined tracing (SDT) probes for mbuf operations, all
  * under the "sdt" provider.  The string arguments come in pairs;
  * presumably each pair is a native argument type followed by the type
  * it is translated to -- confirm against <sys/sdt.h>.  The sites that
  * fire these probes are not in this part of the file.
  */
 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t",
     "void*", "void*");
 
 /* This probe is defined without arguments. */
 SDT_PROBE_DEFINE(sdt, , , m__cljset);
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
         "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
     "struct mbuf *", "mbufinfo_t *");
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Header-size globals.  Their meanings are given by the descriptions
  * of the read-only sysctls that export them below; the code that
  * assigns them is not in this part of the file.
  */
 int	max_linkhdr;
 int	max_protohdr;
 int	max_hdr;
 int	max_datalen;
 /* Counters and a tunable that exist only in MBUF_STRESS_TEST kernels. */
 #ifdef MBUF_STRESS_TEST
 int	m_defragpackets;
 int	m_defragbytes;
 int	m_defraguseless;
 int	m_defragfailure;
 int	m_defragrandomfailures;
 #endif
 
 /*
  * sysctl(8) exported objects
  */
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
 	   &max_linkhdr, 0, "Size of largest link layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
 	   &max_protohdr, 0, "Size of largest protocol layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
 	   &max_hdr, 0, "Size of largest link plus protocol header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
 /*
  * The stress-test sysctls carry no description strings; only
  * m_defragrandomfailures is writable (CTLFLAG_RW).
  */
 #ifdef MBUF_STRESS_TEST
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
 	   &m_defragpackets, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
 	   &m_defragbytes, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
 	   &m_defraguseless, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
 	   &m_defragfailure, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
 	   &m_defragrandomfailures, 0, "");
 #endif
 
 /*
  * Ensure the correct size of various mbuf parameters.  It could be off due
  * to compiler-induced padding and alignment artifacts.
  */
 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
 
 /*
  * mbuf data storage should be 64-bit aligned regardless of architectural
  * pointer size; check this is the case with and without a packet header.
  */
 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
 
 /*
  * While the specific values here don't matter too much (i.e., +/- a few
  * words), we do want to ensure that changes to these values are carefully
  * reasoned about and properly documented.  This is especially the case as
  * network-protocol and device-driver modules encode these layouts, and must
  * be recompiled if the structures change.  Check these values at compile time
  * against the ones documented in comments in mbuf.h.
  *
  * NB: Possibly they should be documented there via #define's and not just
  * comments.
  */
 /* Expected layout when __LP64__ is defined. */
 #if defined(__LP64__)
 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
 CTASSERT(sizeof(struct pkthdr) == 56);
 CTASSERT(sizeof(struct m_ext) == 160);
 /* Expected layout on all other platforms. */
 #else
 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
 CTASSERT(sizeof(struct pkthdr) == 48);
 CTASSERT(sizeof(struct m_ext) == 180);
 #endif
 
 /*
  * Assert that the queue(3) macros produce code of the same size as an old
  * plain pointer does.
  */
 #ifdef INVARIANTS
 /* Exists only so that sizeof can be applied to its members below. */
 static struct mbuf __used m_assertbuf;
 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
 #endif
 
 /*
  * Attach the cluster from *m to *n, set up m_ext in *n
  * and bump the refcount of the cluster.
  *
  * 'm' must have M_EXT set and 'n' must not.  On return 'n' has M_EXT
  * set and inherits M_RDONLY and M_NOMAP from 'm'.
  */
 void
 mb_dupcl(struct mbuf *n, struct mbuf *m)
 {
 	volatile u_int *refcnt;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
 
 	/*
 	 * Cache access optimization.
 	 *
 	 * o Regular M_EXT storage doesn't need full copy of m_ext, since
 	 *   the holder of the 'ext_count' is responsible to carry the free
 	 *   routine and its arguments.
 	 * o EXT_PGS data is split between main part of mbuf and m_ext, the
 	 *   main part is copied in full, the m_ext part is similar to M_EXT.
 	 * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is
 	 *   special - it needs full copy of m_ext into each mbuf, since any
 	 *   copy could end up as the last to free.
 	 */
 	switch (m->m_ext.ext_type) {
 	case EXT_PGS:
-		bcopy(&m->m_ext, &n->m_ext, m_epg_copylen);
-		bcopy(&m->m_ext_pgs, &n->m_ext_pgs,
-		    sizeof(struct mbuf_ext_pgs));
+		bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy,
+		    __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy));
+		bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen);
 		break;
 	case EXT_EXTREF:
 		bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext));
 		break;
 	default:
 		bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
 	}
 
 	n->m_flags |= M_EXT;
 	n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		/* Point 'n' at the counter embedded in 'm'. */
 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 	}
 
 	/*
 	 * A count of 1 is bumped with a plain store, anything else with
 	 * an atomic add; presumably a count of 1 means no other holder
 	 * can be updating it concurrently -- confirm.
 	 */
 	if (*refcnt == 1)
 		*refcnt += 1;
 	else
 		atomic_add_int(refcnt, 1);
 }
 
 /*
  * Turn a packet-header mbuf into a plain one: delete its tag chain,
  * clear M_PKTHDR and zero the pkthdr area.  'm' must have M_PKTHDR
  * set.
  */
 void
 m_demote_pkthdr(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	m_tag_delete_chain(m, NULL);
 	m->m_flags &= ~M_PKTHDR;
 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
 }
 
 /*
  * Clean up mbuf (chain) from any tags and packet headers.
  * If "all" is set then the first mbuf in the chain will be
  * cleaned too.
  *
  * Every visited mbuf keeps only the M_EXT, M_RDONLY, M_NOFREE and
  * M_NOMAP flags plus whatever extra bits the caller passes in "flags";
  * all other flag bits are cleared.  Each visited mbuf is asserted to
  * have no m_nextpkt.
  */
 void
 m_demote(struct mbuf *m0, int all, int flags)
 {
 	struct mbuf *m;
 
 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
 		    __func__, m, m0));
 		if (m->m_flags & M_PKTHDR)
 			m_demote_pkthdr(m);
 		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
 		    M_NOMAP | flags);
 	}
 }
 
 /*
  * Sanity checks on mbuf (chain) for use in KASSERT() and general
  * debugging.
  * Returns 0 or panics when bad and 1 on all tests passed.
  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
  * blow up later.
  *
  * NOTE(review): the only return statement in the body returns 1, so
  * the "returns 0" case above is not reachable in the code as written.
  * M_SANITY_ACTION panics under INVARIANTS and otherwise only prints.
  */
 int
 m_sanity(struct mbuf *m0, int sanitize)
 {
 	struct mbuf *m;
 	caddr_t a, b;
 	int pktlen = 0;
 
 #ifdef INVARIANTS
 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
 #else
 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
 #endif
 
 	for (m = m0; m != NULL; m = m->m_next) {
 		/*
 		 * Basic pointer checks.  If any of these fails then some
 		 * unrelated kernel memory before or after us is trashed.
 		 * No way to recover from that.
 		 */
 		a = M_START(m);
 		b = a + M_SIZE(m);
 		if ((caddr_t)m->m_data < a)
 			M_SANITY_ACTION("m_data outside mbuf data range left");
 		if ((caddr_t)m->m_data > b)
 			M_SANITY_ACTION("m_data outside mbuf data range right");
 		if ((caddr_t)m->m_data + m->m_len > b)
 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
 
 		/* m->m_nextpkt may only be set on first mbuf in chain. */
 		if (m != m0 && m->m_nextpkt != NULL) {
 			if (sanitize) {
 				m_freem(m->m_nextpkt);
 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
 			} else
 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
 		}
 
 		/* packet length (not mbuf length!) calculation */
 		if (m0->m_flags & M_PKTHDR)
 			pktlen += m->m_len;
 
 		/* m_tags may only be attached to first mbuf in chain. */
 		if (m != m0 && m->m_flags & M_PKTHDR &&
 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
 			if (sanitize) {
 				m_tag_delete_chain(m, NULL);
 				/* put in 0xDEADC0DE perhaps? */
 			} else
 				M_SANITY_ACTION("m_tags on in-chain mbuf");
 		}
 
 		/* M_PKTHDR may only be set on first mbuf in chain */
 		if (m != m0 && m->m_flags & M_PKTHDR) {
 			if (sanitize) {
 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 				m->m_flags &= ~M_PKTHDR;
 				/* put in 0xDEADCODE and leave hdr flag in */
 			} else
 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
 		}
 	}
 	/*
 	 * Compare the summed length with the header's length.  A summed
 	 * length of zero skips the comparison.
 	 */
 	m = m0;
 	if (pktlen && pktlen != m->m_pkthdr.len) {
 		if (sanitize)
 			m->m_pkthdr.len = 0;
 		else
 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
 	}
 	return 1;
 
 #undef	M_SANITY_ACTION
 }
 
 /*
  * Non-inlined part of m_init().
  *
  * Points m_data at the packet-header data area and zeroes the pkthdr.
  * With NUMA the header's numa_domain is set to M_NODOM.  With MAC the
  * label is initialised through mac_mbuf_init(), whose error (if any)
  * is returned.  Returns 0 otherwise.
  */
 int
 m_pkthdr_init(struct mbuf *m, int how)
 {
 #ifdef MAC
 	int error;
 #endif
 	m->m_data = m->m_pktdat;
 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 #ifdef NUMA
 	m->m_pkthdr.numa_domain = M_NODOM;
 #endif
 #ifdef MAC
 	/* If the label init fails, fail the alloc */
 	error = mac_mbuf_init(m, how);
 	if (error)
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * "Move" mbuf pkthdr from "from" to "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  *
  * After the call "from" no longer has M_PKTHDR set, its tag list is
  * empty and any send tag it carried has been handed to "to".
  */
 void
 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
 {
 
 #if 0
 	/* see below for why these are not enabled */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
 	    ("m_move_pkthdr: to has tags"));
 #endif
 #ifdef MAC
 	/*
 	 * XXXMAC: It could be this should also occur for non-MAC?
 	 */
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	/*
 	 * Take the copyable flags from "from" but keep "to"'s own
 	 * M_EXT and M_NOMAP bits.
 	 */
 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
 	    (to->m_flags & (M_EXT | M_NOMAP));
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
 	from->m_flags &= ~M_PKTHDR;
 	/*
 	 * The snd_tag pointer was copied to "to" above without taking
 	 * a new reference, so remove it from "from".
 	 */
 	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 		from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		from->m_pkthdr.snd_tag = NULL;
 	}
 }
 
 /*
  * Duplicate "from"'s mbuf pkthdr in "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  * In particular, this does a deep copy of the packet tags.
  *
  * The return value is whatever m_tag_copy_chain() returns; callers in
  * this file treat zero as failure.
  */
 int
 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
 {
 
 #if 0
 	/*
 	 * The mbuf allocator only initializes the pkthdr
 	 * when the mbuf is allocated with m_gethdr(). Many users
 	 * (e.g. m_copy*, m_prepend) use m_get() and then
 	 * smash the pkthdr as needed causing these
 	 * assertions to trip.  For now just disable them.
 	 */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
 #endif
 	MBUF_CHECKSLEEP(how);
 #ifdef MAC
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	to->m_flags = (from->m_flags & M_COPYFLAGS) |
 	    (to->m_flags & (M_EXT | M_NOMAP));
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;
 	/* Both headers now point at the send tag; take a reference. */
 	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		m_snd_tag_ref(from->m_pkthdr.snd_tag);
 	/* Reset the copied list head before the tags are duplicated. */
 	SLIST_INIT(&to->m_pkthdr.tags);
 	return (m_tag_copy_chain(to, from, how));
 }
 
 /*
  * Lesser-used path for M_PREPEND:
  * allocate new mbuf to prepend to chain,
  * copy junk along.
  *
  * Returns the new head of the chain with m_len set to "len".  If the
  * allocation fails the original chain is freed and NULL is returned.
  */
 struct mbuf *
 m_prepend(struct mbuf *m, int len, int how)
 {
 	struct mbuf *mn;
 
 	if (m->m_flags & M_PKTHDR)
 		mn = m_gethdr(how, m->m_type);
 	else
 		mn = m_get(how, m->m_type);
 	if (mn == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	/* The packet header, if any, moves to the new first mbuf. */
 	if (m->m_flags & M_PKTHDR)
 		m_move_pkthdr(mn, m);
 	mn->m_next = m;
 	m = mn;
 	if (len < M_SIZE(m))
 		M_ALIGN(m, len);
 	m->m_len = len;
 	return (m);
 }
 
 /*
  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  *
  * Returns the head of the new chain, or NULL if an allocation or the
  * packet header duplication fails (the partial copy is freed).  NULL
  * is also returned when "len" is 0, since nothing is allocated.
  */
 struct mbuf *
 m_copym(struct mbuf *m, int off0, int len, int wait)
 {
 	struct mbuf *n, **np;
 	int off = off0;
 	struct mbuf *top;
 	int copyhdr = 0;
 
 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
 	MBUF_CHECKSLEEP(wait);
 	/* The packet header is copied only for a copy from offset 0. */
 	if (off == 0 && m->m_flags & M_PKTHDR)
 		copyhdr = 1;
 	/* Skip whole mbufs until "off" falls inside the current one. */
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	np = &top;
 	top = NULL;
 	while (len > 0) {
 		if (m == NULL) {
 			KASSERT(len == M_COPYALL,
 			    ("m_copym, length > size of mbuf chain"));
 			break;
 		}
 		if (copyhdr)
 			n = m_gethdr(wait, m->m_type);
 		else
 			n = m_get(wait, m->m_type);
 		/* Link first so that the nospace path frees "n" as well. */
 		*np = n;
 		if (n == NULL)
 			goto nospace;
 		if (copyhdr) {
 			if (!m_dup_pkthdr(n, m, wait))
 				goto nospace;
 			if (len == M_COPYALL)
 				n->m_pkthdr.len -= off0;
 			else
 				n->m_pkthdr.len = len;
 			copyhdr = 0;
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
 			/* Share the external storage instead of copying. */
 			n->m_data = m->m_data + off;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
 		if (len != M_COPYALL)
 			len -= n->m_len;
 		off = 0;
 		m = m->m_next;
 		np = &n->m_next;
 	}
 
 	return (top);
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy an entire packet, including header (which must be present).
  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  * Preserve alignment of the first mbuf so if the creator has left
  * some room at the beginning (e.g. for inserting protocol headers)
  * the copies still have the room available.
  *
  * Returns the head of the copy, or NULL if an allocation or the
  * packet header duplication fails (the partial copy is freed).
  */
 struct mbuf *
 m_copypacket(struct mbuf *m, int how)
 {
 	struct mbuf *top, *n, *o;
 
 	MBUF_CHECKSLEEP(how);
 	/* Allocated with m_get(); m_dup_pkthdr() fills in the header. */
 	n = m_get(how, m->m_type);
 	top = n;
 	if (n == NULL)
 		goto nospace;
 
 	if (!m_dup_pkthdr(n, m, how))
 		goto nospace;
 	n->m_len = m->m_len;
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data;
 		mb_dupcl(n, m);
 	} else {
 		/* Keep the same data offset within the header mbuf. */
 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 	}
 
 	/* Copy the remaining mbufs, which carry no packet header. */
 	m = m->m_next;
 	while (m) {
 		o = m_get(how, m->m_type);
 		if (o == NULL)
 			goto nospace;
 
 		n->m_next = o;
 		n = n->m_next;
 
 		n->m_len = m->m_len;
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data;
 			mb_dupcl(n, m);
 		} else {
 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 		}
 
 		m = m->m_next;
 	}
 	return top;
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy "len" bytes, starting "off" bytes into the unmapped mbuf "m", to
  * the kernel buffer "cp".  The buffer is described by a one-segment uio
  * and the actual transfer is delegated to m_unmappedtouio().
  */
 static void
 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	struct uio dst_uio;
 	struct iovec dst_iov;
 	int error;
 
 	KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
 	KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
 	KASSERT(off < m->m_len,
 	    ("m_copyfromunmapped: len exceeds mbuf length"));
 
 	/* Single segment covering the destination buffer. */
 	dst_iov.iov_base = cp;
 	dst_iov.iov_len = len;
 
 	dst_uio.uio_iov = &dst_iov;
 	dst_uio.uio_iovcnt = 1;
 	dst_uio.uio_offset = 0;
 	dst_uio.uio_resid = len;
 	dst_uio.uio_segflg = UIO_SYSSPACE;
 	dst_uio.uio_rw = UIO_READ;
 
 	error = m_unmappedtouio(m, off, &dst_uio, len);
 	KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
 	   len));
 }
 
 /*
  * Copy "len" bytes out of an mbuf chain into the flat buffer "cp",
  * beginning "off" bytes from the start of the chain.  Unmapped mbufs
  * are read through m_copyfromunmapped(); everything else is bcopy'd.
  */
 void
 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	u_int chunk;
 
 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
 
 	/* Advance to the mbuf that contains the first requested byte. */
 	for (; off > 0; m = m->m_next) {
 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 	}
 
 	/* Only the first mbuf visited is read at a non-zero offset. */
 	for (; len > 0; m = m->m_next, off = 0) {
 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
 		chunk = min(m->m_len - off, len);
 		if ((m->m_flags & M_NOMAP) == 0)
 			bcopy(mtod(m, caddr_t) + off, cp, chunk);
 		else
 			m_copyfromunmapped(m, off, chunk, cp);
 		cp += chunk;
 		len -= chunk;
 	}
 }
 
 /*
  * Copy a packet header mbuf chain into a completely new chain, including
  * copying any mbuf clusters.  Use this instead of m_copypacket() when
  * you need a writable copy of an mbuf chain.
  *
  * The source must carry a packet header; its m_pkthdr.len drives how
  * much data is copied.  Returns the new chain, or NULL when "m" is NULL
  * or an allocation fails (any partially built copy is freed).
  */
 struct mbuf *
 m_dup(const struct mbuf *m, int how)
 {
 	struct mbuf **p, *top = NULL;
 	int remain, moff, nsize;
 
 	MBUF_CHECKSLEEP(how);
 	/* Sanity check */
 	if (m == NULL)
 		return (NULL);
 	M_ASSERTPKTHDR(m);
 
 	/* While there's more data, get a new mbuf, tack it on, and fill it */
 	remain = m->m_pkthdr.len;	/* bytes still to be copied */
 	moff = 0;			/* read offset within the current source mbuf */
 	p = &top;			/* where the next new mbuf gets linked */
 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
 		struct mbuf *n;
 
 		/* Get the next new mbuf */
 		if (remain >= MINCLSIZE) {
 			n = m_getcl(how, m->m_type, 0);
 			nsize = MCLBYTES;
 		} else {
 			n = m_get(how, m->m_type);
 			nsize = MLEN;
 		}
 		if (n == NULL)
 			goto nospace;
 
 		if (top == NULL) {		/* First one, must be PKTHDR */
 			/* n is not linked into the chain yet, so free it here. */
 			if (!m_dup_pkthdr(n, m, how)) {
 				m_free(n);
 				goto nospace;
 			}
 			/* Without a cluster the header takes part of the mbuf. */
 			if ((n->m_flags & M_EXT) == 0)
 				nsize = MHLEN;
 			n->m_flags &= ~M_RDONLY;
 		}
 		n->m_len = 0;
 
 		/* Link it into the new chain */
 		*p = n;
 		p = &n->m_next;
 
 		/* Copy data from original mbuf(s) into new mbuf */
 		while (n->m_len < nsize && m != NULL) {
 			int chunk = min(nsize - n->m_len, m->m_len - moff);
 
 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
 			moff += chunk;
 			n->m_len += chunk;
 			remain -= chunk;
 			/* Source mbuf fully consumed: step to the next one. */
 			if (moff == m->m_len) {
 				m = m->m_next;
 				moff = 0;
 			}
 		}
 
 		/* Check correct total mbuf length */
 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
 		    	("%s: bogus m_pkthdr.len", __func__));
 	}
 	return (top);
 
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Concatenate mbuf chain n to m.
  * Both chains must be of the same type (e.g. MT_DATA).
  * Any m_pkthdr is not updated.
  */
 void
 m_cat(struct mbuf *m, struct mbuf *n)
 {
 	while (m->m_next)
 		m = m->m_next;
 	while (n) {
 		if (!M_WRITABLE(m) ||
 		    (n->m_flags & M_NOMAP) != 0 ||
 		    M_TRAILINGSPACE(m) < n->m_len) {
 			/* just join the two chains */
 			m->m_next = n;
 			return;
 		}
 		/* splat the data from one into the other */
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		    (u_int)n->m_len);
 		m->m_len += n->m_len;
 		n = m_free(n);
 	}
 }
 
 /*
  * Join two packet-header chains: n is appended to m and m's packet
  * length grows by n's packet length.  n's header is demoted before the
  * chains are concatenated.
  */
 void
 m_catpkt(struct mbuf *m, struct mbuf *n)
 {
 	int taillen;
 
 	M_ASSERTPKTHDR(m);
 	M_ASSERTPKTHDR(n);
 
 	/* Read n's packet length before its header is demoted. */
 	taillen = n->m_pkthdr.len;
 	m_demote(n, 1, 0);
 	m->m_pkthdr.len += taillen;
 
 	m_cat(m, n);
 }
 
 /*
  * Trim data from an mbuf chain.  A non-negative req_len removes that
  * many bytes from the front of the chain; a negative req_len removes
  * -req_len bytes from the end.  If the first mbuf carries a packet
  * header, its m_pkthdr.len is adjusted to match.  A NULL chain is
  * ignored.
  */
 void
 m_adj(struct mbuf *mp, int req_len)
 {
 	int len = req_len;
 	struct mbuf *m;
 	int count;
 
 	if ((m = mp) == NULL)
 		return;
 	if (len >= 0) {
 		/*
 		 * Trim from head.
 		 */
 		while (m != NULL && len > 0) {
 			if (m->m_len <= len) {
 				/* Whole mbuf consumed; it stays linked, now empty. */
 				len -= m->m_len;
 				m->m_len = 0;
 				m = m->m_next;
 			} else {
 				m->m_len -= len;
 				m->m_data += len;
 				len = 0;
 			}
 		}
 		/* (req_len - len) is the number of bytes actually removed. */
 		if (mp->m_flags & M_PKTHDR)
 			mp->m_pkthdr.len -= (req_len - len);
 	} else {
 		/*
 		 * Trim from tail.  Scan the mbuf chain,
 		 * calculating its length and finding the last mbuf.
 		 * If the adjustment only affects this mbuf, then just
 		 * adjust and return.  Otherwise, rescan and truncate
 		 * after the remaining size.
 		 */
 		len = -len;
 		count = 0;
 		for (;;) {
 			count += m->m_len;
 			if (m->m_next == (struct mbuf *)0)
 				break;
 			m = m->m_next;
 		}
 		/* m is now the last mbuf and count the total chain length. */
 		if (m->m_len >= len) {
 			m->m_len -= len;
 			if (mp->m_flags & M_PKTHDR)
 				mp->m_pkthdr.len -= len;
 			return;
 		}
 		count -= len;
 		/* Asked to trim more than the chain holds: leave it empty. */
 		if (count < 0)
 			count = 0;
 		/*
 		 * Correct length for chain is "count".
 		 * Find the mbuf with last data, adjust its length,
 		 * and toss data from remaining mbufs on chain.
 		 */
 		m = mp;
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len = count;
 		for (; m; m = m->m_next) {
 			if (m->m_len >= count) {
 				m->m_len = count;
 				if (m->m_next != NULL) {
 					m_freem(m->m_next);
 					m->m_next = NULL;
 				}
 				break;
 			}
 			count -= m->m_len;
 		}
 	}
 }
 
 /*
  * Rearrange an mbuf chain so that len bytes are contiguous
  * and in the data area of an mbuf (so that mtod will work
  * for a structure of size len).  Returns the resulting
  * mbuf chain on success, frees it and returns null on failure.
  * If there is room, it will add up to max_protohdr-len extra bytes to the
  * contiguous region in an attempt to avoid being called next time.
  */
 struct mbuf *
 m_pullup(struct mbuf *n, int len)
 {
 	struct mbuf *m;
 	int count;
 	int space;
 
 	KASSERT((n->m_flags & M_NOMAP) == 0,
 	    ("%s: unmapped mbuf %p", __func__, n));
 
 	/*
 	 * If first mbuf has no cluster, and has room for len bytes
 	 * without shifting current data, pullup into it,
 	 * otherwise allocate a new mbuf to prepend to the chain.
 	 */
 	if ((n->m_flags & M_EXT) == 0 &&
 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
 		/* Already contiguous: nothing to do. */
 		if (n->m_len >= len)
 			return (n);
 		/* Pull into the first mbuf; only the shortfall is copied. */
 		m = n;
 		n = n->m_next;
 		len -= m->m_len;
 	} else {
 		/* The request must fit in the new header mbuf. */
 		if (len > MHLEN)
 			goto bad;
 		m = m_get(M_NOWAIT, n->m_type);
 		if (m == NULL)
 			goto bad;
 		if (n->m_flags & M_PKTHDR)
 			m_move_pkthdr(m, n);
 	}
 	/* Free bytes between the end of m's data and the end of its buffer. */
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		  (u_int)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		/* Advance within n, or drop it once it has been drained. */
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	/* Chain ran out before len bytes could be gathered. */
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
 bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Like m_pullup(), except a new mbuf is always allocated, and we allow
  * the amount of empty space before the data in the new mbuf to be specified
  * (in the event that the caller expects to prepend later).
  *
  * Returns the new chain head on success.  On failure the remaining
  * chain is freed and NULL is returned.
  */
 struct mbuf *
 m_copyup(struct mbuf *n, int len, int dstoff)
 {
 	struct mbuf *m;
 	int count, space;
 
 	/* len bytes plus the requested leading gap must fit in one mbuf. */
 	if (len > (MHLEN - dstoff))
 		goto bad;
 	m = m_get(M_NOWAIT, n->m_type);
 	if (m == NULL)
 		goto bad;
 	if (n->m_flags & M_PKTHDR)
 		m_move_pkthdr(m, n);
 	/* Leave dstoff bytes of room in front of the data. */
 	m->m_data += dstoff;
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
 		    (unsigned)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		/* Advance within n, or drop it once it has been drained. */
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	/* Chain ran out before len bytes could be gathered. */
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
  bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Partition an mbuf chain in two pieces, returning the tail --
  * all but the first len0 bytes.  In case of failure, it returns NULL and
  * attempts to restore the chain to its original state.
  *
  * Note that the resulting mbufs might be read-only, because the new
  * mbuf can end up sharing an mbuf cluster with the original mbuf if
  * the "breaking point" happens to lie within a cluster mbuf. Use the
  * M_WRITABLE() macro to check for this case.
  */
 struct mbuf *
 m_split(struct mbuf *m0, int len0, int wait)
 {
 	struct mbuf *m, *n;
 	u_int len = len0, remain;
 
 	MBUF_CHECKSLEEP(wait);
 	/* Find the mbuf containing the split point; len becomes its offset. */
 	for (m = m0; m && len > m->m_len; m = m->m_next)
 		len -= m->m_len;
 	if (m == NULL)
 		return (NULL);
 	/* Bytes of m that belong to the tail. */
 	remain = m->m_len - len;
 	if (m0->m_flags & M_PKTHDR && remain == 0) {
 		/*
 		 * Split falls exactly on an mbuf boundary: the tail only
 		 * needs a fresh (empty) header mbuf in front of it.
 		 */
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_next = m->m_next;
 		m->m_next = NULL;
 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 			n->m_pkthdr.snd_tag =
 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 		} else
 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		return (n);
 	} else if (m0->m_flags & M_PKTHDR) {
 		/* Split inside an mbuf of a packet: tail gets its own header. */
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 			n->m_pkthdr.snd_tag =
 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 		} else
 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		if (m->m_flags & M_EXT)
 			goto extpacket;
 		if (remain > MHLEN) {
 			/* m can't be the lead packet */
 			M_ALIGN(n, 0);
 			/* Split m itself and hang the result off the empty header. */
 			n->m_next = m_split(m, len, wait);
 			if (n->m_next == NULL) {
 				(void) m_free(n);
 				return (NULL);
 			} else {
 				n->m_len = 0;
 				return (n);
 			}
 		} else
 			M_ALIGN(n, remain);
 	} else if (remain == 0) {
 		/* No header and a boundary split: just unlink the tail. */
 		n = m->m_next;
 		m->m_next = NULL;
 		return (n);
 	} else {
 		n = m_get(wait, m->m_type);
 		if (n == NULL)
 			return (NULL);
 		M_ALIGN(n, remain);
 	}
 extpacket:
 	if (m->m_flags & M_EXT) {
 		/* Share the external storage instead of copying it. */
 		n->m_data = m->m_data + len;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
 	}
 	n->m_len = remain;
 	m->m_len = len;
 	n->m_next = m->m_next;
 	m->m_next = NULL;
 	return (n);
 }
 /*
  * Routine to copy from device local memory into mbufs.
  * Note that `off' argument is offset into first mbuf of target chain from
  * which to begin copying the data to.
  *
  * "copy", when non-NULL, is used in place of bcopy() to move the data.
  * The first mbuf gets a packet header with rcvif set to ifp and len set
  * to totlen.  Returns the new chain, or NULL if "off" is out of range
  * or an allocation fails.
  */
 struct mbuf *
 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
     void (*copy)(char *from, caddr_t to, u_int len))
 {
 	struct mbuf *m;
 	struct mbuf *top = NULL, **mp = &top;
 	int len;
 
 	if (off < 0 || off > MHLEN)
 		return (NULL);
 
 	while (totlen > 0) {
 		if (top == NULL) {	/* First one, must be PKTHDR */
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				len = MCLBYTES;
 			} else {
 				m = m_gethdr(M_NOWAIT, MT_DATA);
 				len = MHLEN;
 
 				/* Place initial small packet/header at end of mbuf */
 				if (m && totlen + off + max_linkhdr <= MHLEN) {
 					m->m_data += max_linkhdr;
 					len -= max_linkhdr;
 				}
 			}
 			/* Nothing has been linked into the chain yet. */
 			if (m == NULL)
 				return NULL;
 			m->m_pkthdr.rcvif = ifp;
 			m->m_pkthdr.len = totlen;
 		} else {
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
 				len = MCLBYTES;
 			} else {
 				m = m_get(M_NOWAIT, MT_DATA);
 				len = MLEN;
 			}
 			if (m == NULL) {
 				m_freem(top);
 				return NULL;
 			}
 		}
 		/* The offset applies to the first mbuf only. */
 		if (off) {
 			m->m_data += off;
 			len -= off;
 			off = 0;
 		}
 		m->m_len = len = min(totlen, len);
 		if (copy)
 			copy(buf, mtod(m, caddr_t), (u_int)len);
 		else
 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
 		buf += len;
 		/* Append m to the chain. */
 		*mp = m;
 		mp = &m->m_next;
 		totlen -= len;
 	}
 	return (top);
 }
 
 /*
  * Copy data from a buffer back into the indicated mbuf chain,
  * starting "off" bytes from the beginning, extending the mbuf
  * chain if necessary.
  *
  * Allocation is M_NOWAIT; if it fails the copy stops early without any
  * error indication to the caller.  If the first mbuf has a packet
  * header, m_pkthdr.len is raised when the data written reaches past it.
  */
 void
 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
 {
 	int mlen;
 	struct mbuf *m = m0, *n;
 	int totlen = 0;
 
 	if (m0 == NULL)
 		return;
 	/* Skip to the mbuf containing "off", growing the chain if needed. */
 	while (off > (mlen = m->m_len)) {
 		off -= mlen;
 		totlen += mlen;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				goto out;
 			/* Zero the filler so the gap has defined contents. */
 			bzero(mtod(n, caddr_t), MLEN);
 			n->m_len = min(MLEN, len + off);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	while (len > 0) {
 		/* Last mbuf: grow m_len into whatever trailing space exists. */
 		if (m->m_next == NULL && (len > m->m_len - off)) {
 			m->m_len += min(len - (m->m_len - off),
 			    M_TRAILINGSPACE(m));
 		}
 		mlen = min (m->m_len - off, len);
 		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
 		cp += mlen;
 		len -= mlen;
 		mlen += off;
 		off = 0;
 		totlen += mlen;
 		if (len == 0)
 			break;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				break;
 			n->m_len = min(MLEN, len);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	/* totlen is the chain offset just past the last byte handled. */
 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
 		m->m_pkthdr.len = totlen;
 }
 
 /*
  * Append "len" bytes from "cp" to the end of an mbuf chain.  Trailing
  * space in the last mbuf is used first; plain mbufs are then allocated
  * (M_NOWAIT) for whatever does not fit.  If the first mbuf has a packet
  * header, its length grows by the number of bytes actually appended.
  *
  * Return 1 if all of the data was appended; otherwise 0.
  */
 int
 m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *last, *extra;
 	int left, room;
 
 	last = m0;
 	while (last->m_next != NULL)
 		last = last->m_next;
 
 	left = len;
 	room = M_TRAILINGSPACE(last);
 	if (room > 0) {
 		/* Fill the tail of the existing last mbuf first. */
 		room = (room > left) ? left : room;
 		bcopy(cp, mtod(last, caddr_t) + last->m_len, room);
 		last->m_len += room;
 		cp += room;
 		left -= room;
 	}
 	for (; left > 0; last = extra) {
 		/*
 		 * One plain mbuf per iteration; a cluster could be used
 		 * for large remainders instead.
 		 */
 		extra = m_get(M_NOWAIT, last->m_type);
 		if (extra == NULL)
 			break;
 		extra->m_len = min(MLEN, left);
 		bcopy(cp, mtod(extra, caddr_t), extra->m_len);
 		cp += extra->m_len;
 		left -= extra->m_len;
 		last->m_next = extra;
 	}
 	if ((m0->m_flags & M_PKTHDR) != 0)
 		m0->m_pkthdr.len += len - left;
 	return (left == 0);
 }
 
 /*
  * Apply function f to the data in an mbuf chain starting "off" bytes from
  * the beginning, continuing for "len" bytes.
  *
  * f is invoked once per contiguous piece as f(arg, data, count).  The
  * walk stops at the first non-zero value returned by f and that value
  * is returned to the caller; 0 is returned once all "len" bytes have
  * been visited.
  */
 int
 m_apply(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 	u_int count;
 	int rval;
 
 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
 	/* Skip the mbufs that lie entirely before the starting offset. */
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	while (len > 0) {
 		/* Running off the chain here means "len" was too large. */
 		KASSERT(m != NULL, ("m_apply, length > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
 		if (rval)
 			return (rval);
 		len -= count;
 		/* Only the first mbuf visited is read at a non-zero offset. */
 		off = 0;
 		m = m->m_next;
 	}
 	return (0);
 }
 
 /*
  * Translate a byte position "loc" within an mbuf chain into the mbuf
  * that holds it, storing the offset inside that mbuf in *off.
  * A position exactly at the end of the chain yields the last mbuf with
  * *off set to its m_len.  NULL is returned for positions outside the
  * chain.
  */
 struct mbuf *
 m_getptr(struct mbuf *m, int loc, int *off)
 {
 
 	for (; loc >= 0; m = m->m_next) {
 		/* Position lies inside this mbuf. */
 		if (loc < m->m_len) {
 			*off = loc;
 			return (m);
 		}
 		loc -= m->m_len;
 		if (m->m_next != NULL)
 			continue;
 		/* End of chain: only "one past the last byte" is valid. */
 		if (loc != 0)
 			return (NULL);
 		*off = m->m_len;
 		return (m);
 	}
 	return (NULL);
 }
 
 /*
  * Print a description of every mbuf in a chain: address, length, next
  * pointer and flags, followed by a hex dump of at most "maxlen" data
  * bytes per mbuf (maxlen == -1 dumps each mbuf in full).  For chains
  * with a packet header the walk stops once m_pkthdr.len bytes have been
  * accounted for, and any shortfall is reported.
  */
 void
 m_print(const struct mbuf *m, int maxlen)
 {
 	const struct mbuf *cur;
 	int remaining, shown;
 
 	if (m == NULL) {
 		printf("mbuf: %p\n", m);
 		return;
 	}
 
 	/* -1 means "no packet header": walk the whole chain. */
 	remaining = (m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : -1;
 	for (cur = m; cur != NULL && remaining != 0; cur = cur->m_next) {
 		shown = cur->m_len;
 		if (maxlen != -1 && shown > maxlen)
 			shown = maxlen;
 		printf("mbuf: %p len: %d, next: %p, %b%s", cur, cur->m_len,
 		    cur->m_next, cur->m_flags, "\20\20freelist\17skipfw"
 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
 		    "\3eor\2pkthdr\1ext", shown ? "" : "\n");
 		if (shown != 0)
 			printf(", %*D\n", shown, (u_char *)cur->m_data, "-");
 		if (remaining != -1)
 			remaining -= cur->m_len;
 	}
 	if (remaining > 0)
 		printf("%d bytes unaccounted for.\n", remaining);
 }
 
 /*
  * Recompute the length of the chain with m_length(), store it in the
  * first mbuf's m_pkthdr.len and return it.
  */
 u_int
 m_fixhdr(struct mbuf *m0)
 {
 	u_int total = m_length(m0, NULL);
 
 	m0->m_pkthdr.len = total;
 	return (total);
 }
 
 /*
  * Return the sum of m_len over every mbuf in the chain.  If "last" is
  * not NULL it receives a pointer to the final mbuf of the chain (NULL
  * for an empty chain).
  */
 u_int
 m_length(struct mbuf *m0, struct mbuf **last)
 {
 	struct mbuf *tail;
 	u_int total;
 
 	total = 0;
 	tail = m0;
 	if (tail != NULL) {
 		total = tail->m_len;
 		while (tail->m_next != NULL) {
 			tail = tail->m_next;
 			total += tail->m_len;
 		}
 	}
 	if (last != NULL)
 		*last = tail;
 	return (total);
 }
 
 /*
  * Defragment a mbuf chain, returning the shortest possible
  * chain of mbufs and clusters.  If allocation fails and
  * this cannot be completed, NULL will be returned, but
  * the passed in chain will be unchanged.  Upon success,
  * the original chain will be freed, and the new chain
  * will be returned.
  *
  * If a non-packet header is passed in, the original
  * mbuf (chain?) will be returned unharmed.
  */
 struct mbuf *
 m_defrag(struct mbuf *m0, int how)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0, length;
 
 	MBUF_CHECKSLEEP(how);
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 #ifdef MBUF_STRESS_TEST
 	/* Optionally inject a random failure (about 1 in 256 calls). */
 	if (m_defragrandomfailures) {
 		int temp = arc4random() & 0xff;
 		if (temp == 0xba)
 			goto nospace;
 	}
 #endif
 
 	/* Head of the new chain: a cluster if the packet exceeds MHLEN. */
 	if (m0->m_pkthdr.len > MHLEN)
 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m_final = m_gethdr(how, MT_DATA);
 
 	if (m_final == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;
 
 	/* The first pass of the loop fills m_final itself. */
 	m_new = m_final;
 
 	while (progress < m0->m_pkthdr.len) {
 		length = m0->m_pkthdr.len - progress;
 		if (length > MCLBYTES)
 			length = MCLBYTES;
 
 		if (m_new == NULL) {
 			if (length > MLEN)
 				m_new = m_getcl(how, MT_DATA, 0);
 			else
 				m_new = m_get(how, MT_DATA);
 			if (m_new == NULL)
 				goto nospace;
 		}
 
 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
 		progress += length;
 		m_new->m_len = length;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		/* Force a fresh allocation on the next pass. */
 		m_new = NULL;
 	}
 #ifdef MBUF_STRESS_TEST
 	if (m0->m_next == NULL)
 		m_defraguseless++;
 #endif
 	m_freem(m0);
 	m0 = m_final;
 #ifdef MBUF_STRESS_TEST
 	m_defragpackets++;
 	m_defragbytes += m0->m_pkthdr.len;
 #endif
 	return (m0);
 nospace:
 #ifdef MBUF_STRESS_TEST
 	m_defragfailure++;
 #endif
 	/* Only the partially built copy is released; m0 is left alone. */
 	if (m_final)
 		m_freem(m_final);
 	return (NULL);
 }
 
 /*
  * Return the number of fragments an mbuf will use.  This is usually
  * used as a proxy for the number of scatter/gather elements needed by
  * a DMA engine to access an mbuf.  In general mapped mbufs are
  * assumed to be backed by physically contiguous buffers that only
  * need a single fragment.  Unmapped mbufs, on the other hand, can
  * span disjoint physical pages.
  *
  * Mapped mbufs (M_NOMAP clear) always count as one fragment.
  */
 static int
 frags_per_mbuf(struct mbuf *m)
 {
 	int frags;
 
 	if ((m->m_flags & M_NOMAP) == 0)
 		return (1);
 
 	/*
 	 * The header and trailer are counted as a single fragment
 	 * each when present.
 	 *
 	 * XXX: This overestimates the number of fragments by assuming
 	 * all the backing physical pages are disjoint.
 	 */
 	frags = 0;
-	if (m->m_ext_pgs.hdr_len != 0)
+	if (m->m_epg_hdrlen != 0)
 		frags++;
-	frags += m->m_ext_pgs.npgs;
-	if (m->m_ext_pgs.trail_len != 0)
+	frags += m->m_epg_npgs;
+	if (m->m_epg_trllen != 0)
 		frags++;
 
 	/* Optional header + one per page + optional trailer. */
 	return (frags);
 }
 
 /*
  * Defragment an mbuf chain, returning at most maxfrags separate
  * mbufs+clusters.  If this is not possible NULL is returned and
  * the original mbuf chain is left in its present (potentially
  * modified) state.  We use two techniques: collapsing consecutive
  * mbufs and replacing consecutive mbufs by a cluster.
  *
  * NB: this should really be named m_defrag but that name is taken
  */
 struct mbuf *
 m_collapse(struct mbuf *m0, int how, int maxfrags)
 {
 	struct mbuf *m, *n, *n2, **prev;
 	u_int curfrags;
 
 	/*
 	 * Calculate the current number of frags.
 	 */
 	curfrags = 0;
 	for (m = m0; m != NULL; m = m->m_next)
 		curfrags += frags_per_mbuf(m);
 	/*
 	 * First, try to collapse mbufs.  Note that we always collapse
 	 * towards the front so we don't need to deal with moving the
 	 * pkthdr.  This may be suboptimal if the first mbuf has much
 	 * less data than the following.
 	 */
 	m = m0;
 again:
 	for (;;) {
 		n = m->m_next;
 		if (n == NULL)
 			break;
 		/* Merge n into m when m is writable and n's data fits. */
 		if (M_WRITABLE(m) &&
 		    n->m_len < M_TRAILINGSPACE(m)) {
 			m_copydata(n, 0, n->m_len,
 			    mtod(m, char *) + m->m_len);
 			m->m_len += n->m_len;
 			m->m_next = n->m_next;
 			curfrags -= frags_per_mbuf(n);
 			m_free(n);
 			if (curfrags <= maxfrags)
 				return m0;
 		} else
 			m = n;
 	}
 	KASSERT(maxfrags > 1,
 		("maxfrags %u, but normal collapse failed", maxfrags));
 	/*
 	 * Collapse consecutive mbufs to a cluster.
 	 */
 	prev = &m0->m_next;		/* NB: not the first mbuf */
 	while ((n = *prev) != NULL) {
 		if ((n2 = n->m_next) != NULL &&
 		    n->m_len + n2->m_len < MCLBYTES) {
 			m = m_getcl(how, MT_DATA, 0);
 			if (m == NULL)
 				goto bad;
 			/* Copy both neighbours into the cluster, then swap it in. */
 			m_copydata(n, 0,  n->m_len, mtod(m, char *));
 			m_copydata(n2, 0,  n2->m_len,
 			    mtod(m, char *) + n->m_len);
 			m->m_len = n->m_len + n2->m_len;
 			m->m_next = n2->m_next;
 			*prev = m;
 			curfrags += 1;  /* For the new cluster */
 			curfrags -= frags_per_mbuf(n);
 			curfrags -= frags_per_mbuf(n2);
 			m_free(n);
 			m_free(n2);
 			if (curfrags <= maxfrags)
 				return m0;
 			/*
 			 * Still not there, try the normal collapse
 			 * again before we allocate another cluster.
 			 */
 			goto again;
 		}
 		prev = &n->m_next;
 	}
 	/*
 	 * No place where we can collapse to a cluster; punt.
 	 * This can occur if, for example, you request 2 frags
 	 * but the packet requires that both be clusters (we
 	 * never reallocate the first mbuf to avoid moving the
 	 * packet header).
 	 */
 bad:
 	return NULL;
 }
 
 #ifdef MBUF_STRESS_TEST
 
 /*
  * Fragment an mbuf chain.  There's no reason you'd ever want to do
  * this in normal usage, but it's great for stress testing various
  * mbuf consumers.
  *
  * If fragmentation is not possible, the original chain will be
  * returned.
  *
  * Possible length values:
  * 0	 no fragmentation will occur
  * > 0	each fragment will be of the specified length
  * -1	each fragment will be the same random value in length
  * -2	each fragment's length will be entirely random
  * (Random values range from 1 to 256)
  *
  * Chains without a packet header and lengths below -2 are returned
  * untouched.  On success the original chain is freed.
  */
 struct mbuf *
 m_fragment(struct mbuf *m0, int how, int length)
 {
 	struct mbuf *m_first, *m_last;
 	int divisor = 255, progress = 0, fraglen;
 
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	if (length == 0 || length < -2)
 		return (m0);
 	/* Every fragment lives in one cluster, so cap the size there. */
 	if (length > MCLBYTES)
 		length = MCLBYTES;
 	if (length < 0 && divisor > MCLBYTES)
 		divisor = MCLBYTES;
 	/* -1: pick one random size now and use it for every fragment. */
 	if (length == -1)
 		length = 1 + (arc4random() % divisor);
 	if (length > 0)
 		fraglen = length;
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 	m_first = m_getcl(how, MT_DATA, M_PKTHDR);
 	if (m_first == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_first, m0, how) == 0)
 		goto nospace;
 
 	m_last = m_first;
 
 	while (progress < m0->m_pkthdr.len) {
 		/* -2: a new random size for each fragment. */
 		if (length == -2)
 			fraglen = 1 + (arc4random() % divisor);
 		if (fraglen > m0->m_pkthdr.len - progress)
 			fraglen = m0->m_pkthdr.len - progress;
 
 		/* The first fragment goes into m_first; later ones get a new cluster. */
 		if (progress != 0) {
 			struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
 			if (m_new == NULL)
 				goto nospace;
 
 			m_last->m_next = m_new;
 			m_last = m_new;
 		}
 
 		m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
 		progress += fraglen;
 		m_last->m_len = fraglen;
 	}
 	m_freem(m0);
 	m0 = m_first;
 	return (m0);
 nospace:
 	if (m_first)
 		m_freem(m_first);
 	/* Return the original chain on failure */
 	return (m0);
 }
 
 #endif
 
 /*
  * Free pages from mbuf_ext_pgs, assuming they were allocated via
  * vm_page_alloc() and aren't associated with any object.  Complement
  * to allocator from m_uiotombuf_nomap().
  *
  * Each page recorded in m_epg_pa[] is unwired and then freed.
  */
 void
 mb_free_mext_pgs(struct mbuf *m)
 {
 	vm_page_t pg;
 
 	MBUF_EXT_PGS_ASSERT(m);
-	for (int i = 0; i < m->m_ext_pgs.npgs; i++) {
+	for (int i = 0; i < m->m_epg_npgs; i++) {
 		/* The mbuf stores physical addresses; map back to the vm_page. */
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		vm_page_unwire_noq(pg);
 		vm_page_free(pg);
 	}
 }
 
 /*
  * Copy data from a uio into a newly built chain of unmapped mbufs
  * backed by freshly allocated wired pages.
  *
  * "len" caps the amount copied when positive; otherwise the whole uio
  * is consumed.  "maxseg" limits the bytes placed in each mbuf; 0 selects
  * MBUF_PEXT_MAX_PGS * PAGE_SIZE.  "flags" must not contain M_PKTHDR.
  * Returns the chain, or NULL on failure after freeing what was built.
  */
 static struct mbuf *
 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
 {
 	struct mbuf *m, *mb, *prev;
 	vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
 	int error, length, i, needed;
 	ssize_t total;
 	int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP |
 	    VM_ALLOC_WIRED;
 
 	/*
 	 * len can be zero or an arbitrary large value bound by
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
 		total = MIN(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	if (maxseg == 0)
 		maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
 
 	/*
 	 * Allocate the pages
 	 */
 	m = NULL;
 	MPASS((flags & M_PKTHDR) == 0);
 	while (total > 0) {
 		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
 		if (mb == NULL)
 			goto failed;
 		/* Link the new mbuf at the tail; the first one becomes the head. */
 		if (m == NULL)
 			m = mb;
 		else
 			prev->m_next = mb;
 		prev = mb;
-		mb->m_ext_pgs.flags = EPG_FLAG_ANON;
+		mb->m_epg_flags = EPG_FLAG_ANON;
 		needed = length = MIN(maxseg, total);
 		/* One page per PAGE_SIZE (or final partial page) of data. */
 		for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
 retry_page:
 			pg_array[i] = vm_page_alloc(NULL, 0, pflags);
 			if (pg_array[i] == NULL) {
 				/* Non-sleeping callers fail; others wait and retry. */
 				if (how & M_NOWAIT) {
 					goto failed;
 				} else {
 					vm_wait(NULL);
 					goto retry_page;
 				}
 			}
 			pg_array[i]->flags &= ~PG_ZERO;
 			mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
-			mb->m_ext_pgs.npgs++;
+			mb->m_epg_npgs++;
 		}
-		mb->m_ext_pgs.last_pg_len = length - PAGE_SIZE * (mb->m_ext_pgs.npgs - 1);
+		mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
 		MBUF_EXT_PGS_ASSERT_SANITY(mb);
 		total -= length;
 		error = uiomove_fromphys(pg_array, 0, length, uio);
 		if (error != 0)
 			goto failed;
 		mb->m_len = length;
-		mb->m_ext.ext_size += PAGE_SIZE * mb->m_ext_pgs.npgs;
+		mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
 		if (flags & M_PKTHDR)
 			m->m_pkthdr.len += length;
 	}
 	return (m);
 
 failed:
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Copy the contents of uio into a properly sized mbuf chain.
  */
 struct mbuf *
 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
 	struct mbuf *m, *mb;
 	int error, length;
 	ssize_t total;
 	int progress = 0;
 
 	if (flags & M_NOMAP)
 		return (m_uiotombuf_nomap(uio, how, len, align, flags));
 
 	/*
 	 * len can be zero or an arbitrary large value bound by
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
 		total = (uio->uio_resid < len) ? uio->uio_resid : len;
 	else
 		total = uio->uio_resid;
 
 	/*
 	 * The smallest unit returned by m_getm2() is a single mbuf
 	 * with pkthdr.  We can't align past it.
 	 */
 	if (align >= MHLEN)
 		return (NULL);
 
 	/*
 	 * Give us the full allocation or nothing.
 	 * If len is zero return the smallest empty mbuf.
 	 */
 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
 	if (m == NULL)
 		return (NULL);
 	m->m_data += align;
 
 	/* Fill all mbufs with uio data and update header information. */
 	for (mb = m; mb != NULL; mb = mb->m_next) {
 		length = min(M_TRAILINGSPACE(mb), total - progress);
 
 		error = uiomove(mtod(mb, void *), length, uio);
 		if (error) {
 			m_freem(m);
 			return (NULL);
 		}
 
 		mb->m_len = length;
 		progress += length;
 		if (flags & M_PKTHDR)
 			m->m_pkthdr.len += length;
 	}
 	KASSERT(progress == total, ("%s: progress != total", __func__));
 
 	return (m);
 }
 
 /*
  * Copy data from an unmapped mbuf into a uio limited by len if set.
  *
  * "m_off" is a byte offset into the mbuf's data at which to start.  The
  * copy walks the header, then each backing page, then the trailer.
  * Returns 0 or the first error reported by uiomove()/uiomove_fromphys().
  */
 int
 m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len)
 {
 	vm_page_t pg;
 	int error, i, off, pglen, pgoff, seglen, segoff;
 
 	MBUF_EXT_PGS_ASSERT(m);
 	error = 0;
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	off += m_off;
-	if (m->m_ext_pgs.hdr_len != 0) {
-		if (off >= m->m_ext_pgs.hdr_len) {
-			off -= m->m_ext_pgs.hdr_len;
+	if (m->m_epg_hdrlen != 0) {
+		if (off >= m->m_epg_hdrlen) {
+			off -= m->m_epg_hdrlen;
 		} else {
-			seglen = m->m_ext_pgs.hdr_len - off;
+			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			error = uiomove(__DECONST(void *,
 			    &m->m_epg_hdr[segoff]), seglen, uio);
 		}
 	}
-	pgoff = m->m_ext_pgs.first_pg_off;
-	for (i = 0; i < m->m_ext_pgs.npgs && error == 0 && len > 0; i++) {
+	pgoff = m->m_epg_1st_off;
+	for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		/* Start offset lies beyond this page: skip it entirely. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		error = uiomove_fromphys(&pg, segoff, seglen, uio);
 		/* A page offset was applied; reset it for the pages after this. */
 		pgoff = 0;
 	};
 	/* Anything still wanted must come from the trailer. */
 	if (len != 0 && error == 0) {
-		KASSERT((off + len) <= m->m_ext_pgs.trail_len,
+		KASSERT((off + len) <= m->m_epg_trllen,
 		    ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
-		    m->m_ext_pgs.trail_len, m_off));
+		    m->m_epg_trllen, m_off));
 		error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
 		    len, uio);
 	}
 	return (error);
 }
 
 /*
  * Copy an mbuf chain into a uio limited by len if set.
  */
 int
 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
 {
 	int error, length, total;
 	int progress = 0;
 
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	/* Fill the uio with data from the mbufs. */
 	for (; m != NULL; m = m->m_next) {
 		length = min(m->m_len, total - progress);
 
 		if ((m->m_flags & M_NOMAP) != 0)
 			error = m_unmappedtouio(m, 0, uio, length);
 		else
 			error = uiomove(mtod(m, void *), length, uio);
 		if (error)
 			return (error);
 
 		progress += length;
 	}
 
 	return (0);
 }
 
 /*
  * Create a writable copy of the mbuf chain.  While doing this
  * we compact the chain with a goal of producing a chain with
  * at most two mbufs.  The second mbuf in this chain is likely
  * to be a cluster.  The primary purpose of this work is to create
  * a writable packet for encryption, compression, etc.  The
  * secondary goal is to linearize the data so the data can be
  * passed to crypto hardware in the most efficient manner possible.
  *
  * Returns the head of the resulting chain, which differs from m0
  * when the first mbuf had to be replaced.  If a cluster allocation
  * fails, the whole chain is freed and NULL is returned.  'how' is
  * passed through to m_getcl().
  */
 struct mbuf *
 m_unshare(struct mbuf *m0, int how)
 {
 	struct mbuf *m, *mprev;
 	struct mbuf *n, *mfirst, *mlast;
 	int len, off;

 	mprev = NULL;
 	/*
 	 * The loop advances through mprev->m_next rather than m->m_next
 	 * because m may have been freed (coalesced or replaced) by the
 	 * time the step expression runs.
 	 */
 	for (m = m0; m != NULL; m = mprev->m_next) {
 		/*
 		 * Regular mbufs are ignored unless there's a cluster
 		 * in front of it that we can use to coalesce.  We do
 		 * the latter mainly so later clusters can be coalesced
 		 * also w/o having to handle them specially (i.e. convert
 		 * mbuf+cluster -> cluster).  This optimization is heavily
 		 * influenced by the assumption that we're running over
 		 * Ethernet where MCLBYTES is large enough that the max
 		 * packet size will permit lots of coalescing into a
 		 * single cluster.  This in turn permits efficient
 		 * crypto operations, especially when using hardware.
 		 */
 		if ((m->m_flags & M_EXT) == 0) {
 			if (mprev && (mprev->m_flags & M_EXT) &&
 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
 				/* XXX: this ignores mbuf types */
 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 				    mtod(m, caddr_t), m->m_len);
 				mprev->m_len += m->m_len;
 				mprev->m_next = m->m_next;	/* unlink from chain */
 				m_free(m);			/* reclaim mbuf */
 			} else {
 				mprev = m;
 			}
 			continue;
 		}
 		/*
 		 * Writable mbufs are left alone (for now).
 		 */
 		if (M_WRITABLE(m)) {
 			mprev = m;
 			continue;
 		}

 		/*
 		 * Not writable, replace with a copy or coalesce with
 		 * the previous mbuf if possible (since we have to copy
 		 * it anyway, we try to reduce the number of mbufs and
 		 * clusters so that future work is easier).
 		 */
 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
 		/* NB: we only coalesce into a cluster or larger */
 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
 			/* XXX: this ignores mbuf types */
 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 			    mtod(m, caddr_t), m->m_len);
 			mprev->m_len += m->m_len;
 			mprev->m_next = m->m_next;	/* unlink from chain */
 			m_free(m);			/* reclaim mbuf */
 			continue;
 		}

 		/*
 		 * Allocate new space to hold the copy and copy the data.
 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
 		 * splitting them into clusters.  We could just malloc a
 		 * buffer and make it external but too many device drivers
 		 * don't know how to break up the non-contiguous memory when
 		 * doing DMA.
 		 */
 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 		if (n == NULL) {
 			m_freem(m0);
 			return (NULL);
 		}
 		/* Only the first mbuf of the chain may carry the packet header. */
 		if (m->m_flags & M_PKTHDR) {
 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
 			    __func__, m0, m));
 			m_move_pkthdr(n, m);
 		}
 		len = m->m_len;
 		off = 0;
 		mfirst = n;
 		mlast = NULL;
 		/* Copy m's data into one or more clusters, MCLBYTES at a time. */
 		for (;;) {
 			int cc = min(len, MCLBYTES);
 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
 			n->m_len = cc;
 			if (mlast != NULL)
 				mlast->m_next = n;
 			mlast = n;
 #if 0
 			newipsecstat.ips_clcopied++;
 #endif

 			len -= cc;
 			if (len <= 0)
 				break;
 			off += cc;

 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 			if (n == NULL) {
 				/* Drop the partial copy as well as the original chain. */
 				m_freem(mfirst);
 				m_freem(m0);
 				return (NULL);
 			}
 		}
 		/* Splice the copy (mfirst..n) into the chain in place of m. */
 		n->m_next = m->m_next;
 		if (mprev == NULL)
 			m0 = mfirst;		/* new head of chain */
 		else
 			mprev->m_next = mfirst;	/* replace old mbuf */
 		m_free(m);			/* release old mbuf */
 		mprev = mfirst;
 	}
 	return (m0);
 }
 
 #ifdef MBUF_PROFILING
 
 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
 /*
  * Histogram counters filled in by m_profile().  'segments' is indexed
  * by the (clamped) number of mbufs in a chain; 'used' and 'wasted' are
  * indexed by fls() of the respective byte counts.
  */
 struct mbufprofile {
 	uintmax_t wasted[MP_BUCKETS];
 	uintmax_t used[MP_BUCKETS];
 	uintmax_t segments[MP_BUCKETS];
 } mbprof;

 #define MP_MAXDIGITS 21	/* strlen("16,000,000,000,000,000,000") == 21 */
 #define MP_NUMLINES 6
 #define MP_NUMSPERLINE 16
 #define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
 /* work out max space needed and add a bit of spare space too */
 #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
 #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)

 /* Text rendering of mbprof, rebuilt by mbprof_textify(). */
 char mbprofbuf[MP_BUFSIZE];
 
 void
 m_profile(struct mbuf *m)
 {
 	int segments = 0;
 	int used = 0;
 	int wasted = 0;
 
 	while (m) {
 		segments++;
 		used += m->m_len;
 		if (m->m_flags & M_EXT) {
 			wasted += MHLEN - sizeof(m->m_ext) +
 			    m->m_ext.ext_size - m->m_len;
 		} else {
 			if (m->m_flags & M_PKTHDR)
 				wasted += MHLEN - m->m_len;
 			else
 				wasted += MLEN - m->m_len;
 		}
 		m = m->m_next;
 	}
 	/* be paranoid.. it helps */
 	if (segments > MP_BUCKETS - 1)
 		segments = MP_BUCKETS - 1;
 	if (used > 100000)
 		used = 100000;
 	if (wasted > 100000)
 		wasted = 100000;
 	/* store in the appropriate bucket */
 	/* don't bother locking. if it's slightly off, so what? */
 	mbprof.segments[segments]++;
 	mbprof.used[fls(used)]++;
 	mbprof.wasted[fls(wasted)]++;
 }
 
 static void
 mbprof_textify(void)
 {
 	int offset;
 	char *c;
 	uint64_t *p;
 
 	p = &mbprof.wasted[0];
 	c = mbprofbuf;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "wasted:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.wasted[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.used[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "used:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.used[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.segments[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "segments:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.segments[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 }
 
 /*
  * Sysctl handler: regenerate the profiling text and copy it out,
  * terminating NUL included.
  */
 static int
 mbprof_handler(SYSCTL_HANDLER_ARGS)
 {

 	mbprof_textify();
 	return (SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1));
 }
 
 static int
 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
 {
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (clear) {
 		bzero(&mbprof, sizeof(mbprof));
 	}
 
 	return (error);
 }
 
 /* Read-only string node under _kern_ipc, served by mbprof_handler(). */
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
     mbprof_handler, "A",
     "mbuf profiling statistics");

 /* Read-write integer node under _kern_ipc, served by mbprof_clr_handler(). */
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0,
     mbprof_clr_handler, "I",
     "clear mbuf profiling statistics");
 #endif
 
Index: head/sys/kern/uipc_sockbuf.c
===================================================================
--- head/sys/kern/uipc_sockbuf.c	(revision 360578)
+++ head/sys/kern/uipc_sockbuf.c	(revision 360579)
@@ -1,1555 +1,1555 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kern_tls.h"
 #include "opt_param.h"
 
 #include <sys/param.h>
 #include <sys/aio.h> /* for aio_swake proto */
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 /*
  * Function pointer set by the AIO routines so that the socket buffer code
  * can call back into the AIO module if it is loaded.
  */
 void	(*aio_swake)(struct socket *, struct sockbuf *);
 
 /*
  * Primitive routines for operating on socket buffers
  */
 
 u_long	sb_max = SB_MAX;
 u_long sb_max_adj =
        (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
 
 static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
 
 static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
 static void	sbflush_internal(struct sockbuf *sb);
 
 /*
  * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY.
  */
 static void
 sbm_clrprotoflags(struct mbuf *m, int flags)
 {
 	int mask;
 
 	mask = ~M_PROTOFLAGS;
 	if (flags & PRUS_NOTREADY)
 		mask |= M_NOTREADY;
 	while (m) {
 		m->m_flags &= mask;
 		m = m->m_next;
 	}
 }
 
 /*
  * Compress M_NOTREADY mbufs after they have been readied by sbready().
  *
  * sbcompress() skips M_NOTREADY mbufs since the data is not available to
  * be copied at the time of sbcompress().  This function combines small
  * mbufs similar to sbcompress() once mbufs are ready.  'm0' is the first
  * mbuf sbready() marked ready, and 'end' is the first mbuf still not
  * ready.
  *
  * The socket buffer lock must be held.  Nothing is done when
  * SB_NOCOALESCE is set on the buffer.
  */
 static void
 sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end)
 {
 	struct mbuf *m, *n;
 	int ext_size;

 	SOCKBUF_LOCK_ASSERT(sb);

 	if ((sb->sb_flags & SB_NOCOALESCE) != 0)
 		return;

 	for (m = m0; m != end; m = m->m_next) {
 		MPASS((m->m_flags & M_NOTREADY) == 0);
 		/*
 		 * NB: In sbcompress(), 'n' is the last mbuf in the
 		 * socket buffer and 'm' is the new mbuf being copied
 		 * into the trailing space of 'n'.  Here, the roles
 		 * are reversed and 'n' is the next mbuf after 'm'
 		 * that is being copied into the trailing space of
 		 * 'm'.
 		 */
 		n = m->m_next;
 #ifdef KERN_TLS
 		/* Try to coalesce adjacent ktls mbuf hdr/trailers. */
 		if ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
 		    (m->m_flags & M_NOMAP) &&
 		    (n->m_flags & M_NOMAP) &&
 		    !mbuf_has_tls_session(m) &&
 		    !mbuf_has_tls_session(n)) {
 			int hdr_len, trail_len;

-			hdr_len = n->m_ext_pgs.hdr_len;
-			trail_len = m->m_ext_pgs.trail_len;
+			hdr_len = n->m_epg_hdrlen;
+			trail_len = m->m_epg_trllen;
 			/* Only merge when both pieces exist and fit in m's trailer. */
 			if (trail_len != 0 && hdr_len != 0 &&
 			    trail_len + hdr_len <= MBUF_PEXT_TRAIL_LEN) {
 				/* copy n's header to m's trailer */
 				memcpy(&m->m_epg_trail[trail_len],
 				    n->m_epg_hdr, hdr_len);
-				m->m_ext_pgs.trail_len += hdr_len;
+				m->m_epg_trllen += hdr_len;
 				m->m_len += hdr_len;
-				n->m_ext_pgs.hdr_len = 0;
+				n->m_epg_hdrlen = 0;
 				n->m_len -= hdr_len;
 			}
 		}
 #endif

 		/* Compress small unmapped mbufs into plain mbufs. */
 		if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN &&
 		    !mbuf_has_tls_session(m)) {
 			MPASS(m->m_flags & M_EXT);
 			/* Saved first: the accounting below needs the old size. */
 			ext_size = m->m_ext.ext_size;
 			if (mb_unmapped_compress(m) == 0) {
 				sb->sb_mbcnt -= ext_size;
 				sb->sb_ccnt -= 1;
 			}
 		}

 		/* Pull following small mbufs into m's trailing space. */
 		while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
 		    M_WRITABLE(m) &&
 		    (m->m_flags & M_NOMAP) == 0 &&
 		    !mbuf_has_tls_session(n) &&
 		    !mbuf_has_tls_session(m) &&
 		    n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
 		    n->m_len <= M_TRAILINGSPACE(m) &&
 		    m->m_type == n->m_type) {
 			KASSERT(sb->sb_lastrecord != n,
 		    ("%s: merging start of record (%p) into previous mbuf (%p)",
 			    __func__, n, m));
 			m_copydata(n, 0, n->m_len, mtodo(m, m->m_len));
 			m->m_len += n->m_len;
 			m->m_next = n->m_next;
 			m->m_flags |= n->m_flags & M_EOR;
 			if (sb->sb_mbtail == n)
 				sb->sb_mbtail = m;

 			/* n is going away: undo its share of the mbuf accounting. */
 			sb->sb_mbcnt -= MSIZE;
 			sb->sb_mcnt -= 1;
 			if (n->m_flags & M_EXT) {
 				sb->sb_mbcnt -= n->m_ext.ext_size;
 				sb->sb_ccnt -= 1;
 			}
 			m_free(n);
 			n = m->m_next;
 		}
 	}
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 }
 
 /*
  * Mark ready "count" units of I/O starting with "m".  Most mbufs
  * count as a single unit of I/O except for EXT_PGS-backed mbufs which
  * can be backed by multiple pages.
  *
  * Returns 0 when m0 was the first not-ready mbuf of the buffer
  * (sb_fnrdy) and it became fully ready, so sb_fnrdy was advanced.
  * Returns EINPROGRESS when m0 is still only partially ready, or when
  * m0 was not the mbuf sb_fnrdy pointed at.
  */
 int
 sbready(struct sockbuf *sb, struct mbuf *m0, int count)
 {
 	struct mbuf *m;
 	u_int blocker;

 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
 	KASSERT(count > 0, ("%s: invalid count %d", __func__, count));

 	m = m0;
 	/* M_BLOCKED is cleared along the way only if m0 heads the not-ready run. */
 	blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;

 	while (count > 0) {
 		KASSERT(m->m_flags & M_NOTREADY,
 		    ("%s: m %p !M_NOTREADY", __func__, m));
 		if ((m->m_flags & M_EXT) != 0 &&
 		    m->m_ext.ext_type == EXT_PGS) {
 			/* Not enough units to finish this mbuf: record progress, stop. */
-			if (count < m->m_ext_pgs.nrdy) {
-				m->m_ext_pgs.nrdy -= count;
+			if (count < m->m_epg_nrdy) {
+				m->m_epg_nrdy -= count;
 				count = 0;
 				break;
 			}
-			count -= m->m_ext_pgs.nrdy;
-			m->m_ext_pgs.nrdy = 0;
+			count -= m->m_epg_nrdy;
+			m->m_epg_nrdy = 0;
 		} else
 			count--;

 		m->m_flags &= ~(M_NOTREADY | blocker);
 		if (blocker)
 			sb->sb_acc += m->m_len;
 		m = m->m_next;
 	}

 	/*
 	 * If the first mbuf is still not fully ready because only
 	 * some of its backing pages were readied, no further progress
 	 * can be made.
 	 */
 	if (m0 == m) {
 		MPASS(m->m_flags & M_NOTREADY);
 		return (EINPROGRESS);
 	}

 	if (!blocker) {
 		sbready_compress(sb, m0, m);
 		return (EINPROGRESS);
 	}

 	/* This one was blocking all the queue. */
 	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
 		KASSERT(m->m_flags & M_BLOCKED,
 		    ("%s: m %p !M_BLOCKED", __func__, m));
 		m->m_flags &= ~M_BLOCKED;
 		sb->sb_acc += m->m_len;
 	}

 	/* m is now the next not-ready mbuf, or NULL if none remains. */
 	sb->sb_fnrdy = m;
 	sbready_compress(sb, m0, m);

 	return (0);
 }
 
 /*
  * Account for mbuf m being added to socket buffer sb: byte counts,
  * readiness tracking and mbuf/cluster usage.
  */
 void
 sballoc(struct sockbuf *sb, struct mbuf *m)
 {

 	SOCKBUF_LOCK_ASSERT(sb);

 	sb->sb_ccc += m->m_len;

 	/*
 	 * Behind an existing not-ready mbuf the new one is blocked.
 	 * Otherwise it either becomes the first not-ready mbuf or its
 	 * data counts as available right away.
 	 */
 	if (sb->sb_fnrdy != NULL)
 		m->m_flags |= M_BLOCKED;
 	else if (m->m_flags & M_NOTREADY)
 		sb->sb_fnrdy = m;
 	else
 		sb->sb_acc += m->m_len;

 	/* Anything that is neither data nor OOB data counts as control. */
 	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 		sb->sb_ctl += m->m_len;

 	sb->sb_mcnt += 1;
 	sb->sb_mbcnt += MSIZE;
 	if (m->m_flags & M_EXT) {
 		sb->sb_ccnt += 1;
 		sb->sb_mbcnt += m->m_ext.ext_size;
 	}
 }
 
 /*
  * Adjust sockbuf state reflecting freeing of m.
  *
  * This undoes the accounting done by sballoc().  If m is the first
  * not-ready mbuf, the mbufs after it are unblocked up to the next
  * not-ready one.
  */
 void
 sbfree(struct sockbuf *sb, struct mbuf *m)
 {

 #if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
 	SOCKBUF_LOCK_ASSERT(sb);
 #endif

 	sb->sb_ccc -= m->m_len;

 	/* Only mbufs without M_NOTAVAIL bits were counted in sb_acc. */
 	if (!(m->m_flags & M_NOTAVAIL))
 		sb->sb_acc -= m->m_len;

 	if (m == sb->sb_fnrdy) {
 		struct mbuf *n;

 		KASSERT(m->m_flags & M_NOTREADY,
 		    ("%s: m %p !M_NOTREADY", __func__, m));

 		/* Release the ready mbufs that were queued behind m. */
 		n = m->m_next;
 		while (n != NULL && !(n->m_flags & M_NOTREADY)) {
 			n->m_flags &= ~M_BLOCKED;
 			sb->sb_acc += n->m_len;
 			n = n->m_next;
 		}
 		sb->sb_fnrdy = n;
 	}

 	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 		sb->sb_ctl -= m->m_len;

 	sb->sb_mbcnt -= MSIZE;
 	sb->sb_mcnt -= 1;
 	if (m->m_flags & M_EXT) {
 		sb->sb_mbcnt -= m->m_ext.ext_size;
 		sb->sb_ccnt -= 1;
 	}

 	/* Forget the cached send pointer if it referenced m. */
 	if (sb->sb_sndptr == m) {
 		sb->sb_sndptr = NULL;
 		sb->sb_sndptroff = 0;
 	}
 	if (sb->sb_sndptroff != 0)
 		sb->sb_sndptroff -= m->m_len;
 }
 
 /*
  * Socantsendmore indicates that no more data will be sent on the socket; it
  * would normally be applied to a socket when the user informs the system
  * that no more data is to be sent, by the protocol code (in case
  * PRU_SHUTDOWN).  Socantrcvmore indicates that no more data will be
  * received, and will normally be applied to the socket by a protocol when it
  * detects that the peer will send no more data.  Data queued for reading in
  * the socket may yet be read.
  */
 void
 socantsendmore_locked(struct socket *so)
 {
 
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 
 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
 	sowwakeup_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
 }
 
 /* Variant of socantsendmore_locked() that takes the send lock itself. */
 void
 socantsendmore(struct socket *so)
 {
 	struct sockbuf *snd = &so->so_snd;

 	SOCKBUF_LOCK(snd);
 	socantsendmore_locked(so);
 	mtx_assert(SOCKBUF_MTX(snd), MA_NOTOWNED);
 }
 
 void
 socantrcvmore_locked(struct socket *so)
 {
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
 	sorwakeup_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
 }
 
 /* Variant of socantrcvmore_locked() that takes the receive lock itself. */
 void
 socantrcvmore(struct socket *so)
 {
 	struct sockbuf *rcv = &so->so_rcv;

 	SOCKBUF_LOCK(rcv);
 	socantrcvmore_locked(so);
 	mtx_assert(SOCKBUF_MTX(rcv), MA_NOTOWNED);
 }
 
 /*
  * Sleep until data arrives at, or drains from, a socket buffer.  The
  * buffer lock must be held; the sleep is bounded by sb_timeo and is
  * signal-interruptible unless SB_NOINTR is set.  Returns the result of
  * msleep_sbt().
  */
 int
 sbwait(struct sockbuf *sb)
 {
 	int prio;

 	SOCKBUF_LOCK_ASSERT(sb);

 	prio = PSOCK;
 	if ((sb->sb_flags & SB_NOINTR) == 0)
 		prio |= PCATCH;

 	/* Tell sowakeup() that a sleeper needs a wakeup on sb_acc. */
 	sb->sb_flags |= SB_WAIT;
 	return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx, prio, "sbwait",
 	    sb->sb_timeo, 0, 0));
 }
 
 /*
  * Acquire the socket buffer's sx lock exclusively.  Without SBL_WAIT
  * only a try-lock is attempted and EWOULDBLOCK is returned if it
  * fails.  With SBL_WAIT the call sleeps; the sleep is interruptible
  * unless SB_NOINTR is set on the buffer or SBL_NOINTR is passed.
  */
 int
 sblock(struct sockbuf *sb, int flags)
 {

 	KASSERT((flags & SBL_VALID) == flags,
 	    ("sblock: flags invalid (0x%x)", flags));

 	/* Non-blocking request. */
 	if ((flags & SBL_WAIT) == 0)
 		return (sx_try_xlock(&sb->sb_sx) != 0 ? 0 : EWOULDBLOCK);

 	/* Blocking, interruptible request. */
 	if ((sb->sb_flags & SB_NOINTR) == 0 && (flags & SBL_NOINTR) == 0)
 		return (sx_xlock_sig(&sb->sb_sx));

 	/* Blocking, uninterruptible request. */
 	sx_xlock(&sb->sb_sx);
 	return (0);
 }
 
 /* Release the exclusive sx lock taken by sblock(). */
 void
 sbunlock(struct sockbuf *sb)
 {

 	sx_xunlock(&sb->sb_sx);
 }
 
 /*
  * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
  * via SIGIO if the socket has the SS_ASYNC flag set.
  *
  * Called with the socket buffer lock held; will release the lock by the end
  * of the function.  This allows the caller to acquire the socket buffer lock
  * while testing for the need for various sorts of wakeup and hold it through
  * to the point where it's no longer required.  We currently hold the lock
  * through calls out to other subsystems (with the exception of kqueue), and
  * then release it to avoid lock order issues.  It's not clear that's
  * correct.
  */
 void
 sowakeup(struct socket *so, struct sockbuf *sb)
 {
 	int ret;

 	SOCKBUF_LOCK_ASSERT(sb);

 	/* Wake select/poll waiters, then sleepers recorded via SB_WAIT. */
 	selwakeuppri(sb->sb_sel, PSOCK);
 	if (!SEL_WAITING(sb->sb_sel))
 		sb->sb_flags &= ~SB_SEL;
 	if (sb->sb_flags & SB_WAIT) {
 		sb->sb_flags &= ~SB_WAIT;
 		wakeup(&sb->sb_acc);
 	}
 	KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
 	if (sb->sb_upcall != NULL) {
 		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
 		if (ret == SU_ISCONNECTED) {
 			KASSERT(sb == &so->so_rcv,
 			    ("SO_SND upcall returned SU_ISCONNECTED"));
 			soupcall_clear(so, SO_RCV);
 		}
 	} else
 		ret = SU_OK;
 	if (sb->sb_flags & SB_AIO)
 		sowakeup_aio(so, sb);
 	/* Everything below runs with the buffer lock released. */
 	SOCKBUF_UNLOCK(sb);
 	if (ret == SU_ISCONNECTED)
 		soisconnected(so);
 	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGIO, 0);
 	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
 }
 
 /*
  * Socket buffer (struct sockbuf) utility routines.
  *
  * Each socket contains two socket buffers: one for sending data and one for
  * receiving data.  Each buffer contains a queue of mbufs, information about
  * the number of mbufs and amount of data in the queue, and other fields
  * allowing select() statements and notification on data availability to be
  * implemented.
  *
  * Data stored in a socket buffer is maintained as a list of records.  Each
  * record is a list of mbufs chained together with the m_next field.  Records
  * are chained together with the m_nextpkt field. The upper level routine
  * soreceive() expects the following conventions to be observed when placing
  * information in the receive buffer:
  *
  * 1. If the protocol requires each message be preceded by the sender's name,
  *    then a record containing that name must be present before any
  *    associated data (mbuf's must be of type MT_SONAME).
  * 2. If the protocol supports the exchange of ``access rights'' (really just
  *    additional data associated with the message), and there are ``rights''
  *    to be received, then a record containing this data should be present
  *    (mbuf's must be of type MT_RIGHTS).
  * 3. If a name or rights record exists, then it must be followed by a data
  *    record, perhaps of zero length.
  *
  * Before using a new socket structure it is first necessary to reserve
  * buffer space to the socket, by calling sbreserve().  This should commit
  * some of the available buffer space in the system buffer pool for the
  * socket (currently, it does nothing but enforce limits).  The space should
  * be released by calling sbrelease() when the socket is destroyed.
  */
 /*
  * Reserve sndcc bytes for the send buffer and rcvcc bytes for the
  * receive buffer of 'so', and give the low-water marks defaults when
  * they are still zero.  Returns 0 on success, or ENOBUFS if either
  * reservation is refused; a send reservation that succeeded is then
  * released again.
  */
 int
 soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
 {
 	struct thread *td = curthread;

 	/* Both buffer locks are held for the update: send first, then receive. */
 	SOCKBUF_LOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
 		goto bad;
 	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
 		goto bad2;
 	if (so->so_rcv.sb_lowat == 0)
 		so->so_rcv.sb_lowat = 1;
 	if (so->so_snd.sb_lowat == 0)
 		so->so_snd.sb_lowat = MCLBYTES;
 	/* The send low-water mark may not exceed the high-water mark. */
 	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
 		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 bad2:
 	/* The receive reservation failed: undo the send reservation. */
 	sbrelease_locked(&so->so_snd, so);
 bad:
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (ENOBUFS);
 }
 
 static int
 sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	u_long tmp_sb_max = sb_max;
 
 	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
 	if (error || !req->newptr)
 		return (error);
 	if (tmp_sb_max < MSIZE + MCLBYTES)
 		return (EINVAL);
 	sb_max = tmp_sb_max;
 	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
 	return (0);
 }
 	
 /*
  * Reserve cc bytes of space for a sockbuf and scale the mbuf limit
  * (sb_mbmax) with it so that mbuf accounting does not become the
  * limiting factor when buffering efficiency is near the normal case.
  * Returns 1 on success and 0 when the request is refused.
  */
 int
 sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
     struct thread *td)
 {
 	rlim_t limit;

 	SOCKBUF_LOCK_ASSERT(sb);

 	/* Requests above the adjusted global maximum are refused outright. */
 	if (cc > sb_max_adj)
 		return (0);

 	/*
 	 * A thread's socket buffer size limit is honoured when a thread
 	 * is supplied.  Callers usually pass curthread; the TCP input
 	 * path passes NULL because no suitable thread limits exist
 	 * there, and then no per-process limit is applied.
 	 */
 	limit = RLIM_INFINITY;
 	if (td != NULL)
 		limit = lim_cur(td, RLIMIT_SBSIZE);

 	if (chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, limit) == 0)
 		return (0);

 	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
 	/* Keep the low-water mark at or below the new high-water mark. */
 	if (sb->sb_hiwat < sb->sb_lowat)
 		sb->sb_lowat = sb->sb_hiwat;
 	return (1);
 }
 
 /*
  * Apply a buffer-size or low-water socket option (SO_SNDBUF, SO_RCVBUF,
  * SO_SNDLOWAT, SO_RCVLOWAT) with value cc.  For a listening socket the
  * values are stored in the sol_sb* fields of the socket; otherwise the
  * matching sockbuf is updated under its lock.  Returns 0, or ENOBUFS
  * when a buffer size is refused.
  *
  * NOTE(review): a cmd other than the four listed above would leave
  * 'sb' NULL (non-listening case) or lowat/hiwat/flags unset (listening
  * case); presumably callers only pass those four -- confirm.
  */
 int
 sbsetopt(struct socket *so, int cmd, u_long cc)
 {
 	struct sockbuf *sb;
 	short *flags;
 	u_int *hiwat, *lowat;
 	int error;

 	sb = NULL;
 	SOCK_LOCK(so);
 	/* First pass: pick the fields the option applies to. */
 	if (SOLISTENING(so)) {
 		switch (cmd) {
 			case SO_SNDLOWAT:
 			case SO_SNDBUF:
 				lowat = &so->sol_sbsnd_lowat;
 				hiwat = &so->sol_sbsnd_hiwat;
 				flags = &so->sol_sbsnd_flags;
 				break;
 			case SO_RCVLOWAT:
 			case SO_RCVBUF:
 				lowat = &so->sol_sbrcv_lowat;
 				hiwat = &so->sol_sbrcv_hiwat;
 				flags = &so->sol_sbrcv_flags;
 				break;
 		}
 	} else {
 		switch (cmd) {
 			case SO_SNDLOWAT:
 			case SO_SNDBUF:
 				sb = &so->so_snd;
 				break;
 			case SO_RCVLOWAT:
 			case SO_RCVBUF:
 				sb = &so->so_rcv;
 				break;
 		}
 		flags = &sb->sb_flags;
 		hiwat = &sb->sb_hiwat;
 		lowat = &sb->sb_lowat;
 		SOCKBUF_LOCK(sb);
 	}

 	/* Second pass: apply the new value. */
 	error = 0;
 	switch (cmd) {
 	case SO_SNDBUF:
 	case SO_RCVBUF:
 		if (SOLISTENING(so)) {
 			if (cc > sb_max_adj) {
 				error = ENOBUFS;
 				break;
 			}
 			*hiwat = cc;
 			if (*lowat > *hiwat)
 				*lowat = *hiwat;
 		} else {
 			if (!sbreserve_locked(sb, cc, so, curthread))
 				error = ENOBUFS;
 		}
 		/* A successfully set size turns off SB_AUTOSIZE. */
 		if (error == 0)
 			*flags &= ~SB_AUTOSIZE;
 		break;
 	case SO_SNDLOWAT:
 	case SO_RCVLOWAT:
 		/*
 		 * Make sure the low-water is never greater than the
 		 * high-water.
 		 */
 		*lowat = (cc > *hiwat) ? *hiwat : cc;
 		break;
 	}

 	if (!SOLISTENING(so))
 		SOCKBUF_UNLOCK(sb);
 	SOCK_UNLOCK(so);
 	return (error);
 }
 
 /*
  * Free mbufs held by a socket, and reserved mbuf space.
  *
  * The buffer is flushed, its sb_hiwat reservation is changed to 0
  * through chgsbsize() and sb_mbmax is reset to 0.  No lock assertion
  * is made here.
  */
 void
 sbrelease_internal(struct sockbuf *sb, struct socket *so)
 {

 	sbflush_internal(sb);
 	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
 	    RLIM_INFINITY);
 	sb->sb_mbmax = 0;
 }
 
 /* As sbrelease_internal(), asserting that the buffer lock is held. */
 void
 sbrelease_locked(struct sockbuf *sb, struct socket *so)
 {

 	SOCKBUF_LOCK_ASSERT(sb);

 	sbrelease_internal(sb, so);
 }
 
 /* As sbrelease_locked(), taking and dropping the buffer lock itself. */
 void
 sbrelease(struct sockbuf *sb, struct socket *so)
 {

 	SOCKBUF_LOCK(sb);
 	sbrelease_locked(sb, so);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Tear down a socket buffer: release its contents and reservation and,
  * with KERN_TLS, drop the reference to any attached TLS session.
  */
 void
 sbdestroy(struct sockbuf *sb, struct socket *so)
 {

 	sbrelease_internal(sb, so);
 #ifdef KERN_TLS
 	if (sb->sb_tls_info != NULL) {
 		ktls_free(sb->sb_tls_info);
 		sb->sb_tls_info = NULL;
 	}
 #endif
 }
 
 /*
  * Routines to add and remove data from an mbuf queue.
  *
  * The routines sbappend() or sbappendrecord() are normally called to append
  * new mbufs to a socket buffer, after checking that adequate space is
  * available, comparing the function sbspace() with the amount of data to be
  * added.  sbappendrecord() differs from sbappend() in that data supplied is
  * treated as the beginning of a new record.  To place a sender's address,
  * optional access rights, and data in a socket receive buffer,
  * sbappendaddr() should be used.  To place access rights and data in a
  * socket receive buffer, sbappendrights() should be used.  In either case,
  * the new data begins a new record.  Note that unlike sbappend() and
  * sbappendrecord(), these routines check for the caller that there will be
  * enough space to store the data.  Each fails if there is not enough space,
  * or if it cannot find mbufs to store additional information in.
  *
  * Reliable protocols may use the socket send buffer to hold data awaiting
  * acknowledgement.  Data is normally copied from a socket send buffer in a
  * protocol with m_copy for output to a peer, and then removing the data from
  * the socket buffer with sbdrop() or sbdroprecord() when the data is
  * acknowledged by the peer.
  */
 #ifdef SOCKBUF_DEBUG
 /*
  * Debug check: panic, after dumping the record chain, if sb_lastrecord
  * is not the last record reachable from sb_mb through m_nextpkt.
  */
 void
 sblastrecordchk(struct sockbuf *sb, const char *file, int line)
 {
 	struct mbuf *last, *rec;

 	SOCKBUF_LOCK_ASSERT(sb);

 	/* Follow the record links to the final record (NULL if empty). */
 	last = sb->sb_mb;
 	if (last != NULL) {
 		while (last->m_nextpkt != NULL)
 			last = last->m_nextpkt;
 	}
 	if (last == sb->sb_lastrecord)
 		return;

 	printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
 	    __func__, sb->sb_mb, sb->sb_lastrecord, last);
 	printf("packet chain:\n");
 	for (rec = sb->sb_mb; rec != NULL; rec = rec->m_nextpkt)
 		printf("\t%p\n", rec);
 	panic("%s from %s:%u", __func__, file, line);
 }
 
 /*
  * Debug check: panic, after dumping every record and its mbufs, if
  * sb_mbtail is not the last mbuf of the last record.
  */
 void
 sblastmbufchk(struct sockbuf *sb, const char *file, int line)
 {
 	struct mbuf *mb, *rec, *tail;

 	SOCKBUF_LOCK_ASSERT(sb);

 	/* Locate the last record, then the last mbuf within it. */
 	tail = sb->sb_mb;
 	if (tail != NULL) {
 		while (tail->m_nextpkt != NULL)
 			tail = tail->m_nextpkt;
 		while (tail->m_next != NULL)
 			tail = tail->m_next;
 	}
 	if (tail == sb->sb_mbtail)
 		return;

 	printf("%s: sb_mb %p sb_mbtail %p last %p\n",
 	    __func__, sb->sb_mb, sb->sb_mbtail, tail);
 	printf("packet tree:\n");
 	for (rec = sb->sb_mb; rec != NULL; rec = rec->m_nextpkt) {
 		printf("\t");
 		for (mb = rec; mb != NULL; mb = mb->m_next)
 			printf("%p ", mb);
 		printf("\n");
 	}
 	panic("%s from %s:%u", __func__, file, line);
 }
 #endif /* SOCKBUF_DEBUG */
 
 /*
  * Make m0 the last record of sb: link it after the current last record
  * through m_nextpkt, or install it as sb_mb when there is no last
  * record, then point sb_lastrecord at it.  Asserts the buffer lock.
  */
 #define SBLINKRECORD(sb, m0) do {					\
 	SOCKBUF_LOCK_ASSERT(sb);					\
 	if ((sb)->sb_lastrecord != NULL)				\
 		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
 	else								\
 		(sb)->sb_mb = (m0);					\
 	(sb)->sb_lastrecord = (m0);					\
 } while (/*CONSTCOND*/0)
 
 /*
  * Append mbuf chain m to the last record in the socket buffer sb.  The
  * additional space associated with the mbuf chain is recorded in sb.
  * Empty mbufs are discarded and mbufs are compacted where possible.
  *
  * If an mbuf of the last record carries M_EOR, the chain is appended
  * as a new record through sbappendrecord_locked() instead.  A NULL m
  * is a no-op.  The buffer lock must be held.
  */
 void
 sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags)
 {
 	struct mbuf *n;

 	SOCKBUF_LOCK_ASSERT(sb);

 	if (m == NULL)
 		return;
 	sbm_clrprotoflags(m, flags);
 	SBLASTRECORDCHK(sb);
 	n = sb->sb_mb;
 	if (n) {
 		/* Skip to the last record, then scan its mbufs for M_EOR. */
 		while (n->m_nextpkt)
 			n = n->m_nextpkt;
 		do {
 			if (n->m_flags & M_EOR) {
 				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
 				return;
 			}
 		} while (n->m_next && (n = n->m_next));
 	} else {
 		/*
 		 * XXX Would like to simply use sb_mbtail here, but
 		 * XXX I need to verify that I won't miss an EOR that
 		 * XXX way.
 		 */
 		if ((n = sb->sb_lastrecord) != NULL) {
 			do {
 				if (n->m_flags & M_EOR) {
 					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
 					return;
 				}
 			} while (n->m_next && (n = n->m_next));
 		} else {
 			/*
 			 * If this is the first record in the socket buffer,
 			 * it's also the last record.
 			 */
 			sb->sb_lastrecord = m;
 		}
 	}
 	/* n is the last mbuf of the last record here, or NULL if sb was empty. */
 	sbcompress(sb, m, n);
 	SBLASTRECORDCHK(sb);
 }
 
 /*
  * Locking wrapper around sbappend_locked(): takes the socket buffer
  * lock, appends mbuf chain m to the last record in sb, and drops the
  * lock again.
  */
 void
 sbappend(struct sockbuf *sb, struct mbuf *m, int flags)
 {

 	SOCKBUF_LOCK(sb);
 	sbappend_locked(sb, m, flags);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * This version of sbappend() should only be used when the caller absolutely
  * knows that there will never be more than one record in the socket buffer,
  * that is, a stream protocol (such as TCP).
  *
  * The buffer lock must be held.  With PRUS_NOTREADY set in 'flags' the
  * appended mbufs are marked M_NOTREADY.
  */
 void
 sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags)
 {
 	SOCKBUF_LOCK_ASSERT(sb);

 	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
 	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));

 	SBLASTMBUFCHK(sb);

 #ifdef KERN_TLS
 	/* Only done when a TLS session is attached to the buffer. */
 	if (sb->sb_tls_info != NULL)
 		ktls_seq(sb, m);
 #endif

 	/* Remove all packet headers and mbuf tags to get a pure data chain. */
 	m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0);

 	sbcompress(sb, m, sb->sb_mbtail);

 	/* A stream buffer holds a single record, so first and last coincide. */
 	sb->sb_lastrecord = sb->sb_mb;
 	SBLASTRECORDCHK(sb);
 }
 
 /*
  * Locking wrapper around sbappendstream_locked(): takes the socket
  * buffer lock for the duration of the append.  The same single-record
  * (stream protocol) restriction applies.
  */
 void
 sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags)
 {

 	SOCKBUF_LOCK(sb);
 	sbappendstream_locked(sb, m, flags);
 	SOCKBUF_UNLOCK(sb);
 }
 
 #ifdef SOCKBUF_DEBUG
 /*
  * Debug check: walk every mbuf of every record in sb, recompute the
  * available (acc), total (ccc) and mbuf-storage (mbcnt) byte counts,
  * and verify the sb_fnrdy / M_NOTAVAIL invariants.  Panics, naming the
  * calling file and line, on an empty mbuf, a bad readiness state or a
  * mismatch with the counters stored in sb.
  */
 void
 sbcheck(struct sockbuf *sb, const char *file, int line)
 {
 	struct mbuf *m, *n, *fnrdy;
 	u_long acc, ccc, mbcnt;

 	SOCKBUF_LOCK_ASSERT(sb);

 	acc = ccc = mbcnt = 0;
 	fnrdy = NULL;

 	for (m = sb->sb_mb; m; m = n) {
 	    n = m->m_nextpkt;
 	    for (; m; m = m->m_next) {
 		if (m->m_len == 0) {
 			printf("sb %p empty mbuf %p\n", sb, m);
 			goto fail;
 		}
 		/* The first not-ready mbuf seen must be the one sb_fnrdy names. */
 		if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) {
 			if (m != sb->sb_fnrdy) {
 				printf("sb %p: fnrdy %p != m %p\n",
 				    sb, sb->sb_fnrdy, m);
 				goto fail;
 			}
 			fnrdy = m;
 		}
 		/* From that mbuf on, every mbuf must carry an M_NOTAVAIL bit. */
 		if (fnrdy) {
 			if (!(m->m_flags & M_NOTAVAIL)) {
 				printf("sb %p: fnrdy %p, m %p is avail\n",
 				    sb, sb->sb_fnrdy, m);
 				goto fail;
 			}
 		} else
 			acc += m->m_len;
 		ccc += m->m_len;
 		mbcnt += MSIZE;
 		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
 			mbcnt += m->m_ext.ext_size;
 	    }
 	}
 	if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) {
 		printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n",
 		    acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt);
 		goto fail;
 	}
 	return;
 fail:
 	panic("%s from %s:%u", __func__, file, line);
 }
 #endif
 
 /*
  * Append mbuf chain m0 to socket buffer sb as the start of a new record.
  * The caller must hold the sockbuf lock.  A NULL m0 is ignored.
  */
 void
 sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
 {
 	struct mbuf *m;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (m0 == NULL)
 		return;
 	m_clrprotoflags(m0);
 	/*
 	 * Put the first mbuf on the queue.  Note this permits zero length
 	 * records.
 	 */
 	sballoc(sb, m0);
 	SBLASTRECORDCHK(sb);
 	SBLINKRECORD(sb, m0);
 	sb->sb_mbtail = m0;
 	/* Detach the rest of the chain; sbcompress() re-attaches it after m0. */
 	m = m0->m_next;
 	m0->m_next = 0;
 	/* Move an M_EOR mark from m0 to the next mbuf of the chain. */
 	if (m && (m0->m_flags & M_EOR)) {
 		m0->m_flags &= ~M_EOR;
 		m->m_flags |= M_EOR;
 	}
 	/* always call sbcompress() so it can do SBLASTMBUFCHK() */
 	sbcompress(sb, m, m0);
 }
 
 /*
  * Append mbuf chain m0 to socket buffer sb as a new record.  Self-locking
  * variant: takes the sockbuf lock around sbappendrecord_locked().
  */
 void
 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbappendrecord_locked(sb, m0);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Helper routine that appends data, control, and address to a sockbuf.
  *
  * A freshly allocated MT_SONAME mbuf holding a copy of asa is placed first,
  * followed by the control chain (if any) and then the data chain m0.
  * ctrl_last is the last mbuf of control, or NULL when there is no control
  * chain.  Returns 1 on success, 0 if the address mbuf cannot be allocated
  * (or, on small-MSIZE builds, if the address does not fit in one mbuf).
  * No space check is made here.
  */
 static int
 sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last)
 {
 	struct mbuf *m, *n, *nlast;
 #if MSIZE <= 256
 	if (asa->sa_len > MLEN)
 		return (0);
 #endif
 	m = m_get(M_NOWAIT, MT_SONAME);
 	if (m == NULL)
 		return (0);
 	/* Copy the address into the new mbuf. */
 	m->m_len = asa->sa_len;
 	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
 	if (m0) {
 		m_clrprotoflags(m0);
 		m_tag_delete_chain(m0, NULL);
 		/*
 		 * Clear some persistent info from pkthdr.
 		 * We don't use m_demote(), because some netgraph consumers
 		 * expect M_PKTHDR presence.
 		 */
 		m0->m_pkthdr.rcvif = NULL;
 		m0->m_pkthdr.flowid = 0;
 		m0->m_pkthdr.csum_flags = 0;
 		m0->m_pkthdr.fibnum = 0;
 		m0->m_pkthdr.rsstype = 0;
 	}
 	if (ctrl_last)
 		ctrl_last->m_next = m0;	/* concatenate data to control */
 	else
 		control = m0;
 	m->m_next = control;
 	/* Account for every mbuf of the new record; n ends on the last one. */
 	for (n = m; n->m_next != NULL; n = n->m_next)
 		sballoc(sb, n);
 	sballoc(sb, n);
 	nlast = n;
 	SBLINKRECORD(sb, m);
 
 	sb->sb_mbtail = nlast;
 	SBLASTMBUFCHK(sb);
 
 	SBLASTRECORDCHK(sb);
 	return (1);
 }
 
 /*
  * Append address and data, and optionally, control (ancillary) data to the
  * receive queue of a socket.  If present, m0 must include a packet header
  * with total length.  Returns 0 if no space in sockbuf or insufficient
  * mbufs.
  */
 int
 sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
 	struct mbuf *ctrl_last;
 	int space = asa->sa_len;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
 		panic("sbappendaddr_locked");
 	if (m0)
 		space += m0->m_pkthdr.len;
 	space += m_length(control, &ctrl_last);
 
 	if (space > sbspace(sb))
 		return (0);
 	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
 }
 
 /*
  * Same job as sbappendaddr_locked(), but the free space of the receiving
  * sockbuf is not examined.  When m0 is supplied it must carry a packet
  * header with the total length.  The caller holds the sockbuf lock.
  * Returns 0 if there are not enough mbufs.
  */
 int
 sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
 	struct mbuf *tail;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Find the end of the control chain, if there is one. */
 	if (control != NULL)
 		tail = m_last(control);
 	else
 		tail = NULL;
 	return (sbappendaddr_locked_internal(sb, asa, m0, control, tail));
 }
 
 /*
  * Append address and data, and optionally, control (ancillary) data to the
  * receive queue of a socket.  If present, m0 must include a packet header
  * with total length.  Returns 0 if no space in sockbuf or insufficient
  * mbufs.
  *
  * Self-locking variant: takes the sockbuf lock around a call to
  * sbappendaddr_locked() and returns its result.
  */
 int
 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
 	int retval;
 
 	SOCKBUF_LOCK(sb);
 	retval = sbappendaddr_locked(sb, asa, m0, control);
 	SOCKBUF_UNLOCK(sb);
 	return (retval);
 }
 
 /*
  * Append the control chain followed by the data chain m0 to sb as a new
  * record.  control must not be NULL: it is passed to m_last() and walked
  * unconditionally.  No space check is made.
  *
  * NOTE(review): unlike the other *_locked routines in this file, there is
  * no SOCKBUF_LOCK_ASSERT(sb) here; presumably the caller must still hold
  * the sockbuf lock (sbappendcontrol() does) -- confirm against callers.
  */
 void
 sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
     struct mbuf *control, int flags)
 {
 	struct mbuf *m, *mlast;
 
 	sbm_clrprotoflags(m0, flags);
 	/* Hook the data chain onto the end of the control chain. */
 	m_last(control)->m_next = m0;
 
 	SBLASTRECORDCHK(sb);
 
 	/* Account for every mbuf of the record; m ends on the last one. */
 	for (m = control; m->m_next; m = m->m_next)
 		sballoc(sb, m);
 	sballoc(sb, m);
 	mlast = m;
 	SBLINKRECORD(sb, control);
 
 	sb->sb_mbtail = mlast;
 	SBLASTMBUFCHK(sb);
 
 	SBLASTRECORDCHK(sb);
 }
 
 /*
  * Self-locking variant of sbappendcontrol_locked(): takes the sockbuf lock
  * around the call and passes all arguments through unchanged.
  */
 void
 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control,
     int flags)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbappendcontrol_locked(sb, m0, control, flags);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
  * (n).  If (n) is NULL, the buffer is presumed empty.
  *
  * When the data is compressed, mbufs in the chain may be handled in one of
  * three ways:
  *
  * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
  *     record boundary, and no change in data type).
  *
  * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
  *     an mbuf already in the socket buffer.  This can occur if an
  *     appropriate mbuf exists, there is room, both mbufs are not marked as
  *     not ready, and no merging of data types will occur.
  *
  * (3) The mbuf may be appended to the end of the existing mbuf chain.
  *
  * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
  * end-of-record.
  */
 void
 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
 {
 	int eor = 0;
 	struct mbuf *o;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Walk the incoming chain; n always tracks the current tail in sb. */
 	while (m) {
 		/* Remember M_EOR from any incoming mbuf, even one we drop. */
 		eor |= m->m_flags & M_EOR;
 		/*
 		 * Case (1): drop an empty mbuf, unless an end-of-record mark
 		 * has been seen and there is no neighbour (next mbuf in the
 		 * chain, or else n) of the same type.
 		 */
 		if (m->m_len == 0 &&
 		    (eor == 0 ||
 		     (((o = m->m_next) || (o = n)) &&
 		      o->m_type == m->m_type))) {
 			/* Don't leave sb_lastrecord pointing at a freed mbuf. */
 			if (sb->sb_lastrecord == m)
 				sb->sb_lastrecord = m->m_next;
 			m = m_free(m);
 			continue;
 		}
 		/*
 		 * Case (2): coalesce by copying m's data into the trailing
 		 * space of n, when every condition below allows it.
 		 */
 		if (n && (n->m_flags & M_EOR) == 0 &&
 		    M_WRITABLE(n) &&
 		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
 		    !(m->m_flags & M_NOTREADY) &&
 		    !(n->m_flags & (M_NOTREADY | M_NOMAP)) &&
 		    !mbuf_has_tls_session(m) &&
 		    !mbuf_has_tls_session(n) &&
 		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
 		    m->m_len <= M_TRAILINGSPACE(n) &&
 		    n->m_type == m->m_type) {
 			m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
 			n->m_len += m->m_len;
 			/* Byte counters are adjusted by hand; no new mbuf is added. */
 			sb->sb_ccc += m->m_len;
 			/* sb_acc only grows while no not-ready mbuf is recorded. */
 			if (sb->sb_fnrdy == NULL)
 				sb->sb_acc += m->m_len;
 			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 				/* XXX: Probably don't need.*/
 				sb->sb_ctl += m->m_len;
 			m = m_free(m);
 			continue;
 		}
 		/*
 		 * Small M_NOMAP mbuf that is ready and carries no TLS session:
 		 * try mb_unmapped_compress(); its result is deliberately
 		 * ignored.
 		 */
 		if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) &&
 		    (m->m_flags & M_NOTREADY) == 0 &&
 		    !mbuf_has_tls_session(m))
 			(void)mb_unmapped_compress(m);
 		/* Case (3): link m in as the new tail and account for it. */
 		if (n)
 			n->m_next = m;
 		else
 			sb->sb_mb = m;
 		sb->sb_mbtail = m;
 		sballoc(sb, m);
 		n = m;
 		/* M_EOR is cleared here and re-applied to the final tail below. */
 		m->m_flags &= ~M_EOR;
 		m = m->m_next;
 		n->m_next = 0;
 	}
 	if (eor) {
 		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
 		n->m_flags |= eor;
 	}
 	SBLASTMBUFCHK(sb);
 }
 
 /*
  * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
  *
  * Repeatedly cuts sb_ccc bytes off the front of the buffer and frees the
  * returned mbufs until sb_mbcnt drops to zero.
  */
 static void
 sbflush_internal(struct sockbuf *sb)
 {
 
 	while (sb->sb_mbcnt) {
 		/*
 		 * Don't call sbcut(sb, 0) if the leading mbuf is non-empty:
 		 * we would loop forever.  Break out instead and let the
 		 * KASSERT below report the inconsistent counters.
 		 */
 		if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len))
 			break;
 		m_freem(sbcut_internal(sb, (int)sb->sb_ccc));
 	}
 	KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
 	    ("%s: ccc %u mb %p mbcnt %u", __func__,
 	    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
 }
 
 /*
  * Flush sb via sbflush_internal().  The caller must hold the sockbuf lock.
  */
 void
 sbflush_locked(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	sbflush_internal(sb);
 }
 
 /*
  * Self-locking variant of sbflush_locked(): takes the sockbuf lock around
  * the flush.
  */
 void
 sbflush(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbflush_locked(sb);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Cut data from (the front of) a sockbuf.
  *
  * Removes len bytes starting at sb_mb, crossing record boundaries if
  * needed.  Fully consumed mbufs are unaccounted with sbfree() and pushed
  * onto a list linked through m_next, which is returned for the caller to
  * free; M_NOTREADY mbufs are unaccounted but left off that list.  A
  * partially consumed mbuf is trimmed in place.  len must be in the range
  * [0, sb_ccc].
  */
 static struct mbuf *
 sbcut_internal(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *next, *mfree;
 
 	KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0",
 	    __func__, len));
 	KASSERT(len <= sb->sb_ccc, ("%s: len: %d is > ccc: %u",
 	    __func__, len, sb->sb_ccc));
 
 	/* m walks the current record; next is the record that follows it. */
 	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
 	mfree = NULL;
 
 	while (len > 0) {
 		/* Current record exhausted: move on to the next one. */
 		if (m == NULL) {
 			KASSERT(next, ("%s: no next, len %d", __func__, len));
 			m = next;
 			next = m->m_nextpkt;
 		}
 		/* The cut ends inside this mbuf: trim its front and stop. */
 		if (m->m_len > len) {
 			KASSERT(!(m->m_flags & M_NOTAVAIL),
 			    ("%s: m %p M_NOTAVAIL", __func__, m));
 			m->m_len -= len;
 			m->m_data += len;
 			sb->sb_ccc -= len;
 			sb->sb_acc -= len;
 			if (sb->sb_sndptroff != 0)
 				sb->sb_sndptroff -= len;
 			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 				sb->sb_ctl -= len;
 			break;
 		}
 		/* Whole mbuf consumed: unaccount it and take it off the chain. */
 		len -= m->m_len;
 		sbfree(sb, m);
 		/*
 		 * Do not put M_NOTREADY buffers to the free list, they
 		 * are referenced from outside.
 		 */
 		if (m->m_flags & M_NOTREADY)
 			m = m->m_next;
 		else {
 			struct mbuf *n;
 
 			/* Push m onto the head of the returned free list. */
 			n = m->m_next;
 			m->m_next = mfree;
 			mfree = m;
 			m = n;
 		}
 	}
 	/*
 	 * Free any zero-length mbufs from the buffer.
 	 * For SOCK_DGRAM sockets such mbufs represent empty records.
 	 * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer,
 	 * when sosend_generic() needs to send only control data.
 	 */
 	while (m && m->m_len == 0) {
 		struct mbuf *n;
 
 		sbfree(sb, m);
 		n = m->m_next;
 		m->m_next = mfree;
 		mfree = m;
 		m = n;
 	}
 	/* Whatever remains of the current record becomes the new head. */
 	if (m) {
 		sb->sb_mb = m;
 		m->m_nextpkt = next;
 	} else
 		sb->sb_mb = next;
 	/*
 	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
 	 * sb_lastrecord is up-to-date if we dropped part of the last record.
 	 */
 	m = sb->sb_mb;
 	if (m == NULL) {
 		sb->sb_mbtail = NULL;
 		sb->sb_lastrecord = NULL;
 	} else if (m->m_nextpkt == NULL) {
 		sb->sb_lastrecord = m;
 	}
 
 	return (mfree);
 }
 
 /*
  * Drop data from (the front of) a sockbuf.
  *
  * The caller must hold the sockbuf lock; the mbufs cut off by
  * sbcut_internal() are freed here, while the lock is still held.
  */
 void
 sbdrop_locked(struct sockbuf *sb, int len)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	m_freem(sbcut_internal(sb, len));
 }
 
 /*
  * Drop data from (the front of) a sockbuf,
  * and return it to caller.
  *
  * The caller must hold the sockbuf lock.  The return value is whatever
  * sbcut_internal() returns: the chain of removed mbufs, which this routine
  * does not free.
  */
 struct mbuf *
 sbcut_locked(struct sockbuf *sb, int len)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	return (sbcut_internal(sb, len));
 }
 
 /*
  * Drop len bytes from the front of sb.  Self-locking variant: the cut is
  * done under the sockbuf lock, and the removed mbufs are freed only after
  * the lock has been released.
  */
 void
 sbdrop(struct sockbuf *sb, int len)
 {
 	struct mbuf *mfree;
 
 	SOCKBUF_LOCK(sb);
 	mfree = sbcut_internal(sb, len);
 	SOCKBUF_UNLOCK(sb);
 
 	m_freem(mfree);
 }
 
 /*
  * Pick the mbuf from which a send at buffer offset off should start,
  * without advancing the cached send pointer.  *moff receives the offset
  * relative to the returned mbuf.  When no send pointer is cached yet it is
  * initialised to the head of the buffer with a cached offset of zero.
  */
 struct mbuf *
 sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff)
 {
 
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
 
 	/* Nothing cached: start from the head and seed the cache. */
 	if (sb->sb_sndptr == NULL) {
 		*moff = off;
 		sb->sb_sndptr = sb->sb_mb;
 		sb->sb_sndptroff = 0;
 		return (sb->sb_mb);
 	}
 
 	/* Request lies before the cached position: start from the head. */
 	if (sb->sb_sndptroff > off) {
 		*moff = off;
 		return (sb->sb_mb);
 	}
 
 	/* Otherwise start at the cached mbuf, with the offset rebased. */
 	*moff = off - sb->sb_sndptroff;
 	return (sb->sb_sndptr);
 }
 
 /*
  * After a small copy of len bytes that started at mbuf mb, move the cached
  * send pointer (sb_sndptr / sb_sndptroff) forward over every mbuf that was
  * consumed completely.  Nothing is done if the copy did not start at the
  * cached send pointer.  The pointer is never advanced past the last mbuf
  * of the chain.
  */
 void
 sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len)
 {
 	struct mbuf *cur;
 
 	/* Did not copyout at the same mbuf */
 	if (mb != sb->sb_sndptr)
 		return;
 
 	for (cur = mb; cur != NULL && len > 0; cur = cur->m_next) {
 		/* Copy ended inside this mbuf: the pointer stays here. */
 		if (len < cur->m_len)
 			break;
 		len -= cur->m_len;
 		if (cur->m_next != NULL) {
 			sb->sb_sndptroff += cur->m_len;
 			sb->sb_sndptr = cur->m_next;
 		}
 	}
 }
 
 /*
  * Locate the mbuf that contains send offset off and report, through
  * *moff, the offset of that byte within the mbuf.  The cached
  * "sb_sndptr"/"sb_sndptroff" fields are read but never modified.  The
  * result is NULL if the chain ends before the offset is reached.
  */
 struct mbuf *
 sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff)
 {
 	struct mbuf *m;
 
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
 
 	/*
 	 * Start from the head by default; this also covers retransmits,
 	 * where "off" lies below the stored offset.  Only use the cached
 	 * pointer when it exists and is not beyond "off".
 	 */
 	m = sb->sb_mb;
 	if (sb->sb_sndptr != NULL && sb->sb_sndptroff <= off) {
 		m = sb->sb_sndptr;
 		off -= sb->sb_sndptroff;
 	}
 
 	/* Skip every mbuf that lies entirely before the offset. */
 	for (; off > 0 && m != NULL && off >= m->m_len; m = m->m_next)
 		off -= m->m_len;
 
 	*moff = off;
 	return (m);
 }
 
 /*
  * Drop a record off the front of a sockbuf and move the next record to the
  * front.
  */
 void
 sbdroprecord_locked(struct sockbuf *sb)
 {
 	struct mbuf *m;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	m = sb->sb_mb;
 	if (m) {
 		sb->sb_mb = m->m_nextpkt;
 		do {
 			sbfree(sb, m);
 			m = m_free(m);
 		} while (m);
 	}
 	SB_EMPTY_FIXUP(sb);
 }
 
 /*
  * Drop a record off the front of a sockbuf and move the next record to the
  * front.
  *
  * Self-locking variant: takes the sockbuf lock around
  * sbdroprecord_locked().
  */
 void
 sbdroprecord(struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK(sb);
 	sbdroprecord_locked(sb);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Create a "control" mbuf containing the specified data with the specified
  * type for presentation on a socket buffer.
  *
  * p     - payload to copy after the cmsghdr, or NULL to leave it zeroed.
  * size  - payload length in bytes; must be non-negative.
  * type  - stored in cmsg_type.
  * level - stored in cmsg_level.
  *
  * Returns the new MT_CONTROL mbuf, or NULL if size is negative, if the
  * padded message would not fit in a cluster (MCLBYTES), or if no mbuf is
  * available (allocation is M_NOWAIT).
  */
 struct mbuf *
 sbcreatecontrol(caddr_t p, int size, int type, int level)
 {
 	struct cmsghdr *cp;
 	struct mbuf *m;
 	u_int len;
 
 	/*
 	 * Reject negative sizes explicitly rather than relying on the
 	 * unsigned cast making CMSG_SPACE() exceed MCLBYTES: a negative
 	 * value reaching the memcpy() below would be converted to a huge
 	 * unsigned length.
 	 */
 	if (size < 0)
 		return (NULL);
 	len = (u_int)size;
 
 	if (CMSG_SPACE(len) > MCLBYTES)
 		return (NULL);
 	/* Use a cluster only when the message does not fit a plain mbuf. */
 	if (CMSG_SPACE(len) > MLEN)
 		m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
 	else
 		m = m_get(M_NOWAIT, MT_CONTROL);
 	if (m == NULL)
 		return (NULL);
 	cp = mtod(m, struct cmsghdr *);
 	m->m_len = 0;
 	KASSERT(CMSG_SPACE(len) <= M_TRAILINGSPACE(m),
 	    ("sbcreatecontrol: short mbuf"));
 	/*
 	 * Don't leave the padding between the msg header and the
 	 * cmsg data and the padding after the cmsg data un-initialized.
 	 */
 	bzero(cp, CMSG_SPACE(len));
 	if (p != NULL)
 		(void)memcpy(CMSG_DATA(cp), p, len);
 	m->m_len = CMSG_SPACE(len);
 	cp->cmsg_len = CMSG_LEN(len);
 	cp->cmsg_level = level;
 	cp->cmsg_type = type;
 	return (m);
 }
 
 /*
  * This does the same for socket buffers that sotoxsocket does for sockets:
  * generate a user-format data structure describing the socket buffer.  Note
  * that the xsockbuf structure, since it is always embedded in a socket, does
  * not include a self pointer nor a length.  We make this entry point public
  * in case some other mechanism needs it.
  *
  * Every field is a plain copy of the same-named sockbuf field, except that
  * xsb->sb_cc is filled from sb->sb_ccc.  No locking is done here.
  */
 void
 sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
 {
 
 	xsb->sb_cc = sb->sb_ccc;
 	xsb->sb_hiwat = sb->sb_hiwat;
 	xsb->sb_mbcnt = sb->sb_mbcnt;
 	xsb->sb_mcnt = sb->sb_mcnt;	
 	xsb->sb_ccnt = sb->sb_ccnt;
 	xsb->sb_mbmax = sb->sb_mbmax;
 	xsb->sb_lowat = sb->sb_lowat;
 	xsb->sb_flags = sb->sb_flags;
 	xsb->sb_timeo = sb->sb_timeo;
 }
 
 /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
 static int dummy;
 SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW | CTLFLAG_SKIP, &dummy, 0, "");
 /* maxsockbuf under kern.ipc: backed by sb_max, handled by sysctl_handle_sb_max(). */
 SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
     CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &sb_max, 0,
     sysctl_handle_sb_max, "LU",
     "Maximum socket buffer size");
 /* sockbuf_waste_factor under kern.ipc: read/write, backed by sb_efficiency. */
 SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
     &sb_efficiency, 0, "Socket buffer size waste factor");
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c	(revision 360578)
+++ head/sys/netinet/ip_output.c	(revision 360579)
@@ -1,1588 +1,1588 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef MBUF_STRESS_TEST
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);
 
 
 extern int in_mcast_loop;
 extern	struct protosw inetsw[];
 
 /*
  * Run the outbound pfil(9) hooks on the packet *mp and react to what they
  * did to it.
  *
  * Return value:
  *    1  the packet is finished with here (dropped or consumed by a hook,
  *       or queued to netisr for local delivery);
  *   -1  the caller must redo its lookup because the destination address,
  *       the FIB or the next hop was changed (dst / *fibnum are updated);
  *    0  nothing relevant changed, continue with normal output.
  *
  * *error is set to EACCES when a hook drops the packet, to 0 when the hooks
  * pass it, and to the netisr_queue() result on the local-delivery paths; it
  * is left untouched when a hook consumes the packet.  *mp may be replaced
  * by the hooks and is re-read afterwards.
  */
 static inline int
 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
     struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
 {
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *m;
 	struct in_addr odst;
 	struct ip *ip;
 	int pflags = PFIL_OUT;
 
 	if (flags & IP_FORWARDING)
 		pflags |= PFIL_FWD;
 
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* Run through list of hooks for output packets. */
 	/* Save the destination first so a rewrite by a hook can be detected. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) {
 	case PFIL_DROPPED:
 		*error = EACCES;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		return 1; /* Finished */
 	case PFIL_PASS:
 		*error = 0;
 	}
 	/* The hooks may have replaced the mbuf; reload both pointers. */
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			/* Mark the checksums as already valid. */
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 				CSUM_IP_CHECKED | CSUM_IP_VALID;
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			*error = netisr_queue(NETISR_IP, m);
 			return 1; /* Finished */
 		}
 
 		/* Rebuild dst from the rewritten header for the new lookup. */
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 
 		return -1; /* Reloop */
 	}
 	/* See if fib was changed by packet filter. */
 	if ((*fibnum) != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		*fibnum = M_GETFIB(m);
 		return -1; /* Reloop for FIB change */
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		*error = netisr_queue(NETISR_IP, m);
 		return 1; /* Finished */
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		/* The new next hop is the sockaddr_in stored right after the tag. */
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 
 		return -1; /* Reloop for CHANGE of dst */
 	}
 
 	return 0;
 }
 
 /*
  * Hand one packet to the interface's if_output routine, optionally
  * stamping a send tag on it first.
  *
  * The send tag comes from the TLS session of the record in m->m_next
  * (KERN_TLS) or, failing that, from the inpcb (RATELIMIT).  It is stamped
  * on the mbuf only when stamp_tag is true, and only if the tag belongs to
  * ifp; otherwise the packet is not sent and the error is EAGAIN.  Returns
  * the if_output error, or the result of ktls_output_eagain() when a TLS
  * send ended in EAGAIN.
  */
 static int
 ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr_in *gw, struct route *ro, bool stamp_tag)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	struct m_snd_tag *mst;
 	int error;
 
 	/* The mbuf must not already carry a send tag. */
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mst = NULL;
 
 #ifdef KERN_TLS
 	/*
 	 * If this is an unencrypted TLS record, save a reference to
 	 * the record.  This local reference is used to call
 	 * ktls_output_eagain after the mbuf has been freed (thus
 	 * dropping the mbuf's reference) in if_output.
 	 */
 	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
-		tls = ktls_hold(m->m_next->m_ext_pgs.tls);
+		tls = ktls_hold(m->m_next->m_epg_tls);
 		mst = tls->snd_tag;
 
 		/*
 		 * If a TLS session doesn't have a valid tag, it must
 		 * have had an earlier ifp mismatch, so drop this
 		 * packet.
 		 */
 		if (mst == NULL) {
 			error = EAGAIN;
 			goto done;
 		}
 	}
 #endif
 #ifdef RATELIMIT
 	/* No TLS tag: fall back to the inpcb's rate-limit send tag. */
 	if (inp != NULL && mst == NULL) {
 		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
 		    (inp->inp_snd_tag != NULL &&
 		    inp->inp_snd_tag->ifp != ifp))
 			in_pcboutput_txrtlmt(inp, ifp, m);
 
 		if (inp->inp_snd_tag != NULL)
 			mst = inp->inp_snd_tag;
 	}
 #endif
 	if (stamp_tag && mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		/* A tag for a different interface cannot be used here. */
 		if (mst->ifp != ifp) {
 			error = EAGAIN;
 			goto done;
 		}
 
 		/* stamp send tag on mbuf */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
 		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	}
 
 	error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro);
 
 done:
 	/* Check for route change invalidating send tags. */
 #ifdef KERN_TLS
 	if (tls != NULL) {
 		if (error == EAGAIN)
 			error = ktls_output_eagain(inp, tls);
 		/* Drop the reference taken by ktls_hold() above. */
 		ktls_free(tls);
 	}
 #endif
 #ifdef RATELIMIT
 	/*
 	 * NOTE(review): inp is passed on without the NULL check used in the
 	 * RATELIMIT block above -- confirm in_pcboutput_eagain() tolerates a
 	 * NULL inp or that EAGAIN cannot occur without one.
 	 */
 	if (error == EAGAIN)
 		in_pcboutput_eagain(inp);
 #endif
 	return (error);
 }
 
 /*
  * Mirror the reject / blackhole / gateway state of the next hop attached
  * to ro (ro->ro_nh) into the corresponding RT_* bits of ro->ro_flags.
  * All other bits of ro_flags are left alone.
  */
 static inline void
 rt_update_ro_flags(struct route *ro)
 {
 	int nhf;
 
 	nhf = ro->ro_nh->nh_flags;
 
 	/* Start from a clean slate for the three mirrored bits. */
 	ro->ro_flags &= ~(RT_REJECT | RT_BLACKHOLE | RT_HAS_GW);
 
 	if (nhf & NHF_REJECT)
 		ro->ro_flags |= RT_REJECT;
 	if (nhf & NHF_BLACKHOLE)
 		ro->ro_flags |= RT_BLACKHOLE;
 	if (nhf & NHF_GATEWAY)
 		ro->ro_flags |= RT_HAS_GW;
 }
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_nh initialized, the route lookup is
  * skipped and ro->ro_nh is used.  If ro is present but ro->ro_nh is NULL,
  * then the result of the route lookup is stored in ro->ro_nh.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int error = 0;
 	struct sockaddr_in *dst, sin;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
 	struct in_addr src;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	uint32_t fibnum;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	int no_route_but_check_spd = 0;
 #endif
 
 	M_ASSERTPKTHDR(m);
 	NET_EPOCH_ASSERT();
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip_fillid(ip);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 	if ((flags & IP_FORWARDING) == 0)
 		IPSTAT_INC(ips_localout);
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	if (ro != NULL)
 		dst = (struct sockaddr_in *)&ro->ro_dst;
 	else
 		dst = &sin;
 	if (ro == NULL || ro->ro_nh == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 	gw = dst;
 again:
 	/*
 	 * Validate route against routing table additions;
 	 * a better/more specific route might have been added.
 	 */
 	if (inp != NULL && ro != NULL && ro->ro_nh != NULL)
 		NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
 	 * and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing the
 	 * cache with IPv6.
 	 * Also check whether routing cache needs invalidation.
 	 */
 	if (ro != NULL && ro->ro_nh != NULL &&
 	    ((!NH_IS_VALID(ro->ro_nh)) || !RT_LINK_IS_UP(ro->ro_nh->nh_ifp) ||
 	    dst->sin_family != AF_INET ||
 	    dst->sin_addr.s_addr != ip->ip_dst.s_addr))
 		RO_INVALIDATE_CACHE(ro);
 	ia = NULL;
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
 						      M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
 						M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = ifp->if_flags & IFF_BROADCAST ?
 		    in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		mtu = ifp->if_mtu;
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		isbroadcast = 0;	/* fool gcc */
 		/* Interface may have no addresses. */
 		if (ia != NULL)
 			src = IA_SIN(ia)->sin_addr;
 		else
 			src.s_addr = INADDR_ANY;
 	} else if (ro != NULL) {
 		if (ro->ro_nh == NULL) {
 			/*
 			 * We want to do any cloning requested by the link
 			 * layer, as this is probably required in all cases
 			 * for correct operation (as it is for ARP).
 			 */
 			uint32_t flowid;
 #ifdef RADIX_MPATH
 			flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr);
 #else
 			flowid = m->m_pkthdr.flowid;
 #endif
 			ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
 			    NHR_REF, flowid);
 
 			if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh)) ||
 			    !RT_LINK_IS_UP(ro->ro_nh->nh_ifp)) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 				/*
 				 * There is no route for this packet, but it is
 				 * possible that a matching SPD entry exists.
 				 */
 				no_route_but_check_spd = 1;
 				mtu = 0; /* Silence GCC warning. */
 				goto sendit;
 #endif
 				IPSTAT_INC(ips_noroute);
 				error = EHOSTUNREACH;
 				goto bad;
 			}
 		}
 		ia = ifatoia(ro->ro_nh->nh_ifa);
 		ifp = ro->ro_nh->nh_ifp;
 		counter_u64_add(ro->ro_nh->nh_pksent, 1);
 		rt_update_ro_flags(ro);
 		if (ro->ro_nh->nh_flags & NHF_GATEWAY)
 			gw = &ro->ro_nh->gw4_sa;
 		if (ro->ro_nh->nh_flags & NHF_HOST)
 			isbroadcast = (ro->ro_nh->nh_flags & NHF_BROADCAST);
 		else if (ifp->if_flags & IFF_BROADCAST)
 			isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
 		else
 			isbroadcast = 0;
 		if (ro->ro_nh->nh_flags & NHF_HOST)
 			mtu = ro->ro_nh->nh_mtu;
 		else
 			mtu = ifp->if_mtu;
 		src = IA_SIN(ia)->sin_addr;
 	} else {
 		struct nhop4_extended nh;
 
 		bzero(&nh, sizeof(nh));
 		if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) !=
 		    0) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			mtu = 0; /* Silence GCC warning. */
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ifp = nh.nh_ifp;
 		mtu = nh.nh_mtu;
 		/*
 		 * We are rewriting here dst to be gw actually, contradicting
 		 * comment at the beginning of the function. However, in this
 		 * case we are always dealing with on stack dst.
 		 * In case if pfil(9) sends us back to beginning of the
 		 * function, the dst would be rewritten by ip_output_pfil().
 		 */
 		MPASS(dst == &sin);
 		dst->sin_addr = nh.nh_addr;
 		ia = nh.nh_ia;
 		src = nh.nh_src;
 		isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
 		    (NHF_HOST | NHF_BROADCAST)) ||
 		    ((ifp->if_flags & IFF_BROADCAST) &&
 		    in_ifaddr_broadcast(dst->sin_addr, ia)));
 	}
 
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
 	    __func__, mtu, ro,
 	    (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY)
 			ip->ip_src = src;
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outgoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY)
 		ip->ip_src = src;
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
 		switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
 		    &error)) {
 		case 1: /* Finished */
 			goto done;
 
 		case 0: /* Continue normally */
 			ip = mtod(m, struct ip *);
 			break;
 
 		case -1: /* Need to try again */
 			/* Reset everything for a new round */
 			if (ro != NULL) {
 				RO_NHFREE(ro);
 				ro->ro_prepend = NULL;
 			}
 			gw = dst;
 			ip = mtod(m, struct ip *);
 			goto again;
 
 		}
 	}
 
 	/* IN_LOOPBACK must not appear on the wire - RFC1122. */
 	if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IPSTAT_INC(ips_odropped);
 			error = ENOBUFS;
 			goto bad;
 		}
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	} else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IPSTAT_INC(ips_odropped);
 			error = ENOBUFS;
 			goto bad;
 		}
 	}
 #ifdef SCTP
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IPSTAT_INC(ips_odropped);
 			error = ENOBUFS;
 			goto bad;
 		}
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags & CSUM_TSO)
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = ip_output_send(inp, ifp, m, gw, ro,
 		    (flags & IP_NO_SND_TAG_RL) ? false : true);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing upper layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
 			error = ip_output_send(inp, ifp, m, gw, ro, true);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	return (error);
  bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * The fragments are linked through m_nextpkt; the original packet is trimmed
  * in place and serves as the first fragment of the chain.
  *
  * ip is the IP header of the packet being fragmented (presumably located at
  * the start of *m_frag); on success its ip_len, ip_off and ip_sum fields are
  * rewritten to describe the first fragment.
  * mtu is the maximum size of a fragment, IP header included.
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist);
  * the IP header checksum of a fragment is computed here only when the packet
  * requests CSUM_IP and if_hwassist_flags does not offer it.
  *
  * Return 0 if no error.  EMSGSIZE is returned, with *m_frag left untouched,
  * if the header has IP_DF set or if fewer than 8 bytes of payload fit in a
  * fragment.  ENOBUFS is returned if an mbuf operation fails.  If error,
  * m_frag may contain a partially built chain of fragments that should be
  * freed by the caller; it is NULL if mb_unmapped_to_ext() failed.
  *
  * NOTE(review): fragments built before a failure stay linked to the original
  * packet through m_nextpkt, so a caller that only releases the head's m_next
  * chain presumably does not release them -- confirm callers walk m_nextpkt.
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	/* Work on host byte order copies of the length and offset fields. */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 * This also rejects an mtu smaller than the header, for which
 	 * len is negative.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		m0 = mb_unmapped_to_ext(m0);
 		if (m0 == NULL) {
 			/* m0 is NULL here, so *m_frag becomes NULL at done. */
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	/* Same treatment for a deferred SCTP checksum. */
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		m0 = mb_unmapped_to_ext(m0);
 		if (m0 == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 
 		off = MIN(mtu, m0->m_pkthdr.len);
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		/* Payload size of later fragments: mtu rounded down to pages. */
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	/*
 	 * off is now the offset in the original packet at which the second
 	 * fragment's data starts; everything before it stays in m0.
 	 */
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		if (hlen > sizeof (struct ip)) {
 			/* Options present: header length is recomputed. */
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/*
 		 * Fragment offset in 8-byte units, kept in host byte order
 		 * until the htons() further down.
 		 */
 		/* XXX do we need to add ip_off below ? */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		/* The last fragment is shortened and carries no IP_MF. */
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		/* Checksum the header in software if hw will not do it. */
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		/* Append the finished fragment to the m_nextpkt chain. */
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.  The first fragment always has IP_MF set.
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	/* Hand back the (possibly replaced or NULL) head of the chain. */
 	*m_frag = m0;
 	return error;
 }
 
 /*
  * Compute in software a transport checksum that was deferred, and store it
  * into the packet.  The mbuf chain starts with the IP header;
  * m_pkthdr.csum_data holds the offset of the checksum field relative to
  * the end of that header.
  */
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *iph;
 	uint16_t datalen, pos, sum;
 
 	iph = mtod(m, struct ip *);
 	pos = iph->ip_hl << 2;		/* length of the IP header */
 
 	if ((m->m_pkthdr.csum_flags & CSUM_UDP) == 0) {
 		/* Not UDP: the length to sum comes from the IP header. */
 		datalen = ntohs(iph->ip_len);
 		sum = in_cksum_skip(m, datalen, pos);
 	} else {
 		if (pos + sizeof(struct udphdr) <= m->m_len) {
 			/* UDP header sits in the first mbuf: read in place. */
 			datalen = ntohs(
 			    ((struct udphdr *)mtodo(m, pos))->uh_ulen);
 		} else {
 			/* Header spans mbufs: copy the length field out. */
 			m_copydata(m, pos + offsetof(struct udphdr, uh_ulen),
 			    sizeof(datalen), (caddr_t)&datalen);
 			datalen = ntohs(datalen);
 		}
 		sum = in_cksum_skip(m, datalen + pos, pos);
 		/* A computed UDP checksum of zero is stored as all ones. */
 		if (sum == 0)
 			sum = 0xffff;
 	}
 
 	/* Advance from the end of the IP header to the checksum field. */
 	pos += m->m_pkthdr.csum_data;
 
 	if (pos + sizeof(sum) <= m->m_len)
 		*(u_short *)mtodo(m, pos) = sum;
 	else
 		m_copyback(m, pos, sizeof(sum), (caddr_t)&sum);
 }
 
 /*
  * IP socket option processing.
  *
  * Requests whose level is not IPPROTO_IP fail with EINVAL, except for a
  * handful of SOL_SOCKET "set" requests (SO_REUSEADDR, SO_REUSEPORT,
  * SO_REUSEPORT_LB, SO_SETFIB, SO_MAX_PACING_RATE) whose state is mirrored
  * from the socket into the inpcb.  IPPROTO_IP requests are dispatched on
  * sopt_dir (SOPT_SET or SOPT_GET) and then on sopt_name; multicast options
  * are handed to inp_setmoptions()/inp_getmoptions() and IP_IPSEC_POLICY to
  * IPSEC_PCBCTL() when IPsec is enabled.
  *
  * Returns 0 on success or an errno value; IPPROTO_IP option names that are
  * not handled yield ENOPROTOOPT.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		/*
 		 * Mirror selected socket-level options into the inpcb.
 		 * The value is read from the socket, not from sopt.
 		 */
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 
 			/* The options must fit in a single plain mbuf. */
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			/* The mbuf is handed over to ip_pcbopts(). */
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			/* Requests issued by a thread need the privilege. */
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 			/* All of these take a single int argument. */
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 /* Set or clear "bit" in inp_flags according to optval, under the lock. */
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 /* Set or clear "bit" in inp_flags2 according to val, under the lock. */
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				OPTSET2(INP_ORIGDSTADDR, optval);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			/* INP_LOWPORT and INP_HIGHPORT are never both set. */
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			INP_RLOCK(inp);
 			if (inp->inp_options) {
 				struct mbuf *options;
 
 				/*
 				 * Copy the options while the lock is held;
 				 * the copyout happens after it is dropped.
 				 */
 				options = m_copym(inp->inp_options, 0,
 				    M_COPYALL, M_NOWAIT);
 				INP_RUNLOCK(inp);
 				if (options != NULL) {
 					error = sooptcopyout(sopt,
 							     mtod(options, char *),
 							     options->m_len);
 					m_freem(options);
 				} else
 					error = ENOMEM;
 			} else {
 				/* No options set: report a zero-length value. */
 				INP_RUNLOCK(inp);
 				sopt->sopt_valsize = 0;
 			}
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 			/* All of these return a single int in optval. */
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 /* Evaluate to 1 if "bit" is set in inp_flags / inp_flags2, else to 0. */
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				optval = OPTBIT2(INP_ORIGDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			}
 			/*
 			 * Do not let the copyout overwrite an error raised
 			 * while computing optval (IP_RSSBUCKETID sets EINVAL
 			 * when rss_hash2bucket() fails); report it instead
 			 * of returning a meaningless value.
 			 */
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Loop a copy of an outgoing IP multicast packet back to the input queue
  * of the given interface; called from ip_output().  The copy is delivered
  * through the loopback "driver" output routine even though ifp might NOT
  * be a loopback interface -- evil, but easier than replicating that code
  * here.  The caller's packet is left untouched; if the copy cannot be
  * made, nothing is looped back.
  */
 static void
 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
 {
 	struct mbuf *dup;
 	struct ip *iph;
 
 	/*
 	 * The checksums are filled in below, so work on a deep copy
 	 * rather than on the caller's packet.
 	 */
 	dup = m_dup(m, M_NOWAIT);
 	if (dup == NULL)
 		return;
 
 	/* The IP header must be writable and contiguous in the first mbuf. */
 	if (!M_WRITABLE(dup) || dup->m_len < hlen) {
 		dup = m_pullup(dup, hlen);
 		if (dup == NULL)
 			return;
 	}
 
 	/* Finish a deferred transport checksum and flag it as verified. */
 	if (dup->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(dup);
 		dup->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		dup->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		dup->m_pkthdr.csum_data = 0xffff;
 	}
 
 	/*
 	 * No fragmentation is attempted even if the IP length exceeds
 	 * the interface's MTU.  Can this possibly matter?
 	 */
 	iph = mtod(dup, struct ip *);
 	iph->ip_sum = 0;
 	iph->ip_sum = in_cksum(dup, hlen);
 	if_simloop(ifp, dup, AF_INET, 0);
 }
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c	(revision 360578)
+++ head/sys/netinet/tcp_output.c	(revision 360579)
@@ -1,2108 +1,2108 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/arb.h>
 #include <sys/domain.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/kernel.h>
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/qmath.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/stats.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/tcp.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Per-VNET tunables used by the TCP output path.  Each one is exported
  * read-write (CTLFLAG_VNET | CTLFLAG_RW) under the _net_inet_tcp sysctl node.
  */
 
 /*
  * Path MTU Discovery switch; initialized to 1.
  * NOTE(review): the value argument following the pointer is 1 here but 0 in
  * every other SYSCTL_INT below -- presumably it is ignored when a pointer is
  * supplied; confirm against sysctl(9).
  */
 VNET_DEFINE(int, path_mtu_discovery) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(path_mtu_discovery), 1,
 	"Enable Path MTU Discovery");
 
 /* TCP Segmentation Offload switch; initialized to 1. */
 VNET_DEFINE(int, tcp_do_tso) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_tso), 0,
 	"Enable TCP Segmentation Offload");
 
 /* Initial send socket buffer size; initialized to 32 * 1024. */
 VNET_DEFINE(int, tcp_sendspace) = 1024*32;
 #define	V_tcp_sendspace	VNET(tcp_sendspace)
 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
 
 /* Automatic send buffer sizing switch; initialized to 1. */
 VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_autosndbuf), 0,
 	"Enable automatic send buffer sizing");
 
 /* Step by which the automatic send buffer grows; initialized to 8 * 1024. */
 VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_inc), 0,
 	"Incrementor step size of automatic send buffer");
 
 /* Upper bound of the automatic send buffer; initialized to 2 * 1024 * 1024. */
 VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_max), 0,
 	"Max size of automatic send buffer");
 
 /*
  * Whether the auto send buffer growth threshold accounts for SO_SNDLOWAT;
  * initialized to 0.
  */
 VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0;
 #define	V_tcp_sendbuf_auto_lowat	VNET(tcp_sendbuf_auto_lowat)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_sendbuf_auto_lowat), 0,
 	"Modify threshold for auto send buffer growth to account for SO_SNDLOWAT");
 
 /*
  * Assert that either the retransmit or the persist timer is active for any
  * segment that carries data (len != 0) or has SYN or FIN set.  A zero-length
  * segment with neither SYN nor FIN set satisfies the assertion regardless
  * of the timer state.
  *
  * tp:       TCP control block whose timers are checked
  * len:      payload length of the segment
  * th_flags: TCP header flags of the segment
  */
 #define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags)			\
 	KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\
 	    tcp_timer_active((tp), TT_REXMT) ||				\
 	    tcp_timer_active((tp), TT_PERSIST),				\
 	    ("neither rexmt nor persist timer is set"))
 
 /* Forward declaration; the definition appears further down in this file. */
 static void inline	cc_after_idle(struct tcpcb *tp);
 
 #ifdef TCP_HHOOK
 /*
  * Wrapper for the TCP established output helper hook.
  */
 void
 hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
     struct tcpopt *to, uint32_t len, int tso)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 		hhook_data.len = len;
 		hhook_data.tso = tso;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
 		    tp->osd);
 	}
 }
 #endif
 
 /*
  * Congestion control wrapper: invoke the after_idle hook of the connection's
  * CC algorithm when the algorithm provides one.  The inpcb write lock must
  * be held by the caller.
  */
 static void inline
 cc_after_idle(struct tcpcb *tp)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* The hook is optional; algorithms may leave it unset. */
 	if (CC_ALGO(tp)->after_idle == NULL)
 		return;
 
 	CC_ALGO(tp)->after_idle(tp->ccv);
 }
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
 int
 tcp_output(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 	int32_t len;
 	uint32_t recwin, sendwin;
 	int off, flags, error = 0;	/* Keep compiler happy */
 	u_int if_hw_tsomaxsegcount = 0;
 	u_int if_hw_tsomaxsegsize = 0;
 	struct mbuf *m;
 	struct ip *ip = NULL;
 #ifdef TCPDEBUG
 	struct ipovly *ipov = NULL;
 #endif
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	unsigned ipsec_optlen = 0;
 #endif
 	int idle, sendalot, curticks;
 	int sack_rxmit, sack_bytes_rxmt;
 	struct sackhole *p;
 	int tso, mtu;
 	struct tcpopt to;
 	unsigned int wanted_cookie = 0;
 	unsigned int dont_sendalot = 0;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 #ifdef KERN_TLS
 	const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
 #else
 	const bool hw_tls = false;
 #endif
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 	/*
 	 * For TFO connections in SYN_SENT or SYN_RECEIVED,
 	 * only allow the initial SYN or SYN|ACK and those sent
 	 * by the retransmit timer.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    ((tp->t_state == TCPS_SYN_SENT) ||
 	     (tp->t_state == TCPS_SYN_RECEIVED)) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
 	    (tp->snd_nxt != tp->snd_una))       /* not a retransmit */
 		return (0);
 
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
 		cc_after_idle(tp);
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_LT(tp->snd_nxt, tp->snd_max))
 		tcp_sack_adjust(tp);
 	sendalot = 0;
 	tso = 0;
 	mtu = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
 	 * to send out new data (when sendalot is 1), bypass this function.
 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 	 * we're replacing a (future) new transmission with a retransmission
 	 * now, and we previously incremented snd_cwnd in tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	sack_rxmit = 0;
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
 	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		uint32_t cwin;
 
 		cwin =
 		    imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0);
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
 			 * (At least) part of sack hole extends beyond
 			 * snd_recover. Check to see if we can rexmit data
 			 * for this hole.
 			 */
 			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 				/*
 				 * Can't rexmit any more data for this hole.
 				 * That data will be rexmitted in the next
 				 * sack recovery episode, when snd_recover
 				 * moves past p->rxmit.
 				 */
 				p = NULL;
 				goto after_sack_rexmit;
 			} else
 				/* Can rexmit part of the current hole */
 				len = ((int32_t)ulmin(cwin,
 						   tp->snd_recover - p->rxmit));
 		} else
 			len = ((int32_t)ulmin(cwin, p->end - p->rxmit));
 		off = p->rxmit - tp->snd_una;
 		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 		    __func__, off));
 		if (len > 0) {
 			sack_rxmit = 1;
 			sendalot = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 		}
 	}
 after_sack_rexmit:
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then
 			 * clear the FIN bit.  Usually this would
 			 * happen below when it realizes that we
 			 * aren't sending all the data.  However,
 			 * if we have exactly 1 byte of unsent data,
 			 * then it won't clear the FIN bit below,
 			 * and if we are in persist state, we wind
 			 * up sending the packet without recording
 			 * that we sent the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit,
 			 * because if we don't have any more data
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
 			if (off < sbused(&so->so_snd))
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			tcp_timer_activate(tp, TT_PERSIST, 0);
 			tp->t_rxtshift = 0;
 		}
 	}
 
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 	 * a negative length.  This can also occur when TCP opens up
 	 * its congestion window while receiving additional duplicate
 	 * acks after fast-retransmit because TCP will reset snd_nxt
 	 * to snd_max after the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
 			len = ((int32_t)min(sbavail(&so->so_snd), sendwin) -
 			    off);
 		else {
 			int32_t cwin;
 
                         /*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
 			len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) -
 			    off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it
 			 * isn't really necessary), to work around a gcc
 			 * optimization issue - to force gcc to compute
 			 * len above. Without this check, the computation
 			 * of len is bungled by the optimizer.
 			 */
 			if (len > 0) {
 				cwin = tp->snd_cwnd -
 					(tp->snd_nxt - tp->snd_recover) -
 					sack_bytes_rxmt;
 				if (cwin < 0)
 					cwin = 0;
 				len = imin(len, cwin);
 			}
 		}
 	}
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
 	 * is SYN-SENT state and if segment contains data and if we don't
 	 * know that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		/*
 		 * When sending additional segments following a TFO SYN|ACK,
 		 * do not include the SYN bit.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->t_state == TCPS_SYN_RECEIVED))
 			flags &= ~TH_SYN;
 		off--, len++;
 	}
 
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 
 	/*
 	 * On TFO sockets, ensure no data is sent in the following cases:
 	 *
 	 *  - When retransmitting SYN|ACK on a passively-created socket
 	 *
 	 *  - When retransmitting SYN on an actively created socket
 	 *
 	 *  - When sending a zero-length cookie (cookie request) on an
 	 *    actively created socket
 	 *
 	 *  - When the socket is in the CLOSED state (RST is being sent)
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
 	     ((tp->t_state == TCPS_SYN_SENT) &&
 	      (tp->t_tfo_client_cookie_len == 0)) ||
 	     (flags & TH_RST)))
 		len = 0;
 	if (len <= 0) {
 		/*
 		 * If FIN has been sent but not acked,
 		 * but we haven't been called to retransmit,
 		 * len will be < 0.  Otherwise, window shrank
 		 * after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back
 		 * to (closed) window, and set the persist timer
 		 * if it isn't already going.  If the window didn't
 		 * close completely, just wait for an ACK.
 		 *
 		 * We also do a general check here to ensure that
 		 * we will set the persist timer when we have data
 		 * to send, but a 0-byte window. This makes sure
 		 * the persist timer is set even if the packet
 		 * hits one of the "goto send" lines below.
 		 */
 		len = 0;
 		if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 			(off < (int) sbavail(&so->so_snd))) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tp->t_rxtshift = 0;
 			tp->snd_nxt = tp->snd_una;
 			if (!tcp_timer_active(tp, TT_PERSIST))
 				tcp_setpersist(tp);
 		}
 	}
 
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	tcp_sndbuf_autoscale(tp, so, sendwin);
 
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
 	 * IP options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per generated
 	 * segment or packet.
 	 *
 	 * IPv4 handling has a clear separation of ip options and ip header
 	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
 	 * the right thing below to provide length of just ip options and thus
 	 * checking for ipoptlen is enough to decide if ip options are present.
 	 */
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
 #ifdef INET
 	else
 #endif
 #endif /* INET6 */
 #ifdef INET
 	if (IPSEC_ENABLED(ipv4))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
 #endif /* INET */
 #endif /* IPSEC */
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 	    ipoptlen == 0 && !(flags & TH_SYN))
 		tso = 1;
 
 	if (sack_rxmit) {
 		if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
 			flags &= ~TH_FIN;
 	} else {
 		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
 		    sbused(&so->so_snd)))
 			flags &= ~TH_FIN;
 	}
 
 	recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
 	    (long)TCP_MAXWIN << tp->rcv_scale);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 *	- We have a full segment (or more with TSO)
 	 *	- This is the last buffer in a write()/send() and we are
 	 *	  either idle or running NODELAY
 	 *	- we've timed out (e.g. persist timer)
 	 *	- we have more than 1/2 the maximum send window's worth of
 	 *	  data (receiver may have limited the window size)
 	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause
 		 * us to flush a buffer queued with moretocome.  XXX
 		 *
 		 * note: the len + off check is almost certainly unnecessary.
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 		if (sack_rxmit)
 			goto send;
 	}
 
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read
 	 * from the receive buffer, no matter how small, causes a window
 	 * update to be sent.  We also should avoid sending a flurry of
 	 * window updates when the socket buffer had queued a lot of data
 	 * and the application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending
 	 * an update when we can increase the advertized window by more
 	 * than 1/4th of the socket buffer capacity.  When the buffer is
 	 * getting full or is very small be more aggressive and send an
 	 * update whenever we can increase by two mss sized segments.
 	 * In all other situations the ACK's to new incoming data will
 	 * carry further window increases.
 	 *
 	 * Don't send an independent window update if a delayed
 	 * ACK is pending (it will get piggy-backed on it) or the
 	 * remote side already has done a half-close and won't send
 	 * more data.  Skip this if the connection is in T/TCP
 	 * half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window,
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
 		int32_t adv;
 		int oldwin;
 
 		adv = recwin;
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/*
 		 * If the new window size ends up being the same as or less
 		 * than the old size when it is scaled, then don't force
 		 * a window update.
 		 */
 		if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (int32_t)(2 * tp->t_maxseg) &&
 		    (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
 		     recwin <= (so->so_rcv.sb_hiwat / 8) ||
 		     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg ||
 		     adv >= TCP_MAXWIN << tp->rcv_scale))
 			goto send;
 		if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat)
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW)
 		goto send;
 	if ((flags & TH_RST) ||
 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 		goto send;
 	if (SEQ_GT(tp->snd_up, tp->snd_una))
 		goto send;
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 	/*
 	 * In SACK, it is possible for tcp_output to fail to send a segment
 	 * after the retransmission timer has been turned off.  Make sure
 	 * that the retransmission timer is set.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		goto just_return;
 	}
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
 	 * updates.  The three ``states'' for the output side are:
 	 *	idle			not doing retransmits or persists
 	 *	persisting		to move a small or zero window
 	 *	(re)transmitting	and thereby not persisting
 	 *
 	 * tcp_timer_active(tp, TT_PERSIST)
 	 *	is true when we are in persist state.
 	 * (tp->t_flags & TF_FORCEDATA)
 	 *	is set when we are called to send a persist packet.
 	 * tcp_timer_active(tp, TT_REXMT)
 	 *	is set when we are retransmitting
 	 * The output side is idle when both timers are zero.
 	 *
 	 * If send window is too small, there is data to transmit, and no
 	 * retransmit or persist is pending, then go to persist state.
 	 * If nothing happens soon, send when timer expires:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
 	if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
 	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	if (len > 0) {
 		if (len >= tp->t_maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options
 	 * unless TCP set not to do any options.
 	 * NOTE: we assume that the IP/TCP header plus TCP options
 	 * always fit in a single mbuf, leaving room for a maximum
 	 * link header, i.e.
 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof (struct tcpiphdr);
 
 	/*
 	 * Compute options for segment.
 	 * We only have to care about SYN and established connection
 	 * segments.  Options for SYN-ACK segments are handled in TCP
 	 * syncache.
 	 */
 	to.to_flags = 0;
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
 			to.to_flags |= TOF_MSS;
 
 			/*
 			 * On SYN or SYN|ACK transmits on TFO connections,
 			 * only include the TFO option if it is not a
 			 * retransmit, as the presence of the TFO option may
 			 * have caused the original SYN or SYN|ACK to have
 			 * been dropped by a middlebox.
 			 */
 			if (IS_FASTOPEN(tp->t_flags) &&
 			    (tp->t_rxtshift == 0)) {
 				if (tp->t_state == TCPS_SYN_RECEIVED) {
 					to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
 					to.to_tfo_cookie =
 					    (u_int8_t *)&tp->t_tfo_cookie.server;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 				} else if (tp->t_state == TCPS_SYN_SENT) {
 					to.to_tfo_len =
 					    tp->t_tfo_client_cookie_len;
 					to.to_tfo_cookie =
 					    tp->t_tfo_cookie.client;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 					/*
 					 * If we wind up having more data to
 					 * send with the SYN than can fit in
 					 * one segment, don't send any more
 					 * until the SYN|ACK comes back from
 					 * the other end.
 					 */
 					dont_sendalot = 1;
 				}
 			}
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			curticks = tcp_ts_getticks();
 			to.to_tsval = curticks + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			if (tp->t_rxtshift == 1)
 				tp->t_badrxtwin = curticks;
 		}
 
 		/* Set receive buffer autosizing timestamp. */
 		if (tp->rfbuf_ts == 0 &&
 		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 			tp->rfbuf_ts = tcp_ts_getticks();
 
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
 			    tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		/*
 		 * Check that TCP_MD5SIG is enabled in tcpcb to
 		 * account the size needed to set this TCP option.
 		 */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 		/*
 		 * If we wanted a TFO option to be added, but it was unable
 		 * to fit, ensure no data is sent.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
 
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxseg length.
 	 * Clear the FIN bit because we cut off the tail of
 	 * the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxseg) {
 		flags &= ~TH_FIN;
 
 		if (tso) {
 			u_int if_hw_tsomax;
 			u_int moff;
 			int max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 
 			/*
 			 * Limit a TSO burst to prevent it from
 			 * overflowing or exceeding the maximum length
 			 * allowed by the network interface:
 			 */
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen -
 				    max_linkhdr);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 
 			/*
 			 * Prevent the last segment from being
 			 * fractional unless the send sockbuf can be
 			 * emptied:
 			 */
 			max_len = (tp->t_maxseg - optlen);
 			if (((uint32_t)off + (uint32_t)len) <
 			    sbavail(&so->so_snd)) {
 				moff = len % max_len;
 				if (moff != 0) {
 					len -= moff;
 					sendalot = 1;
 				}
 			}
 
 			/*
 			 * In case there are too many small fragments
 			 * don't use TSO:
 			 */
 			if (len <= max_len) {
 				len = max_len;
 				sendalot = 1;
 				tso = 0;
 			}
 
 			/*
 			 * Send the FIN in a separate segment
 			 * after the bulk sending is done.
 			 * We don't trust the TSO implementations
 			 * to clear the FIN flag on all but the
 			 * last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 		} else {
 			if (optlen + ipoptlen >= tp->t_maxseg) {
 				/*
 				 * Since we don't have enough space to put
 				 * the IP header chain and the TCP header in
 				 * one packet as required by RFC 7112, don't
 				 * send it. Also ensure that at least one
 				 * byte of the payload can be put into the
 				 * TCP segment.
 				 */
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EMSGSIZE;
 				sack_rxmit = 0;
 				goto out;
 			}
 			len = tp->t_maxseg - optlen - ipoptlen;
 			sendalot = 1;
 			if (dont_sendalot)
 				sendalot = 0;
 		}
 	} else
 		tso = 0;
 
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 
 /*#ifdef DIAGNOSTIC*/
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 /*#endif*/
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Grab a header mbuf, attaching a copy of data to
 	 * be transmitted, and initialize the header from
 	 * the template for sends on this connection.
 	 */
 	if (len) {
 		struct mbuf *mb;
 		struct sockbuf *msb;
 		u_int moff;
 
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
 			TCPSTAT_INC(tcps_sndprobe);
 #ifdef STATS
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				stats_voi_update_abs_u32(tp->t_stats,
 				VOI_TCP_RETXPB, len);
 			else
 				stats_voi_update_abs_u64(tp->t_stats,
 				    VOI_TCP_TXPB, len);
 #endif /* STATS */
 		} else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			tp->t_sndrexmitpack++;
 			TCPSTAT_INC(tcps_sndrexmitpack);
 			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 #ifdef STATS
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
 			    len);
 #endif /* STATS */
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 #ifdef STATS
 			stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
 			    len);
 #endif /* STATS */
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf
 		 * to the offset in the socket buffer chain.
 		 */
 		mb = sbsndptr_noadv(&so->so_snd, off, &moff);
 		if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
 			m_copydata(mb, moff, len,
 			    mtod(m, caddr_t) + hdrlen);
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				sbsndptr_adv(&so->so_snd, mb, len);
 			m->m_len += len;
 		} else {
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				msb = NULL;
 			else
 				msb = &so->so_snd;
 			m->m_next = tcp_m_copym(mb, moff,
 			    &len, if_hw_tsomaxsegcount,
 			    if_hw_tsomaxsegsize, msb, hw_tls);
 			if (len <= (tp->t_maxseg - optlen)) {
 				/*
 				 * Must have ran out of mbufs for the copy
 				 * shorten it to no longer need tso. Lets
 				 * not put on sendalot since we are low on
 				 * mbufs.
 				 */
 				tso = 0;
 			}
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
 		if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) &&
 		    !(flags & TH_SYN))
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			M_ALIGN(m, hdrlen);
 		} else
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(tp->t_inpcb, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip6, th);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip, th);
 	}
 
 	/*
 	 * Fill in fields, remembering maximum advertised
 	 * window for use in delaying messages about window sizes.
 	 * If resending a FIN, be sure not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup
 	 * SYN packet. If we are on a retransmit, we may
 	 * resend those bits a number of times as per
 	 * RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE|TH_CWR;
 		} else
 			flags |= TH_ECE|TH_CWR;
 	}
 
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags2 & TF2_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with
 		 * ECN capable transmission (ECT).
 		 * Ignore pure ack packets, retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    (sack_rxmit == 0) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 		}
 
 		/*
 		 * Reply with proper ECN notifications.
 		 */
 		if (tp->t_flags2 & TF2_ECN_SND_CWR) {
 			flags |= TH_CWR;
 			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
 		}
 		if (tp->t_flags2 & TF2_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 
 	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
 	 * retransmitted packet, we want the sequence number
 	 * of the next unsent octet.  So, if there is no data
 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
 	 * when filling in ti_seq.  But if we are in persist
 	 * state, snd_max might reflect one byte beyond the
 	 * right edge of the window, so use snd_nxt in that
 	 * case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN|TH_FIN)) ||
 		    tcp_timer_active(tp, TT_PERSIST))
 			th->th_seq = htonl(tp->snd_nxt);
 		else
 			th->th_seq = htonl(tp->snd_max);
 	} else {
 		th->th_seq = htonl(p->rxmit);
 		p->rxmit += len;
 		tp->sackhint.sack_bytes_rexmit += len;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window,
 	 * but avoid silly window syndrome.
 	 * If a RST segment is sent, advertise a window of zero.
 	 */
 	if (flags & TH_RST) {
 		recwin = 0;
 	} else {
 		if (recwin < (so->so_rcv.sb_hiwat / 4) &&
 		    recwin < tp->t_maxseg)
 			recwin = 0;
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 		    recwin < (tp->rcv_adv - tp->rcv_nxt))
 			recwin = (tp->rcv_adv - tp->rcv_nxt);
 	}
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
 	 * case is handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 				(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else {
 		/* Avoid shrinking window with window scaling. */
 		recwin = roundup2(recwin, 1 << tp->rcv_scale);
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 	}
 
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
 	 * a 0 window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is attempting
 	 * to read more data than can be buffered prior to transmitting on
 	 * the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull
 		 * the urgent pointer to the left edge of the send window
 		 * so that it doesn't drift into the send window on sequence
 		 * number wraparound.
 		 */
 		tp->snd_up = tp->snd_una;		/* drag it along */
 
 	/*
 	 * Put TCP length in extended header, and then
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		/*
 		 * Calculate MD5 signature and put it into the place
 		 * determined before.
 		 * NOTE: since TCP options buffer doesn't point into
 		 * mbuf's data, calculate offset and use it.
 		 */
 		if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th,
 		    (u_char *)(th + 1) + (to.to_signature - opt))) != 0) {
 			/*
 			 * Do not send segment if the calculation of MD5
 			 * digest has failed.
 			 */
 			m_freem(m);
 			goto out;
 		}
 	}
 #endif
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * There is no need to fill in ip6_plen right now.
 		 * It will be filled later by ip6_output.
 		 */
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 		    optlen + len, IPPROTO_TCP, 0);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments.
 	 * The TCP pseudo header checksum is always provided.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 
 	KASSERT(len + hdrlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %d + %u != %u",
 	    __func__, len, hdrlen, m_length(m, NULL)));
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 #endif
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 		ipov->ih_len = save;
 	}
 #endif /* TCPDEBUG */
 	TCP_PROBE3(debug__output, tp, th, m);
 
 	/* We're getting ready to send; log now. */
 	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
 	    len, NULL, false);
 
 	/*
 	 * Fill in IP length and desired time to live and
 	 * send to IP level.  There should be a better way
 	 * to handle ttl and tos; we could keep them in
 	 * the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before checksum calculation,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt.
 		 * Also, desired default hop limit might be changed via
 		 * Neighbor Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace probes.
 		 * ip6_output() will set it properly; it's supposed to include
 		 * the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 
 #ifdef TCPPCAP
 		/* Save packet, if requested. */
 		tcp_pcap_add(th, m, &(tp->t_outpkts));
 #endif
 
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
 		    &tp->t_inpcb->inp_route6,
 		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
 		    NULL, NULL, tp->t_inpcb);
 
 		if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_nh != NULL)
 			mtu = tp->t_inpcb->inp_route6.ro_nh->nh_mtu;
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
     {
 	ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 	if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
 		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
 #endif /* INET6 */
 	/*
 	 * If we do path MTU discovery, then we set DF on every packet.
 	 * This might not be the best thing to do according to RFC3390
 	 * Section 2. However the tcp hostcache migitates the problem
 	 * so it affects only the first tcp connection with a host.
 	 *
 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
 	 */
 	if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 		ip->ip_off |= htons(IP_DF);
 		tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 	} else {
 		tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
 
 	if (tp->t_state == TCPS_SYN_SENT)
 		TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 	TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 #ifdef TCPPCAP
 	/* Save packet, if requested. */
 	tcp_pcap_add(th, m, &(tp->t_outpkts));
 #endif
 
 	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
 	if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_nh != NULL)
 		mtu = tp->t_inpcb->inp_route.ro_nh->nh_mtu;
     }
 #endif /* INET */
 
 out:
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
 	 */
 	if ((tp->t_flags & TF_FORCEDATA) == 0 ||
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_seq startseq = tp->snd_nxt;
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (flags & (TH_SYN|TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit)
 			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				TCPSTAT_INC(tcps_segstimed);
 			}
 #ifdef STATS
 			if (!(tp->t_flags & TF_GPUTINPROG) && len) {
 				tp->t_flags |= TF_GPUTINPROG;
 				tp->gput_seq = startseq;
 				tp->gput_ack = startseq +
 				    ulmin(sbavail(&so->so_snd) - off, sendwin);
 				tp->gput_ts = tcp_ts_getticks();
 			}
 #endif /* STATS */
 		}
 
 		/*
 		 * Set retransmit timer if not currently set,
 		 * and not doing a pure ack or a keep-alive probe.
 		 * Initial value for retransmit timer is smoothed
 		 * round-trip time + 2 * round-trip time variance.
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
 timer:
 		if (!tcp_timer_active(tp, TT_REXMT) &&
 		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 		     (tp->snd_nxt != tp->snd_una))) {
 			if (tcp_timer_active(tp, TT_PERSIST)) {
 				tcp_timer_activate(tp, TT_PERSIST, 0);
 				tp->t_rxtshift = 0;
 			}
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		} else if (len == 0 && sbavail(&so->so_snd) &&
 		    !tcp_timer_active(tp, TT_REXMT) &&
 		    !tcp_timer_active(tp, TT_PERSIST)) {
 			/*
 			 * Avoid a situation where we do not set persist timer
 			 * after a zero window condition. For example:
 			 * 1) A -> B: packet with enough data to fill the window
 			 * 2) B -> A: ACK for #1 + new data (0 window
 			 *    advertisement)
 			 * 3) A -> B: ACK for #2, 0 len packet
 			 *
 			 * In this case, A will not activate the persist timer,
 			 * because it chose to send a packet. Unless tcp_output
 			 * is called for some other reason (delayed ack timer,
 			 * another input packet from B, socket syscall), A will
 			 * not send zero window probes.
 			 *
 			 * So, if you send a 0-length packet, but there is data
 			 * in the socket buffer, and neither the rexmt or
 			 * persist timer is already set, then activate the
 			 * persist timer.
 			 */
 			tp->t_rxtshift = 0;
 			tcp_setpersist(tp);
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in
 		 * persist mode (no window) we do not update snd_nxt.
 		 */
 		int xlen = len;
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + xlen;
 	}
 	if ((error == 0) &&
 	    (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	     (tp->t_flags & TF_SACK_PERMIT) &&
 	     tp->rcv_numsacks > 0)) {
 		    /* Clean up any DSACK's sent */
 		    tcp_clean_dsack_blocks(tp);
 	}
 	if (error) {
 		/* Record the error. */
 		TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT,
 		    error, 0, NULL, false);
 
 		/*
 		 * We know that the packet was lost, so back out the
 		 * sequence number advance, if any.
 		 *
 		 * If the error is EPERM the packet got blocked by the
 		 * local firewall.  Normally we should terminate the
 		 * connection but the blocking may have been spurious
 		 * due to a firewall reconfiguration cycle.  So we treat
 		 * it like a packet loss and let the retransmit timer and
 		 * timeouts do their work over time.
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !tcp_timer_active(tp, TT_PERSIST)) &&
 		    ((flags & TH_SYN) == 0) &&
 		    (error != EPERM)) {
 			if (sack_rxmit) {
 				p->rxmit -= len;
 				tp->sackhint.sack_bytes_rexmit -= len;
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 				    ("sackhint bytes rtx >= 0"));
 			} else
 				tp->snd_nxt -= len;
 		}
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
 		switch (error) {
 		case EACCES:
 		case EPERM:
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 			TCP_XMIT_TIMER_ASSERT(tp, len, flags);
 			tp->snd_cwnd = tp->t_maxseg;
 			return (0);
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU.
 			 * If TSO was active we either got an interface
 			 * without TSO capabilits or TSO was turned off.
 			 * If we obtained mtu from ip_output() then update
 			 * it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			return (error);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 				return (0);
 			}
 			/* FALLTHROUGH */
 		default:
 			return (error);
 		}
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell).
 	 * If this advertises a larger window than any other segment,
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
 	if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	if (tcp_timer_active(tp, TT_DELACK))
 		tcp_timer_activate(tp, TT_DELACK, 0);
 #if 0
 	/*
 	 * This completely breaks TCP if newreno is turned on.  What happens
 	 * is that if delayed-acks are turned on on the receiver, this code
 	 * on the transmitter effectively destroys the TCP window, forcing
 	 * it to four packets (1.5Kx4 = 6K window).
 	 */
 	if (sendalot && --maxburst)
 		goto again;
 #endif
 	if (sendalot)
 		goto again;
 	return (0);
 }
 
 void
 tcp_setpersist(struct tcpcb *tp)
 {
 	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 	int tt;
 
 	tp->t_flags &= ~TF_PREVVALID;
 	if (tcp_timer_active(tp, TT_REXMT))
 		panic("tcp_setpersist: retransmit pending");
 	/*
 	 * Start/restart persistence timer.
 	 */
 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
 		      tcp_persmin, tcp_persmax);
 	tcp_timer_activate(tp, TT_PERSIST, tt);
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 }
 
 /*
  * Insert TCP options according to the supplied parameters to the place
  * optp in a consistent way.  Can handle unaligned destinations.
  *
  * The order of the option processing is crucial for optimal packing and
  * alignment for the scarce option space.
  *
  * The optimal order for a SYN/SYN-ACK segment is:
  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
  *
  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
  * At minimum we need 10 bytes (to generate 1 SACK block).  If both
  * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
  * we only have 10 bytes for SACK options (40 - (12 + 18)).
  */
 int
 tcp_addoptions(struct tcpopt *to, u_char *optp)
 {
 	u_int32_t mask, optlen = 0;
 
 	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
 		if ((to->to_flags & mask) != mask)
 			continue;
 		if (optlen == TCP_MAXOLEN)
 			break;
 		switch (to->to_flags & mask) {
 		case TOF_MSS:
 			while (optlen % 4) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
 				continue;
 			optlen += TCPOLEN_MAXSEG;
 			*optp++ = TCPOPT_MAXSEG;
 			*optp++ = TCPOLEN_MAXSEG;
 			to->to_mss = htons(to->to_mss);
 			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
 			optp += sizeof(to->to_mss);
 			break;
 		case TOF_SCALE:
 			while (!optlen || optlen % 2 != 1) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
 				continue;
 			optlen += TCPOLEN_WINDOW;
 			*optp++ = TCPOPT_WINDOW;
 			*optp++ = TCPOLEN_WINDOW;
 			*optp++ = to->to_wscale;
 			break;
 		case TOF_SACKPERM:
 			while (optlen % 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
 				continue;
 			optlen += TCPOLEN_SACK_PERMITTED;
 			*optp++ = TCPOPT_SACK_PERMITTED;
 			*optp++ = TCPOLEN_SACK_PERMITTED;
 			break;
 		case TOF_TS:
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
 				continue;
 			optlen += TCPOLEN_TIMESTAMP;
 			*optp++ = TCPOPT_TIMESTAMP;
 			*optp++ = TCPOLEN_TIMESTAMP;
 			to->to_tsval = htonl(to->to_tsval);
 			to->to_tsecr = htonl(to->to_tsecr);
 			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
 			optp += sizeof(to->to_tsval);
 			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
 			optp += sizeof(to->to_tsecr);
 			break;
 		case TOF_SIGNATURE:
 			{
 			int siglen = TCPOLEN_SIGNATURE - 2;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) {
 				to->to_flags &= ~TOF_SIGNATURE;
 				continue;
 			}
 			optlen += TCPOLEN_SIGNATURE;
 			*optp++ = TCPOPT_SIGNATURE;
 			*optp++ = TCPOLEN_SIGNATURE;
 			to->to_signature = optp;
 			while (siglen--)
 				 *optp++ = 0;
 			break;
 			}
 		case TOF_SACK:
 			{
 			int sackblks = 0;
 			struct sackblk *sack = (struct sackblk *)to->to_sacks;
 			tcp_seq sack_seq;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
 			sackblks = min(to->to_nsacks,
 					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
 			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
 			while (sackblks--) {
 				sack_seq = htonl(sack->start);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				sack_seq = htonl(sack->end);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				optlen += TCPOLEN_SACK;
 				sack++;
 			}
 			TCPSTAT_INC(tcps_sack_send_blocks);
 			break;
 			}
 		case TOF_FASTOPEN:
 			{
 			int total_len;
 
 			/* XXX is there any point to aligning this option? */
 			total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
 			if (TCP_MAXOLEN - optlen < total_len) {
 				to->to_flags &= ~TOF_FASTOPEN;
 				continue;
 			}
 			*optp++ = TCPOPT_FAST_OPEN;
 			*optp++ = total_len;
 			if (to->to_tfo_len > 0) {
 				bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
 				optp += to->to_tfo_len;
 			}
 			optlen += total_len;
 			break;
 			}
 		default:
 			panic("%s: unknown TCP option type", __func__);
 			break;
 		}
 	}
 
 	/* Terminate and pad TCP options to a 4 byte boundary. */
 	if (optlen % 4) {
 		optlen += TCPOLEN_EOL;
 		*optp++ = TCPOPT_EOL;
 	}
 	/*
 	 * According to RFC 793 (STD0007):
 	 *   "The content of the header beyond the End-of-Option option
 	 *    must be header padding (i.e., zero)."
 	 *   and later: "The padding is composed of zeros."
 	 */
 	while (optlen % 4) {
 		optlen += TCPOLEN_PAD;
 		*optp++ = TCPOPT_PAD;
 	}
 
 	KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
 	return (optlen);
 }
 
 /*
  * This is a copy of m_copym(), taking the TSO segment size/limit
  * constraints into account, and advancing the sndptr as it goes.
  */
 struct mbuf *
 tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
     int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls, *ntls;
 	struct mbuf *start;
 #endif
 	struct mbuf *n, **np;
 	struct mbuf *top;
 	int32_t off = off0;
 	int32_t len = *plen;
 	int32_t fragsize;
 	int32_t len_cp = 0;
 	int32_t *pkthdrlen;
 	uint32_t mlen, frags;
 	bool copyhdr;
 
 
 	KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len));
 	if (off == 0 && m->m_flags & M_PKTHDR)
 		copyhdr = true;
 	else
 		copyhdr = false;
 	while (off > 0) {
 		KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		if ((sb) && (m == sb->sb_sndptr)) {
 			sb->sb_sndptroff += m->m_len;
 			sb->sb_sndptr = m->m_next;
 		}
 		m = m->m_next;
 	}
 	np = &top;
 	top = NULL;
 	pkthdrlen = NULL;
 #ifdef KERN_TLS
 	if (hw_tls && (m->m_flags & M_NOMAP))
-		tls = m->m_ext_pgs.tls;
+		tls = m->m_epg_tls;
 	else
 		tls = NULL;
 	start = m;
 #endif
 	while (len > 0) {
 		if (m == NULL) {
 			KASSERT(len == M_COPYALL,
 			    ("tcp_m_copym, length > size of mbuf chain"));
 			*plen = len_cp;
 			if (pkthdrlen != NULL)
 				*pkthdrlen = len_cp;
 			break;
 		}
 #ifdef KERN_TLS
 		if (hw_tls) {
 			if (m->m_flags & M_NOMAP)
-				ntls = m->m_ext_pgs.tls;
+				ntls = m->m_epg_tls;
 			else
 				ntls = NULL;
 
 			/*
 			 * Avoid mixing TLS records with handshake
 			 * data or TLS records from different
 			 * sessions.
 			 */
 			if (tls != ntls) {
 				MPASS(m != start);
 				*plen = len_cp;
 				if (pkthdrlen != NULL)
 					*pkthdrlen = len_cp;
 				break;
 			}
 
 			/*
 			 * Don't end a send in the middle of a TLS
 			 * record if it spans multiple TLS records.
 			 */
 			if (tls != NULL && (m != start) && len < m->m_len) {
 				*plen = len_cp;
 				if (pkthdrlen != NULL)
 					*pkthdrlen = len_cp;
 				break;
 			}
 		}
 #endif
 		mlen = min(len, m->m_len - off);
 		if (seglimit) {
 			/*
 			 * For M_NOMAP mbufs, add 3 segments
 			 * + 1 in case we are crossing page boundaries
 			 * + 2 in case the TLS hdr/trailer are used
 			 * It is cheaper to just add the segments
 			 * than it is to take the cache miss to look
 			 * at the mbuf ext_pgs state in detail.
 			 */
 			if (m->m_flags & M_NOMAP) {
 				fragsize = min(segsize, PAGE_SIZE);
 				frags = 3;
 			} else {
 				fragsize = segsize;
 				frags = 0;
 			}
 
 			/* Break if we really can't fit anymore. */
 			if ((frags + 1) >= seglimit) {
 				*plen =	len_cp;
 				if (pkthdrlen != NULL)
 					*pkthdrlen = len_cp;
 				break;
 			}
 
 			/*
 			 * Reduce size if you can't copy the whole
 			 * mbuf. If we can't copy the whole mbuf, also
 			 * adjust len so the loop will end after this
 			 * mbuf.
 			 */
 			if ((frags + howmany(mlen, fragsize)) >= seglimit) {
 				mlen = (seglimit - frags - 1) * fragsize;
 				len = mlen;
 				*plen = len_cp + len;
 				if (pkthdrlen != NULL)
 					*pkthdrlen = *plen;
 			}
 			frags += howmany(mlen, fragsize);
 			if (frags == 0)
 				frags++;
 			seglimit -= frags;
 			KASSERT(seglimit > 0,
 			    ("%s: seglimit went too low", __func__));
 		}
 		if (copyhdr)
 			n = m_gethdr(M_NOWAIT, m->m_type);
 		else
 			n = m_get(M_NOWAIT, m->m_type);
 		*np = n;
 		if (n == NULL)
 			goto nospace;
 		if (copyhdr) {
 			if (!m_dup_pkthdr(n, m, M_NOWAIT))
 				goto nospace;
 			if (len == M_COPYALL)
 				n->m_pkthdr.len -= off0;
 			else
 				n->m_pkthdr.len = len;
 			pkthdrlen = &n->m_pkthdr.len;
 			copyhdr = false;
 		}
 		n->m_len = mlen;
 		len_cp += n->m_len;
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data + off;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
 
 		if (sb && (sb->sb_sndptr == m) &&
 		    ((n->m_len + off) >= m->m_len) && m->m_next) {
 			sb->sb_sndptroff += m->m_len;
 			sb->sb_sndptr = m->m_next;
 		}
 		off = 0;
 		if (len != M_COPYALL) {
 			len -= n->m_len;
 		}
 		m = m->m_next;
 		np = &n->m_next;
 	}
 	return (top);
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 void
 tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
 {
 
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and cogestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * The remote host receive window scaling factor may limit the
 	 * growing of the send buffer before it reaches its allowed
 	 * maximum.
 	 *
 	 * It scales directly with slow start or congestion window
 	 * and does at most one step per received ACK.  This fast
 	 * scaling has the drawback of growing the send buffer beyond
 	 * what is strictly necessary to make full use of a given
 	 * delay*bandwidth product.  However testing has shown this not
 	 * to be much of an problem.  At worst we are trading wasting
 	 * of available bandwidth (the non-use of it) for wasting some
 	 * socket buffer memory.
 	 *
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
 	 *
 	 * XXXGL: should there be used sbused() or sbavail()?
 	 */
 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		int lowat;
 
 		lowat = V_tcp_sendbuf_auto_lowat ? so->so_snd.sb_lowat : 0;
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat - lowat &&
 		    sbused(&so->so_snd) >=
 		    (so->so_snd.sb_hiwat / 8 * 7) - lowat &&
 		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
 		    sendwin >= (sbused(&so->so_snd) -
 		    (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
 			     V_tcp_autosndbuf_max), so, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 }
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c	(revision 360578)
+++ head/sys/netinet6/ip6_output.c	(revision 360579)
@@ -1,3328 +1,3328 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <machine/in_cksum.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #include <netipsec/ipsec_support.h>
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/scope6_var.h>
 
 extern int in6_mcast_loop;
 
 /*
  * Mbufs for the IPv6 header and for each extension header that
  * ip6_output() may insert in front of the payload.  ip6_output() zeroes
  * the structure before use and tests members against NULL, so a NULL
  * member means that header is not present.
  */
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;		/* IPv6 header; read back after ip6_splithdr() */
 	struct mbuf *ip6e_hbh;		/* hop-by-hop options, built from ip6po_hbh */
 	struct mbuf *ip6e_dest1;	/* destination options (1st part), from ip6po_dest1 */
 	struct mbuf *ip6e_rthdr;	/* routing header, built from ip6po_rthdr */
 	struct mbuf *ip6e_dest2;	/* destination options (2nd part), from ip6po_dest2 */
 };
 
 static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
 
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *);
 static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *);
 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int);
 
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **);
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
 	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
 	u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
 	u_long *, int *, u_int);
 static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 
 /*
  * Make an extension header mbuf from option data.
  *
  * hp  - source option data, interpreted as a struct ip6_ext; nothing is
  *       done when it is NULL.
  * mp  - address of the mbuf pointer that receives the copy made by
  *       ip6_copyexthdr().
  * _ol - running option length; the m_len of the new mbuf is added to it.
  *
  * The number of bytes copied is (ip6e_len + 1) << 3, i.e. the header's
  * length field counts 8-byte units beyond the first one.
  *
  * The macro is not self-contained: it assigns to a variable named "error"
  * and, when ip6_copyexthdr() fails, jumps to a label named "freehdrs".
  * Both must exist in the function that uses it.
  */
 #define	MAKE_EXTHDR(hp, mp, _ol)					\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 		(_ol) += (*(mp))->m_len;				\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers by linking one header mbuf into the
  * packet and threading the "next header" values through it.
  *
  * m  - the extension header mbuf to insert; nothing is done when NULL.
  * mp - the previous mbuf in the chain; m is inserted right after it and
  *      mp is then advanced to m.
  * p  - pointer to the next-header byte of the preceding header.  Its old
  *      value is saved into the first byte of m's data, *p is overwritten
  *      with i, and p is then moved to point at that first byte of m.
  * i  - the protocol number identifying the header held in m.
  *
  * mp and p are assigned to, so they must be modifiable lvalues.  The macro
  * also reads "hdrsplit" and "exthdrs" from the enclosing function and
  * panics if a header is present while hdrsplit is false.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("%s:%d: assumption failed: "\
 			    "hdr not split: hdrsplit %d exthdrs %p",\
 			    __func__, __LINE__, hdrsplit, &exthdrs);\
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Finish a checksum that was deferred for software computation.
  *
  * The sum is obtained from in_cksum_skip(m, offset + plen, offset) and is
  * written into the packet at offset + m_pkthdr.csum_data.  When
  * CSUM_UDP_IPV6 is set in the packet's csum_flags, a computed value of 0
  * is stored as 0xffff instead.
  *
  * m      - packet to checksum and update in place.
  * plen   - number of bytes, starting at offset, covered by the checksum.
  * offset - byte offset in m at which the checksummed data begins.
  */
 void
 in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
 {
 	u_short sum;
 	u_short where;

 	sum = in_cksum_skip(m, offset + plen, offset);
 	if ((m->m_pkthdr.csum_flags & CSUM_UDP_IPV6) != 0 && sum == 0)
 		sum = 0xffff;

 	/* Location of the checksum field within the packet. */
 	where = offset + m->m_pkthdr.csum_data;

 	/*
 	 * Store directly when the whole field lies inside the first mbuf;
 	 * otherwise let m_copyback() place it.
 	 */
 	if (where + sizeof(sum) <= m->m_len)
 		*(u_short *)mtodo(m, where) = sum;
 	else
 		m_copyback(m, where, sizeof(sum), (caddr_t)&sum);
 }
 
 /*
  * Perform in software the checksum work requested in csum_flags.
  *
  * Work is needed when CSUM_DELAY_DATA_IPV6 is set, when CSUM_SCTP_IPV6 is
  * set (SCTP kernels only), or when the packet is not a fragment and ifp
  * does not have IFCAP_NOMAP enabled.  In those cases the chain is first
  * passed through mb_unmapped_to_ext(), then the requested checksums are
  * computed and the corresponding bits are cleared from m's csum_flags.
  *
  * m          - packet to process.
  * ifp        - outgoing interface (capabilities and statistics).
  * csum_flags - checksum work to do.
  * plen       - payload length; must not be smaller than optlen.
  * optlen     - extension header length, skipped before the checksum area.
  * frag       - true when m is a fragment; selects the failure counter.
  *
  * Returns 0 on success or ENOBUFS when mb_unmapped_to_ext() returns NULL
  * (counted as ifs6_out_fragfail for fragments, ip6s_odropped otherwise).
  *
  * NOTE(review): m is a by-value parameter, so the pointer returned by
  * mb_unmapped_to_ext() never reaches the caller -- confirm that callers
  * do not touch their own pointer after an ENOBUFS return.
  */
 static int
 ip6_output_delayed_csum(struct mbuf *m, struct ifnet *ifp, int csum_flags,
     int plen, int optlen, bool frag)
 {
 	size_t l4off;
 	bool need_work;

 	KASSERT((plen >= optlen), ("%s:%d: plen %d < optlen %d, m %p, ifp %p "
 	    "csum_flags %#x frag %d\n",
 	    __func__, __LINE__, plen, optlen, m, ifp, csum_flags, frag));

 	need_work = (csum_flags & CSUM_DELAY_DATA_IPV6) != 0;
 #ifdef SCTP
 	need_work = need_work || (csum_flags & CSUM_SCTP_IPV6) != 0;
 #endif
 	need_work = need_work ||
 	    (!frag && (ifp->if_capenable & IFCAP_NOMAP) == 0);
 	if (!need_work)
 		return (0);

 	m = mb_unmapped_to_ext(m);
 	if (m == NULL) {
 		if (!frag) {
 			IP6STAT_INC(ip6s_odropped);
 		} else {
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		}
 		return (ENOBUFS);
 	}

 	/* Offset of the upper-layer header: IPv6 header plus options. */
 	l4off = sizeof(struct ip6_hdr) + optlen;

 	if ((csum_flags & CSUM_DELAY_DATA_IPV6) != 0) {
 		in6_delayed_cksum(m, plen - optlen, l4off);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 #ifdef SCTP
 	if ((csum_flags & CSUM_SCTP_IPV6) != 0) {
 		sctp_delayed_cksum(m, l4off);
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 	}
 #endif

 	return (0);
 }
 
 /*
  * Split the packet m0 into fragments and append them to m0's m_nextpkt
  * list.
  *
  * ifp       - interface whose ifs6_out_fragcreat counter is bumped for
  *             every fragment created.
  * m0        - original packet; its first hlen bytes are treated as the
  *             part that precedes the fragmentable data.
  * hlen      - offset in m0 at which the data to be fragmented starts.
  * nextproto - value stored in each fragment header's ip6f_nxt.
  * fraglen   - payload bytes per fragment; must be a multiple of 8.  The
  *             final fragment carries whatever remains.
  * id        - value stored as-is in each fragment header's ip6f_ident.
  *
  * Returns 0 on success, ENOBUFS when an mbuf operation fails, or the error
  * from ip6_insertfraghdr().  On failure, fragments that were already
  * linked stay on m0's m_nextpkt list; this function does not free them.
  */
 int
 ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
     int fraglen , uint32_t id)
 {
 	struct mbuf *m, **mnext, *m_frgpart;
 	struct ip6_hdr *ip6, *mhip6;
 	struct ip6_frag *ip6f;
 	int off;
 	int error;
 	int tlen = m0->m_pkthdr.len;
 
 	KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8"));
 
 	m = m0;
 	ip6 = mtod(m, struct ip6_hdr *);
 	/* New fragments are chained starting at m0->m_nextpkt. */
 	mnext = &m->m_nextpkt;
 
 	for (off = hlen; off < tlen; off += fraglen) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (!m) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			/* Not linked yet, so it has to be released here. */
 			m_free(m);
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/*
 		 * Link the fragment onto the list.  The error returns
 		 * below leave it there instead of freeing it.
 		 */
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 		/* Leave max_linkhdr bytes of headroom in front of the header. */
 		m->m_data += max_linkhdr;
 		mhip6 = mtod(m, struct ip6_hdr *);
 		*mhip6 = *ip6;
 		m->m_len = sizeof(*mhip6);
 		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 		if (error) {
 			IP6STAT_INC(ip6s_odropped);
 			return (error);
 		}
 		/* Offset is relative to the start of the fragmentable data. */
 		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 		/* Last fragment: trim to the remainder, no "more" flag. */
 		if (off + fraglen >= tlen)
 			fraglen = tlen - off;
 		else
 			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 		/* Payload length counts everything after the IPv6 header. */
 		mhip6->ip6_plen = htons((u_short)(fraglen + hlen +
 		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 		if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 		m_cat(m, m_frgpart);
 		m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f);
 		ip6f->ip6f_reserved = 0;
 		ip6f->ip6f_ident = id;
 		ip6f->ip6f_nxt = nextproto;
 		IP6STAT_INC(ip6s_ofragments);
 		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 	}
 
 	return (0);
 }
 
 /*
  * Hand one packet to nd6_output_ifp(), stamping a send tag on it first
  * when one applies.
  *
  * The send tag comes from the kernel TLS session attached to m->m_next
  * (KERN_TLS) or, failing that, from inp->inp_snd_tag (RATELIMIT).  A tag
  * is only used when it is bound to ifp.
  *
  * inp     - owning PCB, may be NULL.
  * ifp     - interface the packet is sent on.
  * origifp - passed through to nd6_output_ifp().
  * m       - packet to send; must not already carry CSUM_SND_TAG.  It is
  *           consumed on every path: either handed to nd6_output_ifp() or
  *           freed here on the early EAGAIN exits.
  * dst, ro - passed through to nd6_output_ifp().
  *
  * Returns the result of nd6_output_ifp().  EAGAIN is produced here when a
  * TLS session has no send tag or the tag belongs to another interface;
  * with a TLS session the final value is whatever ktls_output_eagain()
  * returns.
  */
 static int
 ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
     struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	struct m_snd_tag *mst;
 	int error;
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mst = NULL;
 
 #ifdef KERN_TLS
 	/*
 	 * If this is an unencrypted TLS record, save a reference to
 	 * the record.  This local reference is used to call
 	 * ktls_output_eagain after the mbuf has been freed (thus
 	 * dropping the mbuf's reference) in if_output.
 	 */
 	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
-		tls = ktls_hold(m->m_next->m_ext_pgs.tls);
+		tls = ktls_hold(m->m_next->m_epg_tls);
 		mst = tls->snd_tag;
 
 		/*
 		 * If a TLS session doesn't have a valid tag, it must
 		 * have had an earlier ifp mismatch, so drop this
 		 * packet.
 		 */
 		if (mst == NULL) {
 			/*
 			 * The packet never reaches nd6_output_ifp(), so
 			 * it has to be freed here or it would leak.
 			 */
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 	}
 #endif
 #ifdef RATELIMIT
 	if (inp != NULL && mst == NULL) {
 		/* Refresh the PCB's tag if the rate or interface changed. */
 		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
 		    (inp->inp_snd_tag != NULL &&
 		    inp->inp_snd_tag->ifp != ifp))
 			in_pcboutput_txrtlmt(inp, ifp, m);
 
 		if (inp->inp_snd_tag != NULL)
 			mst = inp->inp_snd_tag;
 	}
 #endif
 	if (mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		if (mst->ifp != ifp) {
 			/* Same as above: drop the packet, do not leak it. */
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 
 		/* stamp send tag on mbuf */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
 		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	}
 
 	error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro);
 
 done:
 	/* Check for route change invalidating send tags. */
 #ifdef KERN_TLS
 	if (tls != NULL) {
 		if (error == EAGAIN)
 			error = ktls_output_eagain(inp, tls);
 		/* Drop the reference taken by ktls_hold() above. */
 		ktls_free(tls);
 	}
 #endif
 #ifdef RATELIMIT
 	if (error == EAGAIN)
 		in_pcboutput_eagain(inp);
 #endif
 	return (error);
 }
 
 /*
  * IP6 output.
  * The packet in mbuf chain m contains a skeletal IP6 header (with pri, len,
  * nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route_in6 ro is present and has ro_nh initialized, route lookup would be
  * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
  * then result of route lookup is stored in ro->ro_nh.
  *
  * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu
  * is uint32_t.  So we use u_long to hold largest one, which is rt_mtu.
  *
  * ifpp - XXX: just for statistics
  */
 /*
  * XXX TODO: no flowid is assigned for outbound flows?
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	struct ip6_hdr *ip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev;
 	struct route_in6 *ro_pmtu;
 	struct nhop_object *nh;
 	struct sockaddr_in6 *dst, sin6, src_sa, dst_sa;
 	struct in6_addr odst;
 	u_char *nexthdrp;
 	int tlen, len;
 	int error = 0;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen, plen = 0, unfragpartlen;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr src0, dst0;
 	u_int32_t zone;
 	bool hdrsplit;
 	int sw_csum, tso;
 	int needfiblookup;
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
 
 	NET_EPOCH_ASSERT();
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			/* Unconditionally set flowid. */
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 * XXX: need scope argument.
 	 */
 	if (IPSEC_ENABLED(ipv6)) {
 		if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 #endif /* IPSEC */
 
 	/* Source address validation. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	/*
 	 * If we are given packet options to add extension headers prepare them.
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	bzero(&exthdrs, sizeof(exthdrs));
 	optlen = 0;
 	unfragpartlen = sizeof(struct ip6_hdr);
 	if (opt) {
 		/* Hop-by-Hop options header. */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen);
 
 		/* Destination options header (1st part). */
 		if (opt->ip6po_rthdr) {
 #ifndef RTHDR_SUPPORT_IMPLEMENTED
 			/*
 			 * If there is a routing header, discard the packet
 			 * right away here. RH0/1 are obsolete and we do not
 			 * currently support RH2/3/4.
 			 * People trying to use RH253/254 may want to disable
 			 * this check.
 			 * The moment we do support any routing header (again)
 			 * this block should check the routing type more
 			 * selectively.
 			 */
 			error = EINVAL;
 			goto bad;
 #endif
 
 			/*
 			 * Destination options header (1st part).
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1,
 			    optlen);
 		}
 		/* Routing header. */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen);
 
 		unfragpartlen += optlen;
 
 		/*
 		 * NOTE: we don't add AH/ESP length here (done in
 		 * ip6_ipsec_output()).
 		 */
 
 		/* Destination options header (2nd part). */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen);
 	}
 
 	/*
 	 * If there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	hdrsplit = false;
 	if (optlen) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		ip6 = mtod(m, struct ip6_hdr *);
 		hdrsplit = true;
 	}
 
 	/* Adjust mbuf packet header length. */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			ip6 = mtod(m, struct ip6_hdr *);
 			hdrsplit = true;
 		}
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 	nexthdrp = &ip6->ip6_nxt;
 
 	if (optlen) {
 		/*
 		 * Concatenate headers and fill in next header fields.
 		 * Here we have, on "m"
 		 *	IPv6 payload
 		 * and we insert headers accordingly.
 		 * Finally, we should be getting:
 		 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload].
 		 *
 		 * During the header composing process "m" points to IPv6
 		 * header.  "mprev" points to an extension header prior to esp.
 		 */
 		mprev = m;
 
 		/*
 		 * We treat dest2 specially.  This makes IPsec processing
 		 * much easier.  The goal here is to make mprev point the
 		 * mbuf prior to dest2.
 		 *
 		 * Result: IPv6 dest2 payload.
 		 * m and mprev will point to IPv6 header.
 		 */
 		if (exthdrs.ip6e_dest2) {
 			if (!hdrsplit)
 				panic("%s:%d: assumption failed: "
 				    "hdr not split: hdrsplit %d exthdrs %p",
 				    __func__, __LINE__, hdrsplit, &exthdrs);
 			exthdrs.ip6e_dest2->m_next = m->m_next;
 			m->m_next = exthdrs.ip6e_dest2;
 			*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_DSTOPTS;
 		}
 
 		/*
 		 * Result: IPv6 hbh dest1 rthdr dest2 payload.
 		 * m will point to IPv6 header.  mprev will point to the
 		 * extension header prior to dest2 (rthdr in the above case).
 		 */
 		MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 		MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 			   IPPROTO_DSTOPTS);
 		MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 			   IPPROTO_ROUTING);
 	}
 
 	IP6STAT_INC(ip6s_localout);
 
 	/* Route packet. */
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	if (ro != NULL)
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
 	else
 		dst = &sin6;
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 
 again:
 	/*
 	 * If specified, try to fill in the traffic class field.
 	 * Do not override if a non-zero value is already set.
 	 * We check the diffserv field and the ECN field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
 			mask |= 0xfc;
 		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* Fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 
 	if (ro == NULL || ro->ro_nh == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin6_family = AF_INET6;
 		dst->sin6_len = sizeof(*dst);
 		dst->sin6_addr = ip6->ip6_dst;
 	} 
 	/*
 	 * Validate route against routing table changes.
 	 * Make sure that the address family is set in route.
 	 */
 	nh = NULL;
 	ifp = NULL;
 	mtu = 0;
 	if (ro != NULL) {
 		if (ro->ro_nh != NULL && inp != NULL) {
 			ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? */
 			NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie,
 			    fibnum);
 		}
 		if (ro->ro_nh != NULL && fwd_tag == NULL &&
 		    (!NH_IS_VALID(ro->ro_nh) ||
 		    ro->ro_dst.sin6_family != AF_INET6 ||
 		    !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)))
 			RO_INVALIDATE_CACHE(ro);
 
 		if (ro->ro_nh != NULL && fwd_tag == NULL &&
 		    ro->ro_dst.sin6_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
 			nh = ro->ro_nh;
 			ifp = nh->nh_ifp;
 		} else {
 			if (ro->ro_lle)
 				LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
 			ro->ro_lle = NULL;
 			if (fwd_tag == NULL) {
 				bzero(&dst_sa, sizeof(dst_sa));
 				dst_sa.sin6_family = AF_INET6;
 				dst_sa.sin6_len = sizeof(dst_sa);
 				dst_sa.sin6_addr = ip6->ip6_dst;
 			}
 			error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp,
 			    &nh, fibnum, m->m_pkthdr.flowid);
 			if (error != 0) {
 				IP6STAT_INC(ip6s_noroute);
 				if (ifp != NULL)
 					in6_ifstat_inc(ifp, ifs6_out_discard);
 				goto bad;
 			}
 			if (ifp != NULL)
 			    mtu = ifp->if_mtu;
 		}
 		if (nh == NULL) {
 			/*
 			 * If in6_selectroute() does not return a nexthop
 			 * dst may not have been updated.
 			 */
 			*dst = dst_sa;	/* XXX */
 		} else {
 			if (nh->nh_flags & NHF_HOST)
 			    mtu = nh->nh_mtu;
 			ia = (struct in6_ifaddr *)(nh->nh_ifa);
 			counter_u64_add(nh->nh_pksent, 1);
 		}
 	} else {
 		struct nhop6_extended nh6;
 		struct in6_addr kdst;
 		uint32_t scopeid;
 
 		if (fwd_tag == NULL) {
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 		}
 
 		if (IN6_IS_ADDR_MULTICAST(&dst_sa.sin6_addr) &&
 		    im6o != NULL &&
 		    (ifp = im6o->im6o_multicast_ifp) != NULL) {
 			/* We do not need a route lookup. */
 			*dst = dst_sa;	/* XXX */
 			goto nonh6lookup;
 		}
 
 		in6_splitscope(&dst_sa.sin6_addr, &kdst, &scopeid);
 
 		if (IN6_IS_ADDR_MC_LINKLOCAL(&dst_sa.sin6_addr) ||
 		    IN6_IS_ADDR_MC_NODELOCAL(&dst_sa.sin6_addr)) {
 			if (scopeid > 0) {
 				ifp = in6_getlinkifnet(scopeid);
 				*dst = dst_sa;	/* XXX */
 				goto nonh6lookup;
 			}
 		}
 
 		error = fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0,
 		    &nh6);
 		if (error != 0) {
 			IP6STAT_INC(ip6s_noroute);
 			/* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */
 			error = EHOSTUNREACH;;
 			goto bad;
 		}
 
 		ifp = nh6.nh_ifp;
 		mtu = nh6.nh_mtu;
 		dst->sin6_addr = nh6.nh_addr;
 		ia = nh6.nh_ia;
 		fib6_free_nh_ext(fibnum, &nh6);
 nonh6lookup:
 		;
 	}
 
 	/* Then nh (for unicast) and ifp must be non-NULL valid values. */
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 
 	/* Setup data structures for scope ID checks. */
 	src0 = ip6->ip6_src;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 
 	dst0 = ip6->ip6_dst;
 	/* Re-initialize to be sure. */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 
 	/* Check for valid scope ID. */
 	if (in6_setscope(&src0, ifp, &zone) == 0 &&
 	    sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id &&
 	    in6_setscope(&dst0, ifp, &zone) == 0 &&
 	    sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) {
 		/*
 		 * The outgoing interface is in the zone of the source
 		 * and destination addresses.
 		 *
 		 * Because the loopback interface cannot receive
 		 * packets with a different scope ID than its own,
 		 * there is a trick to pretend the outgoing packet
 		 * was received by the real network interface, by
 		 * setting "origifp" different from "ifp". This is
 		 * only allowed when "ifp" is a loopback network
 		 * interface. Refer to code in nd6_output_ifp() for
 		 * more details.
 		 */
 		origifp = ifp;
 	
 		/*
 		 * We should use ia_ifp to support the case of sending
 		 * packets to an address of our own.
 		 */
 		if (ia != NULL && ia->ia_ifp)
 			ifp = ia->ia_ifp;
 
 	} else if ((ifp->if_flags & IFF_LOOPBACK) == 0 ||
 	    sa6_recoverscope(&src_sa) != 0 ||
 	    sa6_recoverscope(&dst_sa) != 0 ||
 	    dst_sa.sin6_scope_id == 0 ||
 	    (src_sa.sin6_scope_id != 0 &&
 	    src_sa.sin6_scope_id != dst_sa.sin6_scope_id) ||
 	    (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) {
 		/*
 		 * If the destination network interface is not a
 		 * loopback interface, or the destination network
 		 * address has no scope ID, or the source address has
 		 * a scope ID set which is different from the
 		 * destination address one, or there is no network
 		 * interface representing this scope ID, the address
 		 * pair is considered invalid.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(ifp, ifs6_out_discard);
 		if (error == 0)
 			error = EHOSTUNREACH; /* XXX */
 		goto bad;
 	}
 	/* All scope ID checks are successful. */
 
 	if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_nh) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((nh->nh_flags & NHF_GATEWAY))
 			dst = &nh->gw6_sa;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* Just in case. */
 	} else {
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 
 		/* Confirm that the outgoing interface supports multicast. */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			IP6STAT_INC(ip6s_noroute);
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		if ((im6o == NULL && in6_mcast_loop) ||
 		    (im6o && im6o->im6o_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we have not joined
 			 * the address; protocols will filter it later,
 			 * thus deferring a hash lookup and lock acquisition
 			 * at the expense of an m_copym().
 			 */
 			ip6_mloopback(ifp, m);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it may not always be the case.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing interface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
 		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
 		goto bad;
 	KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p "
 	    "alwaysfrag %d fibnum %u\n", __func__, __LINE__, mtu, ro_pmtu, ro,
 	    ifp, alwaysfrag, fibnum));
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * Clear embedded scope identifiers if necessary.
 	 * in6_clearscope() will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not contiguous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       contiguous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point. */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head))
 		goto passout;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	switch (pfil_run_hooks(V_inet6_pfil_head, &m, ifp, PFIL_OUT, inp)) {
 	case PFIL_PASS:
 		ip6 = mtod(m, struct ip6_hdr *);
 		break;
 	case PFIL_DROPPED:
 		error = EACCES;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		goto done;
 	}
 
 	needfiblookup = 0;
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else {
 			if (ro != NULL)
 				RO_INVALIDATE_CACHE(ro);
 			needfiblookup = 1; /* Redo the routing table lookup. */
 		}
 	}
 	/* See if fib was changed by packet filter. */
 	if (fibnum != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		fibnum = M_GETFIB(m);
 		if (ro != NULL)
 			RO_INVALIDATE_CACHE(ro);
 		needfiblookup = 1;
 	}
 	if (needfiblookup)
 		goto again;
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		if (ro != NULL)
 			dst = (struct sockaddr_in6 *)&ro->ro_dst;
 		else
 			dst = &sin6;
 		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		goto again;
 	}
 
 passout:
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * The logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request.
 	 */
 	sw_csum = m->m_pkthdr.csum_flags;
 	if (!hdrsplit) {
 		tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
 		sw_csum &= ~ifp->if_hwassist;
 	} else
 		tso = 0;
 	/*
 	 * If we added extension headers, we will not do TSO and calculate the
 	 * checksums ourselves for now.
 	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
 	 * with ext. hdrs.
 	 */
 	error = ip6_output_delayed_csum(m, ifp, sw_csum, plen, optlen, false);
 	if (error != 0)
 		goto bad;
 	/* XXX-BZ m->m_pkthdr.csum_flags &= ~ifp->if_hwassist; */
 	tlen = m->m_pkthdr.len;
 
 	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* Case 4. */
 		/* Conflicting request - can't transmit. */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* Case 2-b. */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data if
 		 * application wanted to know the MTU value. Also return an
 		 * error code (this is not described in the API spec).
 		 */
 		if (inp != NULL)
 			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/* Transmit packet without fragmentation. */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* Cases 1-a and 2-a. */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 			ifa_free(&ia6->ia_ifa);
 		}
 		error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
 		goto done;
 	}
 
 	/* Try to fragment the packet.  Cases 1-b and 3. */
 	if (mtu < IPV6_MMTU) {
 		/* Path MTU cannot be less than IPV6_MMTU. */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* Jumbo payload cannot be fragmented. */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		u_char nextproto;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * If the interface will not calculate checksums on
 		 * fragmented packets, then do it here.
 		 * XXX-BZ handle the hw offloading case.  Need flags.
 		 */
 		error = ip6_output_delayed_csum(m, ifp, m->m_pkthdr.csum_flags,
 		    plen, optlen, true);
 		if (error != 0)
 			goto bad;
 
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			ip6 = mtod(m, struct ip6_hdr *);
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 */
 		m0 = m;
 		id = htonl(ip6_randomid());
 		error = ip6_fragment(ifp, m, unfragpartlen, nextproto,len, id);
 		if (error != 0)
 			goto sendorfree;
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/* Remove leading garbage. */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IP6STAT_INC(ip6s_fragmented);
 
 done:
 	return (error);
 
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem() checks if mbuf is NULL. */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 /*
  * Copy an extension header of hlen bytes from hdr into a freshly
  * allocated mbuf and hand that mbuf back through mp.
  *
  * A cluster is attached when hlen exceeds MLEN; a header larger than
  * MCLBYTES is refused.  When hdr is NULL the mbuf length is still set
  * to hlen, but nothing is copied into it.
  *
  * Returns 0 on success, or ENOBUFS when the header is too large or no
  * mbuf could be allocated (*mp is left untouched in that case).
  */
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *n;
 
 	/* XXX: a header that does not fit in one cluster is not supported. */
 	if (hlen > MCLBYTES)
 		return (ENOBUFS);
 
 	n = (hlen > MLEN) ? m_getcl(M_NOWAIT, MT_DATA, 0) :
 	    m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL)
 		return (ENOBUFS);
 
 	n->m_len = hlen;
 	if (hdr != NULL)
 		bcopy(hdr, mtod(n, caddr_t), hlen);
 	*mp = n;
 
 	return (0);
 }
 
 /*
  * Insert jumbo payload option.
  *
  * Adds a Jumbo Payload option advertising (plen + 8) to the hop-by-hop
  * options header tracked in exthdrs, creating or growing that header as
  * needed, and grows the packet header length of exthdrs->ip6e_ip6 by the
  * same 8 bytes.
  *
  * Returns 0 on success, or ENOBUFS if an mbuf/cluster cannot be allocated
  * or the enlarged header would not fit in a single cluster.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t v;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	/*
 	 * If there is no hop-by-hop options header, allocate new one.
 	 * If there is one but it doesn't have enough space to store the
 	 * jumbo payload option, allocate a cluster to store the whole options.
 	 * Otherwise, use it to store the options.
 	 */
 	if (exthdrs->ip6e_hbh == NULL) {
 		mopt = m_get(M_NOWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		/*
 		 * optbuf[0] (the next-header byte) is not written here;
 		 * presumably the caller fills it in when chaining headers.
 		 */
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
 			/*
 			 * XXX assumption:
 			 * - exthdrs->ip6e_hbh is not referenced from places
 			 *   other than exthdrs.
 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: give up if the whole (new) hbh header does
 			 * not fit even in an mbuf cluster.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/*
 			 * As a consequence, we must always prepare a cluster
 			 * at this point.
 			 */
 			n = m_getcl(M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				return (ENOBUFS);
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, u_char *) + oldoptlen;
 			m_freem(mopt);
 			mopt = exthdrs->ip6e_hbh = n;
 		} else {
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		}
 		/*
 		 * The jumbo option itself starts at optbuf[2], so exactly
 		 * two bytes of padding precede it.  A PadN option covering
 		 * N bytes carries an Opt Data Len of N - 2, i.e. 0 here; a
 		 * length of 1 would make a parser treat optbuf[2] (the
 		 * jumbo option type) as pad data and miss the option.
 		 */
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 0;
 
 		/*
 		 * Adjust the header length according to the pad and
 		 * the jumbo payload option.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/* fill in the option. */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	/* The advertised length includes the 8 bytes added above. */
 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
 
 	/* finally, adjust the packet header length */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Build the front of one fragment: duplicate the unfragmentable
  * extension headers of the original packet m0 (everything between the
  * IPv6 header and offset hlen) behind m, then make room for a fragment
  * header at the end of that part.
  *
  * On success *frghdrp points at the (still unfilled) fragment header and
  * 0 is returned.  ENOBUFS is returned when an mbuf cannot be obtained.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *tail, *mfrg;
 
 	tail = m;
 	if (hlen > sizeof(struct ip6_hdr)) {
 		/* There are extension headers to replicate after m. */
 		tail = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
 		if (tail == NULL)
 			return (ENOBUFS);
 		m->m_next = tail;
 	}
 
 	/* Walk to the final mbuf of the unfragmentable part. */
 	while (tail->m_next != NULL)
 		tail = tail->m_next;
 
 	if (!M_WRITABLE(tail) ||
 	    M_TRAILINGSPACE(tail) < sizeof(struct ip6_frag)) {
 		/* No usable room there: hang a new mbuf off the end. */
 		mfrg = m_get(M_NOWAIT, MT_DATA);
 		if (mfrg == NULL)
 			return (ENOBUFS);
 		mfrg->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(mfrg, struct ip6_frag *);
 		tail->m_next = mfrg;
 		return (0);
 	}
 
 	/* Place the fragment header in the trailing space of the last mbuf. */
 	*frghdrp = (struct ip6_frag *)(mtod(tail, caddr_t) + tail->m_len);
 	tail->m_len += sizeof(struct ip6_frag);
 	m->m_pkthdr.len += sizeof(struct ip6_frag);
 
 	return (0);
 }
 
 /*
  * Look up the route towards @dst in FIB @fibnum and derive the IPv6
  * path MTU from it.  The result is written to @mtup.
  *
  * Returns 0 on success, EHOSTUNREACH when the lookup fails, or the
  * error reported by ip6_calcmtu().
  */
 static int
 ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
 {
 	struct nhop6_extended nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 	u_long rt_mtu;
 	int error;
 
 	in6_splitscope(dst, &kdst, &scopeid);
 	error = fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6);
 	if (error != 0)
 		return (EHOSTUNREACH);
 
 	/* The lookup result is released only after the MTU is computed. */
 	rt_mtu = nh6.nh_mtu;
 	error = ip6_calcmtu(nh6.nh_ifp, dst, rt_mtu, mtup, NULL, 0);
 	fib6_free_nh_ext(fibnum, &nh6);
 
 	return (error);
 }
 
 /*
  * Work out the IPv6 path MTU for @dst when transmitting on @ifp, using
  * (and refreshing) the MTU cached in @ro_pmtu where one is supplied.
  *
  * A route lookup is performed when there is no @ro_pmtu at all, or when
  * @do_lookup is set and @ro_pmtu holds no MTU for this destination; a
  * successful lookup stores the destination and MTU back into @ro_pmtu.
  * If @ro_pmtu carries a nexthop, that nexthop's MTU overrides the rest.
  *
  * The MTU and always-fragment indication are stored into @mtup and
  * @alwaysfragp by ip6_calcmtu(), whose return value is passed through
  * (0 on success).
  */
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp, u_int fibnum, u_int proto)
 {
 	struct nhop6_basic nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 	struct sockaddr_in6 *sa6_dst, sin6;
 	u_long mtu = 0;
 
 	if (ro_pmtu == NULL || do_lookup) {
 		/*
 		 * ro_pmtu describes the final destination, whereas the
 		 * route used for transmission may describe an intermediate
 		 * one; the MTU can differ, so key on ro_pmtu's destination.
 		 */
 		if (ro_pmtu == NULL) {
 			sa6_dst = &sin6;
 		} else {
 			sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 			/* Cached MTU belongs to another address: drop it. */
 			if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))
 				ro_pmtu->ro_mtu = 0;
 		}
 
 		if (ro_pmtu != NULL && ro_pmtu->ro_mtu != 0) {
 			mtu = ro_pmtu->ro_mtu;
 		} else {
 			bzero(sa6_dst, sizeof(*sa6_dst));
 			sa6_dst->sin6_family = AF_INET6;
 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
 			sa6_dst->sin6_addr = *dst;
 
 			in6_splitscope(dst, &kdst, &scopeid);
 			if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0,
 			    &nh6) == 0) {
 				mtu = nh6.nh_mtu;
 				if (ro_pmtu != NULL)
 					ro_pmtu->ro_mtu = mtu;
 			}
 		}
 	}
 
 	/* A nexthop attached to the route takes precedence. */
 	if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL)
 		mtu = ro_pmtu->ro_nh->nh_mtu;
 
 	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
 }
 
 /*
  * Combine the route MTU @rt_mtu, the link MTU of @ifp and (for non-TCP
  * protocols) the host cache entry for @dst into the MTU to use.
  *
  * The MTU is stored into @mtup; the always-fragment indication is stored
  * into @alwaysfragp when that pointer is non-NULL.
  *
  * Returns 0 on success, or EHOSTUNREACH when there is neither a route
  * MTU nor an interface to fall back on.
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
     u_long *mtup, int *alwaysfragp, u_int proto)
 {
 	u_long mtu = 0;
 	int alwaysfrag = 0;
 	int error = 0;
 
 	if (rt_mtu == 0) {
 		/* No route MTU: fall back to the link MTU, if any. */
 		if (ifp != NULL)
 			mtu = IN6_LINKMTU(ifp);
 		else
 			error = EHOSTUNREACH; /* XXX */
 	} else {
 		struct in_conninfo inc;
 		u_int32_t ifmtu;
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		ifmtu = IN6_LINKMTU(ifp);
 
 		/* TCP is known to react to pmtu changes so skip hc */
 		if (proto != IPPROTO_TCP)
 			mtu = tcp_hc_getmtu(&inc);
 
 		/* Use the smaller of host cache and route MTU, if both exist. */
 		mtu = (mtu != 0) ? min(mtu, rt_mtu) : rt_mtu;
 
 		if (mtu == 0) {
 			mtu = ifmtu;
 		} else if (mtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * if we record ICMPv6 too big message with
 			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
 			 * or smaller, with fragment header attached.
 			 * (fragment header is needed regardless of the
 			 * packet size, for translators to identify packets)
 			 */
 			alwaysfrag = 1;
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	*mtup = mtu;
 	if (alwaysfragp != NULL)
 		*alwaysfragp = alwaysfrag;
 
 	return (error);
 }
 
 /*
  * IP6 socket option processing.
  *
  * Services the set/get request described by sopt (direction in
  * sopt->sopt_dir) on socket so.
  *
  * For a level other than IPPROTO_IPV6 the result defaults to EINVAL; only
  * a few SOL_SOCKET set requests are acted upon, copying the socket-level
  * setting into the inpcb.  For IPPROTO_IPV6 the option named by
  * sopt->sopt_name is dispatched below; names that are not handled yield
  * ENOPROTOOPT.
  *
  * Returns 0 on success or an errno value.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *inp = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 /*
  * Don't use more than a quarter of mbuf clusters.  N.B.:
  * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow
  * on LP64 architectures, so cast to u_long to avoid undefined
  * behavior.  ILP32 architectures cannot have nmbclusters
  * large enough to overflow for other reasons.
  */
 #define IPV6_PKTOPTIONS_MBUF_LIMIT	((u_long)nmbclusters * MCLBYTES / 4)
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level != IPPROTO_IPV6) {
 		error = EINVAL;
 
 		/*
 		 * Mirror selected socket-level options, as currently recorded
 		 * in the socket, into the inpcb.
 		 */
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 	} else {		/* level == IPPROTO_IPV6 */
 		switch (op) {
 
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) {
 					printf("ip6_ctloutput: mbuf limit hit\n");
 					error = ENOBUFS;
 					break;
 				}
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				INP_WLOCK(inp);
 				error = ip6_pcbopts(&inp->in6p_outputopts, m,
 				    so, sopt);
 				INP_WUNLOCK(inp);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_ORIGDSTADDR:
 			case IPV6_BINDANY:
 			case IPV6_BINDMULTI:
 #ifdef	RSS
 			case IPV6_RSS_LISTEN_BUCKET:
 #endif
 				if (optname == IPV6_BINDANY && td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_BINDANY);
 					if (error)
 						break;
 				}
 
 				/* All options in this group take a single int. */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						inp->in6p_hops = optval;
 						if ((inp->inp_vflag &
 						     INP_IPV4) != 0)
 							inp->inp_ip_ttl = optval;
 					}
 					break;
 /*
  * Helpers for flipping/reading inpcb flag bits.  They rely on the locals
  * inp, optval and error of this function.  Only OPTSET is #undef'ed
  * further down; the others stay defined after this function.
  */
 #define OPTSET(bit) \
 do { \
 	INP_WLOCK(inp); \
 	if (optval) \
 		inp->inp_flags |= (bit); \
 	else \
 		inp->inp_flags &= ~(bit); \
 	INP_WUNLOCK(inp); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	INP_WLOCK(inp); \
 	inp->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		inp->inp_flags |= (bit); \
 	else \
 		inp->inp_flags &= ~(bit); \
 	INP_WUNLOCK(inp); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
 
 #define OPTSET2_N(bit, val) do {					\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 } while (0)
 #define OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	OPTSET2_N(bit, val);						\
 	INP_WUNLOCK(inp);						\
 } while (0)
 #define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0)
 #define OPTSET2292_EXCLUSIVE(bit)					\
 do {									\
 	INP_WLOCK(inp);							\
 	if (OPTBIT(IN6P_RFC2292)) {					\
 		error = EINVAL;						\
 	} else {							\
 		if (optval)						\
 			inp->inp_flags |= (bit);			\
 		else							\
 			inp->inp_flags &= ~(bit);			\
 	}								\
 	INP_WUNLOCK(inp);						\
 } while (/*CONSTCOND*/ 0)
 
 				case IPV6_RECVPKTINFO:
 					OPTSET2292_EXCLUSIVE(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					INP_WLOCK(inp);
 					if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 						INP_WUNLOCK(inp);
 						return (ECONNRESET);
 					}
 					optp = &inp->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(inp);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_RECVFLOWID:
 					OPTSET2(INP_RECVFLOWID, optval);
 					break;
 
 #ifdef	RSS
 				case IPV6_RECVRSSBUCKETID:
 					OPTSET2(INP_RECVRSSBUCKETID, optval);
 					break;
 #endif
 
 				case IPV6_V6ONLY:
 					INP_WLOCK(inp);
 					if (inp->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 						/*
 						 * The socket is already bound.
 						 */
 						INP_WUNLOCK(inp);
 						error = EINVAL;
 						break;
 					}
 					if (optval) {
 						inp->inp_flags |= IN6P_IPV6_V6ONLY;
 						inp->inp_vflag &= ~INP_IPV4;
 					} else {
 						inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
 						inp->inp_vflag |= INP_IPV4;
 					}
 					INP_WUNLOCK(inp);
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					OPTSET2292_EXCLUSIVE(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					OPTSET2(INP_ORIGDSTADDR, optval);
 					break;
 				case IPV6_BINDANY:
 					OPTSET(INP_BINDANY);
 					break;
 
 				case IPV6_BINDMULTI:
 					OPTSET2(INP_BINDMULTI, optval);
 					break;
 #ifdef	RSS
 				case IPV6_RSS_LISTEN_BUCKET:
 					if ((optval >= 0) &&
 					    (optval < rss_getnumbuckets())) {
 						INP_WLOCK(inp);
 						inp->inp_rss_listen_bucket = optval;
 						OPTSET2_N(INP_RSS_BUCKET_SET, 1);
 						INP_WUNLOCK(inp);
 					} else {
 						error = EINVAL;
 					}
 					break;
 #endif
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					INP_WLOCK(inp);
 					if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 						INP_WUNLOCK(inp);
 						return (ECONNRESET);
 					}
 					optp = &inp->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(inp);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				INP_WLOCK(inp);
 				if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 					INP_WUNLOCK(inp);
 					return (ECONNRESET);
 				}
 				optp = &inp->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				INP_WUNLOCK(inp);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			case IPV6_MSFILTER:
 			case MCAST_BLOCK_SOURCE:
 			case MCAST_UNBLOCK_SOURCE:
 			case MCAST_JOIN_GROUP:
 			case MCAST_LEAVE_GROUP:
 			case MCAST_JOIN_SOURCE_GROUP:
 			case MCAST_LEAVE_SOURCE_GROUP:
 				error = ip6_setmoptions(inp, sopt);
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				INP_WLOCK(inp);
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					inp->inp_flags &= ~(INP_LOWPORT);
 					inp->inp_flags &= ~(INP_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					inp->inp_flags &= ~(INP_LOWPORT);
 					inp->inp_flags |= INP_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					inp->inp_flags &= ~(INP_HIGHPORT);
 					inp->inp_flags |= INP_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				INP_WUNLOCK(inp);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, inp, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 			/*
 			 * IPV6_ORIGDSTADDR must be listed here as well;
 			 * otherwise its handler in the inner switch is
 			 * unreachable and reading the option back fails
 			 * with ENOPROTOOPT.
 			 */
 			case IPV6_ORIGDSTADDR:
 			case IPV6_BINDANY:
 			case IPV6_FLOWID:
 			case IPV6_FLOWTYPE:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RSSBUCKETID:
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_BINDMULTI:
 				/* Each of these is reported as a single int. */
 				switch (optname) {
 
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = inp->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = inp->inp_flags;
 					if (flags & INP_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & INP_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					optval = OPTBIT2(INP_ORIGDSTADDR);
 					break;
 
 				case IPV6_BINDANY:
 					optval = OPTBIT(INP_BINDANY);
 					break;
 
 				case IPV6_FLOWID:
 					optval = inp->inp_flowid;
 					break;
 
 				case IPV6_FLOWTYPE:
 					optval = inp->inp_flowtype;
 					break;
 
 				case IPV6_RECVFLOWID:
 					optval = OPTBIT2(INP_RECVFLOWID);
 					break;
 #ifdef	RSS
 				case IPV6_RSSBUCKETID:
 					retval =
 					    rss_hash2bucket(inp->inp_flowid,
 					    inp->inp_flowtype,
 					    &rss_bucket);
 					if (retval == 0)
 						optval = rss_bucket;
 					else
 						error = EINVAL;
 					break;
 
 				case IPV6_RECVRSSBUCKETID:
 					optval = OPTBIT2(INP_RECVRSSBUCKETID);
 					break;
 #endif
 
 				case IPV6_BINDMULTI:
 					optval = OPTBIT2(INP_BINDMULTI);
 					break;
 
 				}
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 				struct in6_addr addr;
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we do not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 * Copy faddr out of inp to avoid holding lock
 				 * on inp during route lookup.
 				 */
 				INP_RLOCK(inp);
 				bcopy(&inp->in6p_faddr, &addr, sizeof(addr));
 				INP_RUNLOCK(inp);
 				error = ip6_getpmtu_ctl(so->so_fibnum,
 				    &addr, &pmtu);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(inp, optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_MSFILTER:
 				error = ip6_getmoptions(inp, sopt);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, inp, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Socket option handler for raw IPv6 sockets.
  *
  * Only IPPROTO_IPV6 / IPV6_CHECKSUM is handled here.  Returns 0 on
  * success, EINVAL for a wrong level, direction, length or value, and
  * ENOPROTOOPT for any other option name.
  */
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval, optlen;
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 
 	if (level != IPPROTO_IPV6) {
 		return (EINVAL);
 	}
 
 	switch (optname) {
 	case IPV6_CHECKSUM:
 		/*
 		 * For ICMPv6 sockets, no modification allowed for checksum
 		 * offset, permit "no change" values to help existing apps.
 		 *
 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 		 * for an ICMPv6 socket will fail."
 		 * The current behavior does not meet RFC3542.
 		 */
 		switch (op) {
 		case SOPT_SET:
 			if (optlen != sizeof(int)) {
 				error = EINVAL;
 				break;
 			}
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 			/*
 			 * The API assumes non-negative even offset values
 			 * or -1 as a special value.  The parity test must
 			 * skip -1 explicitly: in C, -1 % 2 is -1, so a bare
 			 * "(optval % 2) != 0" would reject the special value.
 			 */
 			if (optval < -1 ||
 			    (optval != -1 && (optval % 2) != 0)) {
 				error = EINVAL;
 			} else if (so->so_proto->pr_protocol ==
 			    IPPROTO_ICMPV6) {
 				/* Only the fixed ICMPv6 offset is accepted. */
 				if (optval != icmp6off)
 					error = EINVAL;
 			} else
 				inp->in6p_cksum = optval;
 			break;
 
 		case SOPT_GET:
 			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
 				optval = icmp6off;
 			else
 				optval = inp->in6p_cksum;
 
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 
 	default:
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Replace the option set hanging off *pktopt with the options described
  * by the control mbuf m.  Any previous options are always discarded; when
  * m is absent or empty the result is simply "no options" (*pktopt is
  * NULL on return).
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct thread *td = sopt->sopt_td;
 	struct ucred *cred = (td != NULL) ? td->td_ucred : NULL;
 	struct ip6_pktopts *opts = *pktopt;
 	int rc;
 
 	if (opts == NULL) {
 		/* Nothing installed yet: get a container to fill in. */
 		opts = malloc(sizeof(*opts), M_IP6OPT, M_NOWAIT);
 		if (opts == NULL)
 			return (ENOMEM);
 	} else {
 		/* Reuse the existing container after dropping its contents. */
 #ifdef DIAGNOSTIC
 		if (opts->ip6po_pktinfo || opts->ip6po_nexthop ||
 		    opts->ip6po_hbh || opts->ip6po_dest1 || opts->ip6po_dest2 ||
 		    opts->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(opts, -1);
 	}
 	*pktopt = NULL;
 
 	/*
 	 * With no control data the caller only wanted the old options
 	 * gone, whether the container was inherited or just allocated.
 	 */
 	if (m == NULL || m->m_len == 0) {
 		free(opts, M_IP6OPT);
 		return (0);
 	}
 
 	/* Parse the user-supplied options into the container. */
 	rc = ip6_setpktopts(m, opts, NULL, cred, so->so_proto->pr_protocol);
 	if (rc != 0) {
 		ip6_clearpktopts(opts, -1); /* XXX: discard all options */
 		free(opts, M_IP6OPT);
 		return (rc);
 	}
 
 	*pktopt = opts;
 	return (0);
 }
 
 /*
  * Reset an ip6_pktopts to its pristine state.  Zeroing alone is not
  * enough: several members use a non-zero value to mean "default".
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(*opt));
 
 	/* Members whose "unset" value is not zero. */
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
 	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
 }
 
 /*
  * Set one sticky option (sticky=1, cmsg=0) on *pktopt, allocating and
  * initializing the option container first if the caller has none yet.
  */
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct ip6_pktopts *fresh;
 
 	if (*pktopt == NULL) {
 		fresh = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
 		    M_NOWAIT);
 		if (fresh == NULL)
 			return (ENOBUFS);
 		ip6_initpktopts(fresh);
 		*pktopt = fresh;
 	}
 
 	return (ip6_setpktopt(optname, buf, len, *pktopt, cred, 1, 0,
 	    uproto));
 }
 
 /*
  * Helpers for ip6_getpcbopt() that fetch a variable-length option stored
  * behind a pointer member of struct ip6_pktopts.
  *
  * They expand in the caller's scope and use its locals: inp, sopt,
  * pktopt, optdata, optdatalen and malloc_optdata.  The caller must hold
  * the inp read lock on entry.
  *
  * The read lock is released around the M_WAITOK allocation and taken
  * again afterwards.  Because the pcb may have changed meanwhile, the
  * macro re-checks the pcb flags (returning ECONNRESET from the enclosing
  * function, with the lock released, if the pcb is in timewait or
  * dropped) and reloads pktopt before copying.  If the option has gone
  * away the temporary buffer is freed and optdata is reset to NULL.
  *
  * "field" is a pointer member, so the copy source is pktopt->field
  * itself (the pointed-to object), not the address of the member.
  * At most sopt->sopt_valsize bytes are copied.
  */
 #define GET_PKTOPT_VAR(field, lenexpr) do {					\
 	if (pktopt && pktopt->field) {						\
 		INP_RUNLOCK(inp);						\
 		optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK);		\
 		malloc_optdata = true;						\
 		INP_RLOCK(inp);							\
 		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
 			INP_RUNLOCK(inp);					\
 			free(optdata, M_TEMP);					\
 			return (ECONNRESET);					\
 		}								\
 		pktopt = inp->in6p_outputopts;					\
 		if (pktopt && pktopt->field) {					\
 			optdatalen = min(lenexpr, sopt->sopt_valsize);		\
 			bcopy(pktopt->field, optdata, optdatalen);		\
 		} else {							\
 			free(optdata, M_TEMP);					\
 			optdata = NULL;						\
 			malloc_optdata = false;					\
 		}								\
 	}									\
 } while(0)
 
 /* Extension header: length comes from the header's own ip6e_len field. */
 #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field,				\
 	(((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3)
 
 /* Socket address: length comes from the stored sa_len. */
 #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field,			\
 	pktopt->field->sa_len)
 
 static int
 ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	bool malloc_optdata = false;
 	int optdatalen = 0;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 	struct ip6_pktopts *pktopt;
 
 	INP_RLOCK(inp);
 	pktopt = inp->in6p_outputopts;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		optdata = (void *)&null_pktinfo;
 		if (pktopt && pktopt->ip6po_pktinfo) {
 			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
 			    sizeof(null_pktinfo));
 			in6_clearscope(&null_pktinfo.ipi6_addr);
 		} else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			deftclass = pktopt->ip6po_tclass;
 		optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_HOPOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_hbh);
 		break;
 	case IPV6_RTHDR:
 		GET_PKTOPT_EXT_HDR(ip6po_rthdr);
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest1);
 		break;
 	case IPV6_DSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest2);
 		break;
 	case IPV6_NEXTHOP:
 		GET_PKTOPT_SOCKADDR(ip6po_nexthop);
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			defminmtu = pktopt->ip6po_minmtu;
 		optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			defpreftemp = pktopt->ip6po_prefer_tempaddr;
 		optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		INP_RUNLOCK(inp);
 		return (ENOPROTOOPT);
 	}
 	INP_RUNLOCK(inp);
 
 	error = sooptcopyout(sopt, optdata, optdatalen);
 	if (malloc_optdata)
 		free(optdata, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Release the storage behind one option, or behind every option when
  * optname is -1, and put the affected members back to their "unset"
  * value.  The ip6_pktopts structure itself is not freed.
  */
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	int all;
 
 	if (pktopt == NULL)
 		return;
 	all = (optname == -1);
 
 	if (all || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo != NULL) {
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 			pktopt->ip6po_pktinfo = NULL;
 		}
 	}
 
 	/* Scalar options: -1 stands for "use the default". */
 	if (all || optname == IPV6_HOPLIMIT)
 		pktopt->ip6po_hlim = -1;
 	if (all || optname == IPV6_TCLASS)
 		pktopt->ip6po_tclass = -1;
 
 	if (all || optname == IPV6_NEXTHOP) {
 		/* Drop the cached next-hop route before the address. */
 		if (pktopt->ip6po_nextroute.ro_nh != NULL) {
 			NH_FREE(pktopt->ip6po_nextroute.ro_nh);
 			pktopt->ip6po_nextroute.ro_nh = NULL;
 		}
 		if (pktopt->ip6po_nexthop != NULL) {
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 			pktopt->ip6po_nexthop = NULL;
 		}
 	}
 
 	if (all || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh != NULL) {
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 			pktopt->ip6po_hbh = NULL;
 		}
 	}
 
 	if (all || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1 != NULL) {
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 			pktopt->ip6po_dest1 = NULL;
 		}
 	}
 
 	if (all || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr != NULL) {
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 			pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		}
 		/* The route cached for the routing header goes with it. */
 		if (pktopt->ip6po_route.ro_nh != NULL) {
 			NH_FREE(pktopt->ip6po_route.ro_nh);
 			pktopt->ip6po_route.ro_nh = NULL;
 		}
 	}
 
 	if (all || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2 != NULL) {
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 			pktopt->ip6po_dest2 = NULL;
 		}
 	}
 }
 
 /*
  * Duplicate one extension header from src into dst.  The length is taken
  * from the header's own ip6e_len field.  Jumps to the "bad" label of the
  * enclosing function when the allocation fails.
  */
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 /*
  * Deep-copy the options in src into dst.
  *
  * Scalar members are copied by value; pktinfo, nexthop and the extension
  * headers are duplicated into fresh M_IP6OPT allocations made with the
  * caller-supplied malloc flags (canwait).  Cached routes are not copied.
  *
  * Returns 0 on success, EINVAL if either argument is NULL, or ENOBUFS if
  * an allocation fails, in which case everything already copied into dst
  * is released again.
  */
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		/* Report under this function's own name. */
 		printf("%s: invalid argument\n", __func__);
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	/* Undo the partial copy so dst owns no half-built state. */
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 /*
  * Allocate a new ip6_pktopts and fill it with a deep copy of src.
  * canwait is handed to malloc() as the wait flag.  Returns NULL when
  * either the container or any of its contents cannot be set up.
  */
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	struct ip6_pktopts *copy;
 
 	copy = malloc(sizeof(*copy), M_IP6OPT, canwait);
 	if (copy != NULL) {
 		ip6_initpktopts(copy);
 		if (copypktopts(copy, src, canwait) != 0) {
 			/* copypktopts() already released the contents. */
 			free(copy, M_IP6OPT);
 			copy = NULL;
 		}
 	}
 
 	return (copy);
 }
 
 /*
  * Destroy an option set: release every option it holds, then the
  * container itself.  A NULL argument is accepted and ignored.
  */
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt != NULL) {
 		ip6_clearpktopts(pktopt, -1);
 		free(pktopt, M_IP6OPT);
 	}
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  *
  * control is a single mbuf holding a sequence of cmsghdr records; opt is
  * (re)initialized here and then filled in.  When stickyopt is given, opt
  * starts out as a copy of it and the ancillary data overrides the copy.
  * Records whose level is not IPPROTO_IPV6 are skipped.
  *
  * The control mbuf's m_data/m_len are advanced while parsing.
  *
  * Returns 0 on success or an errno value; on error opt may hold a partial
  * set of options.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = NULL;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		/*
 		 * A record must at least cover its own header and must fit
 		 * in what is left of the buffer.  Without the lower bound,
 		 * "cmsg_len - CMSG_LEN(0)" below would go negative and
 		 * ip6_setpktopt() would look at bytes past the record.
 		 */
 		if (cm->cmsg_len < CMSG_LEN(0) ||
 		    cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  *
  * optname/buf/len describe the option and its raw value, opt is the
  * option set being modified, cred (may be NULL) is used for privilege
  * checks on header options, and uproto is the upper-layer protocol of
  * the socket.
  *
  * Returns 0 on success.  Errors seen here: EINVAL (bad length or value),
  * ENOPROTOOPT (option not valid in this context or unknown), ENOBUFS
  * (allocation failure), ENXIO/ENETDOWN/EADDRNOTAVAIL (pktinfo interface
  * or address problems), EAFNOSUPPORT (next hop family), or whatever
  * priv_check_cred()/sa6_embedscope() report.
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	int minmtupolicy, preftemp;
 	int error;
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
 			return (EINVAL);
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex > V_if_index)
 			 return (ENXIO);
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
 		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
 			return (ENETDOWN);
 
 		if (ifp != NULL &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			struct in6_ifaddr *ia;
 
 			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
 			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
 			if (ia == NULL)
 				return (EADDRNOTAVAIL);
 			ifa_free(&ia->ia_ifa);
 		}
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		/* *buf is sa_len, validated above for AF_INET6. */
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advanced API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/*
 		 * Turn off the previous option, then set the new option.
 		 * The old header is released through newdest instead of
 		 * ip6_clearpktopts(opt, optname): that routine has no case
 		 * for IPV6_2292DSTOPTS, so a header already sitting in the
 		 * chosen slot would be overwritten and leaked.
 		 */
 		if (*newdest != NULL) {
 			free(*newdest, M_IP6OPT);
 			*newdest = NULL;
 		}
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Loop a copy of an outgoing IPv6 multicast packet back to the input
  * side of the given interface.  Called from ip6_output().  The copy is
  * handed to if_simloop() with the caller's ifp, which need not be the
  * loopback interface itself.  Allocation failures are silent: the copy
  * is simply not delivered.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ip6_hdr *hdr;
 	struct mbuf *dup;
 
 	dup = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 	if (dup == NULL)
 		return;
 
 	/*
 	 * The IPv6 header is about to be modified, so it must live in
 	 * contiguous storage that is safe to write; pull it up into the
 	 * first mbuf when the copy does not already provide that.
 	 */
 	if (dup->m_len < sizeof(struct ip6_hdr) || !M_WRITABLE(dup)) {
 		dup = m_pullup(dup, sizeof(struct ip6_hdr));
 		if (dup == NULL)
 			return;
 	}
 
 	/*
 	 * Strip embedded scope identifiers from both addresses;
 	 * in6_clearscope() only touches addresses that carry one.
 	 */
 	hdr = mtod(dup, struct ip6_hdr *);
 	in6_clearscope(&hdr->ip6_src);
 	in6_clearscope(&hdr->ip6_dst);
 
 	/* A checksum that was still pending is marked as already valid. */
 	if ((dup->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) != 0) {
 		dup->m_pkthdr.csum_data = 0xffff;
 		dup->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
 		    CSUM_PSEUDO_HDR;
 	}
 
 	if_simloop(ifp, dup, AF_INET6, 0);
 }
 
 /*
  * Make the IPv6 header occupy an mbuf of its own at the head of the
  * chain and record that mbuf in exthdrs->ip6e_ip6.
  *
  * When the first mbuf holds more than the header, a new packet-header
  * mbuf is allocated, the header is copied into it, and the original
  * mbuf (now starting just past the header) is chained behind it.
  * On allocation failure the chain is freed and ENOBUFS is returned.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct mbuf *mh;
 
 	/* Nothing beyond the header in the first mbuf: use it as is. */
 	if (m->m_len <= sizeof(*ip6)) {
 		exthdrs->ip6e_ip6 = m;
 		return (0);
 	}
 
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* The new head takes over the packet header and the IPv6 header. */
 	m_move_pkthdr(mh, m);
 	M_ALIGN(mh, sizeof(*ip6));
 	mh->m_len = sizeof(*ip6);
 	bcopy((caddr_t)ip6, mtod(mh, caddr_t), sizeof(*ip6));
 
 	/* The old head now starts right after the header. */
 	m->m_data += sizeof(*ip6);
 	m->m_len -= sizeof(*ip6);
 	mh->m_next = m;
 
 	exthdrs->ip6e_ip6 = mh;
 	return (0);
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *inp)
 {
 	int len;
 
 	if (!inp->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(inp->in6p_outputopts->ip6po_hbh);
 	if (inp->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(inp->in6p_outputopts->ip6po_dest1);
 	len += elen(inp->in6p_outputopts->ip6po_rthdr);
 	len += elen(inp->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h	(revision 360578)
+++ head/sys/sys/mbuf.h	(revision 360579)
@@ -1,1570 +1,1572 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mbuf.h	8.5 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MBUF_H_
 #define	_SYS_MBUF_H_
 
 /* XXX: These includes suck. Sorry! */
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <vm/uma.h>
 #ifdef WITNESS
 #include <sys/lock.h>
 #endif
 #endif
 
 #ifdef _KERNEL
 #include <sys/sdt.h>
 
 #define	MBUF_PROBE1(probe, arg0)					\
 	SDT_PROBE1(sdt, , , probe, arg0)
 #define	MBUF_PROBE2(probe, arg0, arg1)					\
 	SDT_PROBE2(sdt, , , probe, arg0, arg1)
 #define	MBUF_PROBE3(probe, arg0, arg1, arg2)				\
 	SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2)
 #define	MBUF_PROBE4(probe, arg0, arg1, arg2, arg3)			\
 	SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3)
 #define	MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4)		\
 	SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4)
 
 SDT_PROBE_DECLARE(sdt, , , m__init);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr);
 SDT_PROBE_DECLARE(sdt, , , m__get);
 SDT_PROBE_DECLARE(sdt, , , m__getcl);
 SDT_PROBE_DECLARE(sdt, , , m__clget);
 SDT_PROBE_DECLARE(sdt, , , m__cljget);
 SDT_PROBE_DECLARE(sdt, , , m__cljset);
 SDT_PROBE_DECLARE(sdt, , , m__free);
 SDT_PROBE_DECLARE(sdt, , , m__freem);
 
 #endif /* _KERNEL */
 
 /*
  * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead.
  * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in
  * sys/param.h), which has no additional overhead and is used instead of the
  * internal data area; this is done when at least MINCLSIZE of data must be
  * stored.  Additionally, it is possible to allocate a separate buffer
  * externally and attach it to the mbuf in a way similar to that of mbuf
  * clusters.
  *
  * NB: These calculations do not take actual compiler-induced alignment and
  * padding inside the complete struct mbuf into account.  Appropriate
  * attention is required when changing members of struct mbuf.
  *
  * MLEN is data length in a normal mbuf.
  * MHLEN is data length in an mbuf with pktheader.
  * MINCLSIZE is a smallest amount of data that should be put into cluster.
  *
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are sensible.
  */
 struct mbuf;
 #define	MHSIZE		offsetof(struct mbuf, m_dat)
 #define	MPKTHSIZE	offsetof(struct mbuf, m_pktdat)
 #define	MLEN		((int)(MSIZE - MHSIZE))
 #define	MHLEN		((int)(MSIZE - MPKTHSIZE))
 #define	MINCLSIZE	(MHLEN + 1)
 #define	M_NODOM		255
 
 #ifdef _KERNEL
 /*-
  * Macro for type conversion: convert mbuf pointer to data pointer of correct
  * type:
  *
  * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
  * mtodo(m, o) -- Same as above but with offset 'o' into data.
  */
 #define	mtod(m, t)	((t)((m)->m_data))
 #define	mtodo(m, o)	((void *)(((m)->m_data) + (o)))
 
 /*
  * Argument structure passed to UMA routines during mbuf and packet
  * allocations.
  */
 struct mb_args {
 	int	flags;	/* Flags for mbuf being allocated */
 	short	type;	/* Type of mbuf being allocated */
 };
 #endif /* _KERNEL */
 
 /*
  * Packet tag structure (see below for details).
  */
 struct m_tag {
 	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
 	u_int16_t		m_tag_id;	/* Tag ID */
 	u_int16_t		m_tag_len;	/* Length of data */
 	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
 	void			(*m_tag_free)(struct m_tag *);
 };
 
 /*
  * Static network interface owned tag.
  * Allocated through ifp->if_snd_tag_alloc().
  */
 struct m_snd_tag {
 	struct ifnet *ifp;		/* network interface tag belongs to */
 	volatile u_int refcount;
 };
 
 /*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *	 LP64: 56
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 struct pkthdr {
 	union {
 		struct m_snd_tag *snd_tag;	/* send tag, if any */
 		struct ifnet	*rcvif;		/* rcv interface */
 	};
 	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
 	int32_t		 len;		/* total packet length */
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
 	uint32_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 numa_domain;	/* NUMA domain of recvd pkt */
 	uint8_t		 rsstype;	/* hash type */
 	union {
 		uint64_t	rcv_tstmp;	/* timestamp in ns */
 		struct {
 			uint8_t		 l2hlen;	/* layer 2 hdr len */
 			uint8_t		 l3hlen;	/* layer 3 hdr len */
 			uint8_t		 l4hlen;	/* layer 4 hdr len */
 			uint8_t		 l5hlen;	/* layer 5 hdr len */
 			uint32_t	 spare;
 		};
 	};
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void	*ptr;
 	} PH_per;
 
 	/* Layer specific non-persistent local storage for reassembly, etc. */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void 	*ptr;
 	} PH_loc;
 };
 #define	ether_vtag	PH_per.sixteen[0]
 #define	PH_vt		PH_per
 #define	vt_nrecs	sixteen[0]	  /* mld and v6-ND */
 #define	tso_segsz	PH_per.sixteen[1] /* inbound after LRO */
 #define	lro_nsegs	tso_segsz	  /* inbound after LRO */
 #define	csum_data	PH_per.thirtytwo[1] /* inbound from hardware up */
 #define lro_len		PH_loc.sixteen[0] /* inbound during LRO (no reassembly) */
 #define lro_csum	PH_loc.sixteen[1] /* inbound during LRO (no reassembly) */
 /* Note PH_loc is used during IP reassembly (all 8 bytes as a ptr) */
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following header lengths:
  * - 5 (AES-CBC with implicit IV)
  * - 21 (AES-CBC with explicit IV)
  * - 13 (AES-GCM with 8 byte explicit IV)
  */
 #define	MBUF_PEXT_HDR_LEN	23
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following maximum trailer
  * lengths:
  * - 16 (AES-GCM)
  * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
  * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
  * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
  */
 #define	MBUF_PEXT_TRAIL_LEN	64
 
 #if defined(__LP64__)
 #define MBUF_PEXT_MAX_PGS (40 / sizeof(vm_paddr_t))
 #else
 #define MBUF_PEXT_MAX_PGS (72 / sizeof(vm_paddr_t))
 #endif
 
 #define	MBUF_PEXT_MAX_BYTES						\
     (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
 
 struct ktls_session;
 struct socket;
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
  * set.
  * Size ILP32: 28
  *	 LP64: 48
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 typedef	void m_ext_free_t(struct mbuf *);
 struct m_ext {
 	union {
 		/*
 		 * If EXT_FLAG_EMBREF is set, then we use refcount in the
 		 * mbuf, the 'ext_count' member.  Otherwise, we have a
 		 * shadow copy and we use pointer 'ext_cnt'.  The original
 		 * mbuf is responsible to carry the pointer to free routine
 		 * and its arguments.  They aren't copied into shadows in
 		 * mb_dupcl() to avoid dereferencing next cachelines.
 		 */
 		volatile u_int	 ext_count;
 		volatile u_int	*ext_cnt;
 	};
 	uint32_t	 ext_size;	/* size of buffer, for ext_free */
 	uint32_t	 ext_type:8,	/* type of external storage */
 			 ext_flags:24;	/* external storage mbuf flags */
 	union {
 		struct {
 			/*
 			 * Regular M_EXT mbuf:
 			 * o ext_buf always points to the external buffer.
 			 * o ext_free (below) and two optional arguments
 			 *   ext_arg1 and ext_arg2 store the free context for
 			 *   the external storage.  They are set only in the
 			 *   refcount carrying mbuf, the one with
 			 *   EXT_FLAG_EMBREF flag, with exclusion for
 			 *   EXT_EXTREF type, where the free context is copied
 			 *   into all mbufs that use same external storage.
 			 */
 			char 	*ext_buf;	/* start of buffer */
 #define	m_ext_copylen	offsetof(struct m_ext, ext_arg2)
 			void	*ext_arg2;
 		};
 		struct {
 			/*
 			 * Multi-page M_EXTPG mbuf:
 			 * o extpg_pa - page vector.
 			 * o extpg_trail and extpg_hdr - TLS trailer and
 			 *   header.
 			 * Uses ext_free and may also use ext_arg1.
 			 */
 			vm_paddr_t	extpg_pa[MBUF_PEXT_MAX_PGS];
 			char		extpg_trail[MBUF_PEXT_TRAIL_LEN];
 			char		extpg_hdr[MBUF_PEXT_HDR_LEN];
 			/* Pretend these 3 fields are part of mbuf itself. */
 #define	m_epg_pa	m_ext.extpg_pa
 #define	m_epg_trail	m_ext.extpg_trail
 #define	m_epg_hdr	m_ext.extpg_hdr
-#define	m_epg_copylen	offsetof(struct m_ext, ext_free)
+#define	m_epg_ext_copylen	offsetof(struct m_ext, ext_free)
 		};
 	};
 	/*
 	 * Free method and optional argument pointer, both
 	 * used by M_EXT and M_EXTPG.
 	 */
 	m_ext_free_t	*ext_free;
 	void		*ext_arg1;
 };
 
 /*
  * The core of the mbuf object along with some shortcut defines for practical
  * purposes.
  */
 struct mbuf {
 	/*
 	 * Header present at the beginning of every mbuf.
 	 * Size ILP32: 24
 	 *      LP64: 32
 	 * Compile-time assertions in uipc_mbuf.c test these values to ensure
 	 * that they are correct.
 	 */
 	union {	/* next buffer in chain */
 		struct mbuf		*m_next;
 		SLIST_ENTRY(mbuf)	m_slist;
 		STAILQ_ENTRY(mbuf)	m_stailq;
 	};
 	union {	/* next chain in queue/record */
 		struct mbuf		*m_nextpkt;
 		SLIST_ENTRY(mbuf)	m_slistpkt;
 		STAILQ_ENTRY(mbuf)	m_stailqpkt;
 	};
 	caddr_t		 m_data;	/* location of data */
 	int32_t		 m_len;		/* amount of data in this mbuf */
 	uint32_t	 m_type:8,	/* type of data in this mbuf */
 			 m_flags:24;	/* flags; see below */
 #if !defined(__LP64__)
 	uint32_t	 m_pad;		/* pad for 64bit alignment */
 #endif
 
 	/*
 	 * A set of optional headers (packet header, external storage header)
 	 * and internal data storage.  Historically, these arrays were sized
 	 * to MHLEN (space left after a packet header) and MLEN (space left
 	 * after only a regular mbuf header); they are now variable size in
 	 * order to support future work on variable-size mbufs.
 	 */
 	union {
 		struct {
 			union {
 				/* M_PKTHDR set. */
 				struct pkthdr	m_pkthdr;
 
 				/* M_EXTPG set.
 				 * Multi-page M_EXTPG mbuf has its meta data
-				 * split between the mbuf_ext_pgs structure
+				 * split between the below anonymous structure
 				 * and m_ext.  It carries vector of pages,
 				 * optional header and trailer char vectors
 				 * and pointers to socket/TLS data.
 				 */
-				struct mbuf_ext_pgs {
+#define	m_epg_startcopy		m_epg_npgs
+#define	m_epg_endcopy		m_epg_stailq
+				struct {
 					/* Overall count of pages and count of
 					 * pages with I/O pending. */
-					uint8_t	npgs;
-					uint8_t	nrdy;
+					uint8_t	m_epg_npgs;
+					uint8_t	m_epg_nrdy;
 					/* TLS header and trailer lengths.
 					 * The data itself resides in m_ext. */
-					uint8_t	hdr_len;
-					uint8_t	trail_len;
-					/* Offset into 1st page and lenght of
+					uint8_t	m_epg_hdrlen;
+					uint8_t	m_epg_trllen;
+					/* Offset into 1st page and length of
 					 * data in the last page. */
-					uint16_t first_pg_off;
-					uint16_t last_pg_len;
-					uint8_t	flags;
+					uint16_t m_epg_1st_off;
+					uint16_t m_epg_last_len;
+					uint8_t	m_epg_flags;
 #define	EPG_FLAG_ANON	0x1	/* Data can be encrypted in place. */
 #define	EPG_FLAG_2FREE	0x2	/* Scheduled for free. */
-					uint8_t	record_type;
-					uint8_t	spare[2];
-					int	enc_cnt;
-					struct ktls_session *tls;
-					struct socket	*so;
-					uint64_t	seqno;
-					STAILQ_ENTRY(mbuf) stailq;
-				} m_ext_pgs;
+					uint8_t	m_epg_record_type;
+					uint8_t	__spare[2];
+					int	m_epg_enc_cnt;
+					struct ktls_session *m_epg_tls;
+					struct socket	*m_epg_so;
+					uint64_t	m_epg_seqno;
+					STAILQ_ENTRY(mbuf) m_epg_stailq;
+				};
 			};
 			union {
 				/* M_EXT or M_EXTPG set. */
 				struct m_ext	m_ext;
 				/* M_PKTHDR set, neither M_EXT nor M_EXTPG. */
 				char		m_pktdat[0];
 			};
 		};
 		char	m_dat[0];			/* !M_PKTHDR, !M_EXT */
 	};
 };
 
 #ifdef _KERNEL
 /*
  * Return the length of the data in page 'pidx' of a multi-page mbuf.
  *
  * The last page (index m_epg_npgs - 1) holds m_epg_last_len bytes; any
  * other page holds PAGE_SIZE - pgoff bytes.  A non-zero 'pgoff' is only
  * accepted for page 0, which is asserted below.
  */
 static inline int
 m_epg_pagelen(const struct mbuf *m, int pidx, int pgoff)
 {
 
 	KASSERT(pgoff == 0 || pidx == 0,
 	    ("page %d with non-zero offset %d in %p", pidx, pgoff, m));
 
-	if (pidx == m->m_ext_pgs.npgs - 1) {
-		return (m->m_ext_pgs.last_pg_len);
+	if (pidx == m->m_epg_npgs - 1) {
+		return (m->m_epg_last_len);
 	} else {
 		return (PAGE_SIZE - pgoff);
 	}
 }
 
 #ifdef INVARIANTS
 #define	MCHECK(ex, msg)	KASSERT((ex),				\
 	    ("Multi page mbuf %p with " #msg " at %s:%d",	\
 	    m, __FILE__, __LINE__))
 /*
  * Assert that the page bookkeeping of a multi-page mbuf is consistent:
  * page count, ready-page count, first-page offset, last-page length and
  * the TLS header/trailer lengths are each checked against their limits.
  *
  * NB: This expects a non-empty buffer (m_epg_npgs > 0 and
  * m_epg_last_len > 0).
  */
 #define	MBUF_EXT_PGS_ASSERT_SANITY(m)	do {				\
-	MCHECK(m->m_ext_pgs.npgs > 0, "no valid pages");		\
-	MCHECK(m->m_ext_pgs.npgs <= nitems(m->m_epg_pa),		\
+	MCHECK(m->m_epg_npgs > 0, "no valid pages");		\
+	MCHECK(m->m_epg_npgs <= nitems(m->m_epg_pa),		\
 	    "too many pages");						\
-	MCHECK(m->m_ext_pgs.nrdy <= m->m_ext_pgs.npgs,			\
+	MCHECK(m->m_epg_nrdy <= m->m_epg_npgs,			\
 	    "too many ready pages");					\
-	MCHECK(m->m_ext_pgs.first_pg_off < PAGE_SIZE,			\
+	MCHECK(m->m_epg_1st_off < PAGE_SIZE,			\
 		"too large page offset");				\
-	MCHECK(m->m_ext_pgs.last_pg_len > 0, "zero last page length");	\
-	MCHECK(m->m_ext_pgs.last_pg_len <= PAGE_SIZE,			\
+	MCHECK(m->m_epg_last_len > 0, "zero last page length");	\
+	MCHECK(m->m_epg_last_len <= PAGE_SIZE,			\
 	    "too large last page length");				\
-	if (m->m_ext_pgs.npgs == 1)					\
-		MCHECK(m->m_ext_pgs.first_pg_off +			\
-		    m->m_ext_pgs.last_pg_len <=	 PAGE_SIZE,		\
+	if (m->m_epg_npgs == 1)					\
+		MCHECK(m->m_epg_1st_off +			\
+		    m->m_epg_last_len <=	 PAGE_SIZE,		\
 		    "single page too large");				\
-	MCHECK(m->m_ext_pgs.hdr_len <= sizeof(m->m_epg_hdr),		\
+	MCHECK(m->m_epg_hdrlen <= sizeof(m->m_epg_hdr),		\
 	    "too large header length");					\
-	MCHECK(m->m_ext_pgs.trail_len <= sizeof(m->m_epg_trail),	\
+	MCHECK(m->m_epg_trllen <= sizeof(m->m_epg_trail),	\
 	    "too large trailer length");				\
 } while (0)
 #else
 /*
  * No-op variant.  The expansion deliberately carries no trailing ';' so
  * that the semicolon written by the caller completes the statement and the
  * macro stays usable as the sole body of an if that has an else.
  */
 #define	MBUF_EXT_PGS_ASSERT_SANITY(m)	do {} while (0)
 #endif
 #endif
 
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
  * to M_PROTO[1-11] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 #define	M_EXT		0x00000001 /* has associated external storage */
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data is marked read-only */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
 #define	M_PROMISC	0x00000040 /* packet was not for us */
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_NOMAP		0x00000100 /* mbuf data is unmapped */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 #define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
 #define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
 				      hw-stamped on port (useful for IEEE 1588
 				      and 802.1AS) */
 #define M_TSTMP_LRO	0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
 
 #define	M_PROTO1	0x00002000 /* protocol-specific */
 #define	M_PROTO2	0x00004000 /* protocol-specific */
 #define	M_PROTO3	0x00008000 /* protocol-specific */
 #define	M_PROTO4	0x00010000 /* protocol-specific */
 #define	M_PROTO5	0x00020000 /* protocol-specific */
 #define	M_PROTO6	0x00040000 /* protocol-specific */
 #define	M_PROTO7	0x00080000 /* protocol-specific */
 #define	M_PROTO8	0x00100000 /* protocol-specific */
 #define	M_PROTO9	0x00200000 /* protocol-specific */
 #define	M_PROTO10	0x00400000 /* protocol-specific */
 #define	M_PROTO11	0x00800000 /* protocol-specific */
 
 #define MB_DTOR_SKIP	0x1	/* don't pollute the cache by touching a freed mbuf */
 
 /*
  * Flags to purge when crossing layers.
  */
 #define	M_PROTOFLAGS \
     (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
      M_PROTO9|M_PROTO10|M_PROTO11)
 
 /*
  * Flags preserved when copying m_pkthdr.
  */
 #define M_COPYFLAGS \
     (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \
      M_TSTMP_HPREC|M_TSTMP_LRO|M_PROTOFLAGS)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  */
 #define	M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
     "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC\15M_TSTMP_LRO"
 /*
  * %b bit names for the M_PROTO* flags.  Each name is preceded by its
  * 1-based bit number written as an octal escape; M_PROTO11 (0x00800000)
  * is bit 24, i.e. \30 ('8' is not an octal digit, so "\28" would encode
  * bit 2 followed by a literal '8').
  */
 #define	M_FLAG_PROTOBITS \
     "\16M_PROTO1\17M_PROTO2\20M_PROTO3\21M_PROTO4" \
     "\22M_PROTO5\23M_PROTO6\24M_PROTO7\25M_PROTO8\26M_PROTO9" \
     "\27M_PROTO10\30M_PROTO11"
 #define	M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
 
 /*
  * Network interface cards are able to hash protocol fields (such as IPv4
  * addresses and TCP port numbers) classify packets into flows.  These flows
  * can then be used to maintain ordering while delivering packets to the OS
  * via parallel input queues, as well as to provide a stateless affinity
  * model.  NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set
  * m_flag fields to indicate how the hash should be interpreted by the
  * network stack.
  *
  * Most NICs support RSS, which provides ordering and explicit affinity, and
  * use the hash m_flag bits to indicate what header fields were covered by
  * the hash.  M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non-
  * RSS cards or configurations that provide an opaque flow identifier, allowing
  * for ordering and distribution without explicit affinity.  Additionally,
  * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash
  * properties.
  *
  * The meaning of the IPV6_EX suffix:
  * "o  Home address from the home address option in the IPv6 destination
  *     options header.  If the extension header is not present, use the Source
  *     IPv6 Address.
  *  o  IPv6 address that is contained in the Routing-Header-Type-2 from the
  *     associated extension header.  If the extension header is not present,
  *     use the Destination IPv6 Address."
  * Quoted from:
  * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex
  */
 #define	M_HASHTYPE_HASHPROP		0x80	/* has hash properties */
 #define	M_HASHTYPE_HASH(t)		(M_HASHTYPE_HASHPROP | (t))
 /* Microsoft RSS standard hash types */
 #define	M_HASHTYPE_NONE			0
 #define	M_HASHTYPE_RSS_IPV4		M_HASHTYPE_HASH(1) /* IPv4 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV4		M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6		M_HASHTYPE_HASH(3) /* IPv6 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV6		M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6_EX		M_HASHTYPE_HASH(5) /* IPv6 2-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_TCP_IPV6_EX	M_HASHTYPE_HASH(6) /* TCPv6 4-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6		M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
 							    * ext hdrs */
 
 #define	M_HASHTYPE_OPAQUE		63	/* ordering, not affinity */
 #define	M_HASHTYPE_OPAQUE_HASH		M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
 						/* ordering+hash, not affinity*/
 
 #define	M_HASHTYPE_CLEAR(m)	((m)->m_pkthdr.rsstype = 0)
 #define	M_HASHTYPE_GET(m)	((m)->m_pkthdr.rsstype)
 #define	M_HASHTYPE_SET(m, v)	((m)->m_pkthdr.rsstype = (v))
 #define	M_HASHTYPE_TEST(m, v)	(M_HASHTYPE_GET(m) == (v))
 #define	M_HASHTYPE_ISHASH(m)	(M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP)
 
 /*
  * External mbuf storage buffer types.
  */
 #define	EXT_CLUSTER	1	/* mbuf cluster */
 #define	EXT_SFBUF	2	/* sendfile(2)'s sf_buf */
 #define	EXT_JUMBOP	3	/* jumbo cluster page sized */
 #define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
 #define	EXT_JUMBO16	5	/* jumbo cluster 16184 bytes */
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference */
 #define	EXT_RXRING	8	/* data in NIC receive ring */
 #define	EXT_PGS		9	/* array of unmapped pages */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */
 #define	EXT_VENDOR3	226	/* for vendor-internal use */
 #define	EXT_VENDOR4	227	/* for vendor-internal use */
 
 #define	EXT_EXP1	244	/* for experimental use */
 #define	EXT_EXP2	245	/* for experimental use */
 #define	EXT_EXP3	246	/* for experimental use */
 #define	EXT_EXP4	247	/* for experimental use */
 
 #define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
 #define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
 #define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
 #define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */
 
 /*
  * Flags for external mbuf buffer types.
  * NB: limited to the lower 24 bits.
  */
 #define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_count */
 #define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */
 
 #define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */
 
 #define	EXT_FLAG_VENDOR1	0x010000	/* These flags are vendor */
 #define	EXT_FLAG_VENDOR2	0x020000	/* or submodule specific, */
 #define	EXT_FLAG_VENDOR3	0x040000	/* not used by mbuf code. */
 #define	EXT_FLAG_VENDOR4	0x080000	/* Set/read by submodule. */
 
 #define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
 #define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
 #define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
 #define	EXT_FLAG_EXP4		0x800000	/* for experimental use */
 
 /*
  * EXT flag description for use with printf(9) %b identifier.
  */
 #define	EXT_FLAG_BITS \
     "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \
     "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \
     "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
     "\30EXT_FLAG_EXP4"
 
 #define MBUF_EXT_PGS_ASSERT(m)						\
 	KASSERT((((m)->m_flags & M_EXT) != 0) &&			\
 	    ((m)->m_ext.ext_type == EXT_PGS),				\
 	    ("%s: m %p !M_EXT or !EXT_PGS", __func__, m))
 
 /*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.
  *
  * Outbound flags that are set by upper protocol layers requesting lower
  * layers, or ideally the hardware, to perform these offloading tasks.
  * For outbound packets this field and its flags can be directly tested
  * against ifnet if_hwassist.
  */
 #define	CSUM_IP			0x00000001	/* IP header checksum offload */
 #define	CSUM_IP_UDP		0x00000002	/* UDP checksum offload */
 #define	CSUM_IP_TCP		0x00000004	/* TCP checksum offload */
 #define	CSUM_IP_SCTP		0x00000008	/* SCTP checksum offload */
 #define	CSUM_IP_TSO		0x00000010	/* TCP segmentation offload */
 #define	CSUM_IP_ISCSI		0x00000020	/* iSCSI checksum offload */
 
 #define	CSUM_IP6_UDP		0x00000200	/* UDP checksum offload */
 #define	CSUM_IP6_TCP		0x00000400	/* TCP checksum offload */
 #define	CSUM_IP6_SCTP		0x00000800	/* SCTP checksum offload */
 #define	CSUM_IP6_TSO		0x00001000	/* TCP segmentation offload */
 #define	CSUM_IP6_ISCSI		0x00002000	/* iSCSI checksum offload */
 
 /* Inbound checksum support where the checksum was verified by hardware. */
 #define	CSUM_L3_CALC		0x01000000	/* calculated layer 3 csum */
 #define	CSUM_L3_VALID		0x02000000	/* checksum is correct */
 #define	CSUM_L4_CALC		0x04000000	/* calculated layer 4 csum */
 #define	CSUM_L4_VALID		0x08000000	/* checksum is correct */
 #define	CSUM_L5_CALC		0x10000000	/* calculated layer 5 csum */
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESCED		0x40000000	/* contains merged segments */
 
 #define	CSUM_SND_TAG		0x80000000	/* Packet header has send tag */
 
 /*
  * CSUM flag description for use with printf(9) %b identifier.
  */
 #define	CSUM_BITS \
     "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
     "\6CSUM_IP_ISCSI" \
     "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
     "\16CSUM_IP6_ISCSI" \
     "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
     "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
 #define	CSUM_IP_VALID		CSUM_L3_VALID
 #define	CSUM_DATA_VALID		CSUM_L4_VALID
 #define	CSUM_PSEUDO_HDR		CSUM_L4_CALC
 #define	CSUM_SCTP_VALID		CSUM_L4_VALID
 #define	CSUM_DELAY_DATA		(CSUM_TCP|CSUM_UDP)
 #define	CSUM_DELAY_IP		CSUM_IP		/* Only v4, no v6 IP hdr csum */
 #define	CSUM_DELAY_DATA_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6)
 #define	CSUM_DATA_VALID_IPV6	CSUM_DATA_VALID
 #define	CSUM_TCP		CSUM_IP_TCP
 #define	CSUM_UDP		CSUM_IP_UDP
 #define	CSUM_SCTP		CSUM_IP_SCTP
 #define	CSUM_TSO		(CSUM_IP_TSO|CSUM_IP6_TSO)
 #define	CSUM_UDP_IPV6		CSUM_IP6_UDP
 #define	CSUM_TCP_IPV6		CSUM_IP6_TCP
 #define	CSUM_SCTP_IPV6		CSUM_IP6_SCTP
 
 /*
  * mbuf types describing the content of the mbuf (including external storage).
  */
 #define	MT_NOTMBUF	0	/* USED INTERNALLY ONLY! Object is not mbuf */
 #define	MT_DATA		1	/* dynamic (data) allocation */
 #define	MT_HEADER	MT_DATA	/* packet header, use M_PKTHDR instead */
 
 #define	MT_VENDOR1	4	/* for vendor-internal use */
 #define	MT_VENDOR2	5	/* for vendor-internal use */
 #define	MT_VENDOR3	6	/* for vendor-internal use */
 #define	MT_VENDOR4	7	/* for vendor-internal use */
 
 #define	MT_SONAME	8	/* socket name */
 
 #define	MT_EXP1		9	/* for experimental use */
 #define	MT_EXP2		10	/* for experimental use */
 #define	MT_EXP3		11	/* for experimental use */
 #define	MT_EXP4		12	/* for experimental use */
 
 #define	MT_CONTROL	14	/* extra-data protocol message */
 #define	MT_EXTCONTROL	15	/* control message with externalized contents */
 #define	MT_OOBDATA	16	/* expedited data  */
 
 #define	MT_NOINIT	255	/* Not a type but a flag to allocate
 				   a non-initialized mbuf */
 
 /*
  * String names of mbuf-related UMA(9) and malloc(9) types.  Exposed to
  * !_KERNEL so that monitoring tools can look up the zones with
  * libmemstat(3).
  */
 #define	MBUF_MEM_NAME		"mbuf"
 #define	MBUF_CLUSTER_MEM_NAME	"mbuf_cluster"
 #define	MBUF_PACKET_MEM_NAME	"mbuf_packet"
 #define	MBUF_JUMBOP_MEM_NAME	"mbuf_jumbo_page"
 #define	MBUF_JUMBO9_MEM_NAME	"mbuf_jumbo_9k"
 #define	MBUF_JUMBO16_MEM_NAME	"mbuf_jumbo_16k"
 #define	MBUF_TAG_MEM_NAME	"mbuf_tag"
 #define	MBUF_EXTREFCNT_MEM_NAME	"mbuf_ext_refcnt"
 #define	MBUF_EXTPGS_MEM_NAME	"mbuf_extpgs"
 
 #ifdef _KERNEL
 
 #ifdef WITNESS
 #define	MBUF_CHECKSLEEP(how) do {					\
 	if (how == M_WAITOK)						\
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,		\
 		    "Sleeping in \"%s\"", __func__);			\
 } while (0)
 #else
 #define	MBUF_CHECKSLEEP(how)
 #endif
 
 /*
  * Network buffer allocation API
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;
 extern uma_zone_t	zone_jumbop;
 extern uma_zone_t	zone_jumbo9;
 extern uma_zone_t	zone_jumbo16;
 extern uma_zone_t	zone_extpgs;
 
 void		 mb_dupcl(struct mbuf *, struct mbuf *);
 void		 mb_free_ext(struct mbuf *);
 void		 mb_free_mext_pgs(struct mbuf *);
 struct mbuf	*mb_alloc_ext_pgs(int, m_ext_free_t);
 int		 mb_unmapped_compress(struct mbuf *m);
 struct mbuf 	*mb_unmapped_to_ext(struct mbuf *m);
 void		 mb_free_notready(struct mbuf *m, int count);
 void		 m_adj(struct mbuf *, int);
 int		 m_apply(struct mbuf *, int, int,
 		    int (*)(void *, void *, u_int), void *);
 int		 m_append(struct mbuf *, int, c_caddr_t);
 void		 m_cat(struct mbuf *, struct mbuf *);
 void		 m_catpkt(struct mbuf *, struct mbuf *);
 int		 m_clget(struct mbuf *m, int how);
 void 		*m_cljget(struct mbuf *m, int how, int size);
 struct mbuf	*m_collapse(struct mbuf *, int, int);
 void		 m_copyback(struct mbuf *, int, int, c_caddr_t);
 void		 m_copydata(const struct mbuf *, int, int, caddr_t);
 struct mbuf	*m_copym(struct mbuf *, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote_pkthdr(struct mbuf *);
 void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 void		 m_dispose_extcontrolm(struct mbuf *m);
 struct mbuf	*m_dup(const struct mbuf *, int);
 int		 m_dup_pkthdr(struct mbuf *, const struct mbuf *, int);
 void		 m_extadd(struct mbuf *, char *, u_int, m_ext_free_t,
 		    void *, void *, int, int);
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
 struct mbuf	*m_get2(int, int, short, int);
 struct mbuf	*m_getjcl(int, short, int, int);
 struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 int		 m_mbuftouio(struct uio *, const struct mbuf *, int);
 int		 m_unmappedtouio(const struct mbuf *, int, struct uio *, int);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
 int		 m_pkthdr_init(struct mbuf *, int);
 struct mbuf	*m_prepend(struct mbuf *, int, int);
 void		 m_print(const struct mbuf *, int);
 struct mbuf	*m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 struct mbuf	*m_unshare(struct mbuf *, int);
 void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *);
 void		 m_snd_tag_destroy(struct m_snd_tag *);
 
 /*
  * Translate a buffer size into the matching EXT_* storage type.
  * Any size other than the ones listed below panics.
  */
 static __inline int
 m_gettype(int size)
 {
 
 	switch (size) {
 	case MSIZE:
 		return (EXT_MBUF);
 	case MCLBYTES:
 		return (EXT_CLUSTER);
 #if MJUMPAGESIZE != MCLBYTES
 	/* Only a distinct case when it does not coincide with MCLBYTES. */
 	case MJUMPAGESIZE:
 		return (EXT_JUMBOP);
 #endif
 	case MJUM9BYTES:
 		return (EXT_JUMBO9);
 	case MJUM16BYTES:
 		return (EXT_JUMBO16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Associate an external reference counted buffer with an mbuf.
  *
  * Adds one reference to *ref_cnt, sets M_EXT on the mbuf, points m_data at
  * the start of 'buf' and records the buffer, its size, the free routine and
  * both free arguments in m_ext.  The storage type is set to EXT_EXTREF and
  * the external storage flags are cleared.
  */
 static __inline void
 m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt,
     m_ext_free_t freef, void *arg1, void *arg2)
 {
 	struct m_ext *ext;
 
 	KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__));
 
 	/* The externally supplied counter gains a reference for this mbuf. */
 	atomic_add_int(ref_cnt, 1);
 
 	ext = &m->m_ext;
 	ext->ext_type = EXT_EXTREF;
 	ext->ext_flags = 0;
 	ext->ext_cnt = ref_cnt;
 	ext->ext_buf = buf;
 	ext->ext_size = size;
 	ext->ext_free = freef;
 	ext->ext_arg1 = arg1;
 	ext->ext_arg2 = arg2;
 
 	m->m_data = buf;
 	m->m_flags |= M_EXT;
 }
 
 /*
  * Select the UMA zone that backs clusters of the given size.
  * Any size other than the ones listed below panics.
  */
 static __inline uma_zone_t
 m_getzone(int size)
 {
 	uma_zone_t z;
 
 	if (size == MCLBYTES)
 		z = zone_clust;
 #if MJUMPAGESIZE != MCLBYTES
 	else if (size == MJUMPAGESIZE)
 		z = zone_jumbop;
 #endif
 	else if (size == MJUM9BYTES)
 		z = zone_jumbo9;
 	else if (size == MJUM16BYTES)
 		z = zone_jumbo16;
 	else
 		panic("%s: invalid cluster size %d", __func__, size);
 
 	return (z);
 }
 
 /*
  * Set up an mbuf that uses its own embedded (linear) storage.
  *
  * Kept inline: a call taking this many arguments costs the consumer about
  * as much text as the initialization itself, and the M_PKTHDR branch is
  * expected to disappear through constant propagation for !MGETHDR.
  *
  * Returns the result of m_pkthdr_init() when M_PKTHDR is requested in
  * flags, and 0 otherwise.
  */
 static __inline int
 m_init(struct mbuf *m, int how, short type, int flags)
 {
 	int error = 0;
 
 	m->m_type = type;
 	m->m_flags = flags;
 	m->m_len = 0;
 	m->m_data = m->m_dat;
 	m->m_nextpkt = NULL;
 	m->m_next = NULL;
 
 	if ((flags & M_PKTHDR) != 0)
 		error = m_pkthdr_init(m, how);
 
 	MBUF_PROBE5(m__init, m, how, type, flags, error);
 	return (error);
 }
 
 static __inline struct mbuf *
 m_get(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_getcl(int how, short type, int flags)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = flags;
 	args.type = type;
 	m = uma_zalloc_arg(zone_pack, &args, how);
 	MBUF_PROBE4(m__getcl, how, type, flags, m);
 	return (m);
 }
 
 /*
  * Attach the cluster cl of storage type 'type' to mbuf m.
  *
  * XXX: this is a dangerous API.  Only a new, unreferenced cluster may be
  * attached to an mbuf(9).  That cannot be asserted here, so the care has
  * to come from the users of the API.
  *
  * An unrecognized type panics.
  */
 static __inline void
 m_cljset(struct mbuf *m, void *cl, int type)
 {
 	int cl_size;
 
 	switch (type) {
 	case EXT_CLUSTER:
 		cl_size = MCLBYTES;
 		break;
 #if MJUMPAGESIZE != MCLBYTES
 	case EXT_JUMBOP:
 		cl_size = MJUMPAGESIZE;
 		break;
 #endif
 	case EXT_JUMBO9:
 		cl_size = MJUM9BYTES;
 		break;
 	case EXT_JUMBO16:
 		cl_size = MJUM16BYTES;
 		break;
 	default:
 		panic("%s: unknown cluster type %d", __func__, type);
 	}
 
 	/* Point both the external buffer and the data pointer at cl. */
 	m->m_ext.ext_buf = cl;
 	m->m_data = m->m_ext.ext_buf;
 
 	/* No free routine or arguments are recorded. */
 	m->m_ext.ext_arg2 = NULL;
 	m->m_ext.ext_arg1 = NULL;
 	m->m_ext.ext_free = NULL;
 
 	m->m_ext.ext_size = cl_size;
 	m->m_ext.ext_type = type;
 
 	/* The reference count is embedded and starts at one. */
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_count = 1;
 
 	m->m_flags |= M_EXT;
 	MBUF_PROBE3(m__cljset, m, cl, type);
 }
 
 /*
  * Overwrite the type field of an mbuf with new_type.
  */
 static __inline void
 m_chtype(struct mbuf *m, short new_type)
 {
 
 	(m)->m_type = new_type;
 }
 
 static __inline void
 m_clrprotoflags(struct mbuf *m)
 {
 
 	while (m) {
 		m->m_flags &= ~M_PROTOFLAGS;
 		m = m->m_next;
 	}
 }
 
 static __inline struct mbuf *
 m_last(struct mbuf *m)
 {
 
 	while (m->m_next)
 		m = m->m_next;
 	return (m);
 }
 
 /*
  * Return the reference count of an M_EXT mbuf's external storage: the
  * embedded counter when EXT_FLAG_EMBREF is set, otherwise the counter
  * that ext_cnt points to.
  */
 static inline u_int
 m_extrefcnt(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__));
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) != 0)
 		return (m->m_ext.ext_count);
 	return (*m->m_ext.ext_cnt);
 }
 
 /*
  * mbuf, cluster, and external object allocation macros (for compatibility
  * purposes).
  */
 #define	M_MOVE_PKTHDR(to, from)	m_move_pkthdr((to), (from))
 #define	MGET(m, how, type)	((m) = m_get((how), (type)))
 #define	MGETHDR(m, how, type)	((m) = m_gethdr((how), (type)))
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, arg1, arg2, flags, type)		\
     m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2),	\
     (flags), (type))
 #define	m_getm(m, len, how, type)					\
     m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can
  * be both the local data payload, or an external buffer area, depending on
  * whether M_EXT is set).
  *
  * The result is false when either M_RDONLY or M_NOMAP is set, and for
  * M_EXT mbufs when m_extrefcnt() reports anything other than 1.
  *
  * NB: m appears several times in the expansion, so an argument with side
  * effects may be evaluated more than once.
  */
 #define	M_WRITABLE(m)	(((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 &&	\
 			 (!(((m)->m_flags & M_EXT)) ||			\
 			 (m_extrefcnt(m) == 1)))
 
 /*
  * Assert that the supplied mbuf is non-NULL and carries a packet header
  * (M_PKTHDR); the KASSERT fires otherwise.
  */
 #define	M_ASSERTPKTHDR(m)						\
 	KASSERT((m) != NULL && (((m)->m_flags & M_PKTHDR) != 0),	\
 	    ("%s: no mbuf packet header!", __func__))
 
 /*
  * Ensure that the supplied mbuf is a valid, non-free mbuf.
  *
  * XXX: Broken at the moment.  Need some UMA magic to make it work again.
  * The flags are masked with 0, so the asserted condition is always true
  * and the check never fires.
  *
  * The argument is parenthesized before the cast so that an expression
  * argument (e.g. pointer arithmetic) is cast as a whole.
  */
 #define	M_ASSERTVALID(m)						\
 	KASSERT((((struct mbuf *)(m))->m_flags & 0) == 0,		\
 	    ("%s: attempted use of a free mbuf!", __func__))
 
 /*
  * Return the address of the start of the buffer associated with an mbuf,
  * handling external storage, packet-header mbufs, and regular data mbufs.
  *
  * M_NOMAP is tested first: for such mbufs the result is NULL.  Otherwise
  * the result is ext_buf for M_EXT, m_pktdat for M_PKTHDR, and m_dat for
  * everything else.
  *
  * NB: m appears several times in the expansion, so an argument with side
  * effects may be evaluated more than once.
  */
 #define	M_START(m)							\
 	(((m)->m_flags & M_NOMAP) ? NULL :				\
 	 ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
 	 ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :		\
 	 &(m)->m_dat[0])
 
 /*
  * Return the size of the buffer associated with an mbuf, handling external
  * storage, packet-header mbufs, and regular data mbufs.
  *
  * The result is ext_size for M_EXT, MHLEN for M_PKTHDR, and MLEN for
  * everything else.  Unlike M_START(), M_NOMAP is not tested here.
  *
  * NB: m appears more than once in the expansion, so an argument with side
  * effects may be evaluated more than once.
  */
 #define	M_SIZE(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
 	 ((m)->m_flags & M_PKTHDR) ? MHLEN :				\
 	 MLEN)
 
 /*
  * Position m_data in a freshly allocated mbuf so that an object of 'len'
  * bytes sits at the end of the buffer, rounded down to a longword
  * boundary.
  *
  * NB: There used to be three separate macros for this -- M_ALIGN(),
  * MH_ALIGN(), and MEXT_ALIGN() -- each asserting that it was invoked on
  * the right kind of storage, which forced callers to test the storage
  * type and pick the right one.  That layout knowledge is centralized here
  * instead, via M_START() and M_SIZE().
  *
  * The mbuf must be untouched: m_data still has to equal M_START(m).
  */
 static __inline void
 m_align(struct mbuf *m, int len)
 {
 	int adjust;
 
 	KASSERT(m->m_data == M_START(m),
 	    ("%s: not a virgin mbuf", __func__));
 
 	/* Spare room behind the object, then round down to sizeof(long). */
 	adjust = M_SIZE(m) - len;
 	m->m_data += adjust & ~(sizeof(long) - 1);
 }
 
 #define	M_ALIGN(m, len)		m_align(m, len)
 #define	MH_ALIGN(m, len)	m_align(m, len)
 #define	MEXT_ALIGN(m, len)	m_align(m, len)
 
 /*
  * Compute the amount of space available before the current start of data in
  * an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  *
  * Evaluates to 0 whenever M_WRITABLE(m) is false; otherwise to the
  * distance from M_START(m) to m_data.  m may be evaluated more than once.
  */
 #define	M_LEADINGSPACE(m)						\
 	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
  * Compute the amount of space available after the end of data in an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  *
  * Evaluates to 0 whenever M_WRITABLE(m) is false; otherwise to the
  * distance from the end of the data (m_data + m_len) to the end of the
  * buffer (M_START(m) + M_SIZE(m)).  m may be evaluated more than once.
  */
 #define	M_TRAILINGSPACE(m)						\
 	(M_WRITABLE(m) ?						\
 	    ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
 
 /*
  * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
  * allocated, how specifies whether to wait.  If the allocation fails, the
  * original mbuf chain is freed and m is set to NULL.
  *
  * m must be an lvalue: its address is taken and the resulting chain head
  * is stored back through it.  plen and how are each captured in a local
  * exactly once, and only the captured copies are used afterwards, so
  * arguments with side effects are safe.
  *
  * When the first mbuf has enough leading space the data pointer is simply
  * moved back; otherwise m_prepend() is called.  If the resulting head has
  * M_PKTHDR set, its m_pkthdr.len is increased by plen.
  */
 #define	M_PREPEND(m, plen, how) do {					\
 	struct mbuf **_mmp = &(m);					\
 	struct mbuf *_mm = *_mmp;					\
 	int _mplen = (plen);						\
 	int __mhow = (how);						\
 									\
 	MBUF_CHECKSLEEP(__mhow);					\
 	if (M_LEADINGSPACE(_mm) >= _mplen) {				\
 		_mm->m_data -= _mplen;					\
 		_mm->m_len += _mplen;					\
 	} else								\
 		_mm = m_prepend(_mm, _mplen, __mhow);			\
 	if (_mm != NULL && (_mm->m_flags & M_PKTHDR) != 0)		\
 		_mm->m_pkthdr.len += _mplen;				\
 	*_mmp = _mm;							\
 } while (0)
 
 /*
  * Change mbuf to new type.  This is a relatively expensive operation and
  * should be avoided.
  */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
 /*
  * Return the rcvif stored in a packet header, or NULL when the header's
  * csum_flags has CSUM_SND_TAG set.  Asserts that m has a packet header.
  */
 static __inline struct ifnet *
 m_rcvif(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return ((m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0 ?
 	    NULL : m->m_pkthdr.rcvif);
 }
 
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
 
 extern int		max_datalen;	/* MHLEN - max_hdr */
 extern int		max_hdr;	/* Largest link + protocol header */
 extern int		max_linkhdr;	/* Largest link-level header */
 extern int		max_protohdr;	/* Largest protocol header */
 extern int		nmbclusters;	/* Maximum number of clusters */
 extern bool		mb_use_ext_pgs;	/* Use ext_pgs for sendfile */
 
 /*-
  * Network packets may have annotations attached by affixing a list of
  * "packet tags" to the pkthdr structure.  Packet tags are dynamically
  * allocated semi-opaque data structures that have a fixed header
  * (struct m_tag) that specifies the size of the memory block and a
  * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
  * unsigned value used to identify a module or ABI.  By convention this value
  * is chosen as the date+time that the module is created, expressed as the
  * number of seconds since the epoch (e.g., using date -u +'%s').  The type
  * value is an ABI/module-specific value that identifies a particular
  * annotation and is private to the module.  For compatibility with systems
  * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
  * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find
  * compatibility shim functions and several tag types are defined below.
  * Users that do not require compatibility should use a private cookie value
  * so that packet tag-related definitions can be maintained privately.
  *
  * Note that the packet tag returned by m_tag_alloc has the default memory
  * alignment implemented by malloc.  To reference private data one can use a
  * construct like:
  *
  *	struct m_tag *mtag = m_tag_alloc(...);
  *	struct foo *p = (struct foo *)(mtag+1);
  *
  * if the alignment of struct m_tag is sufficient for referencing members of
  * struct foo.  Otherwise it is necessary to embed struct m_tag within the
  * private data structure to ensure proper alignment; e.g.,
  *
  *	struct foo {
  *		struct m_tag	tag;
  *		...
  *	};
  *	struct foo *p = (struct foo *) m_tag_alloc(...);
  *	struct m_tag *mtag = &p->tag;
  */
 
 /*
  * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
  * tags are expected to ``vanish'' when they pass through a network
  * interface.  For most interfaces this happens normally as the tags are
  * reclaimed when the mbuf is free'd.  However in some special cases
  * reclaiming must be done manually.  An example is packets that pass through
  * the loopback interface.  Also, one must be careful to do this when
  * ``turning around'' packets (e.g., icmp_reflect).
  *
  * To mark a tag persistent bit-or this flag in when defining the tag id.
  * The tag will then be treated as described above.
  */
 #define	MTAG_PERSISTENT				0x800
 
 #define	PACKET_TAG_NONE				0  /* Nadda */
 
 /* Packet tags for use with PACKET_ABI_COMPAT. */
 #define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
 #define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
 #define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
 #define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
 #define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
 #define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
 #define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
 #define	PACKET_TAG_GIF				8  /* GIF processing done */
 #define	PACKET_TAG_GRE				9  /* GRE processing done */
 #define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
 #define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
 #define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
 #define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
 #define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
 #define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
 #define	PACKET_TAG_DIVERT			17 /* divert info */
 #define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
 #define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
 #define	PACKET_TAG_PF		(21 | MTAG_PERSISTENT) /* PF/ALTQ information */
 #define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
 #define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
 #define	PACKET_TAG_CARP				28 /* CARP info */
 #define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
 #define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
 
 /* Specific cookies and tags. */
 
 /* Packet tag routines. */
 struct m_tag	*m_tag_alloc(u_int32_t, int, int, int);
 void		 m_tag_delete(struct mbuf *, struct m_tag *);
 void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
 void		 m_tag_free_default(struct m_tag *);
 struct m_tag	*m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *);
 struct m_tag	*m_tag_copy(struct m_tag *, int);
 int		 m_tag_copy_chain(struct mbuf *, const struct mbuf *, int);
 void		 m_tag_delete_nonpersistent(struct mbuf *);
 
 /*
  * Reset an mbuf's packet-header tag list to the empty list.
  */
 static __inline void
 m_tag_init(struct mbuf *m)
 {
 
 	SLIST_INIT(&(m->m_pkthdr.tags));
 }
 
 /*
  * Fill in the cookie, id (type) and length of a tag.  The free method is
  * deliberately left alone; setting it is the caller's job.
  *
  * XXX m_tag_init would have been the natural name, but it was already in
  * use.
  */
 static __inline void
 m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len)
 {
 
 	t->m_tag_cookie = cookie;
 	t->m_tag_id = type;
 	t->m_tag_len = len;
 }
 
 /*
  * Release a tag by invoking the free method stored in the tag itself.
  */
 static __inline void
 m_tag_free(struct m_tag *t)
 {
 
 	t->m_tag_free(t);
 }
 
 /*
  * Return the head of an mbuf's packet-header tag list.
  */
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
 	struct m_tag *head;
 
 	head = SLIST_FIRST(&m->m_pkthdr.tags);
 	return (head);
 }
 
 /*
  * Return the tag that follows t on its list.  The mbuf argument is not
  * used.
  */
 static __inline struct m_tag *
 m_tag_next(struct mbuf *m __unused, struct m_tag *t)
 {
 	struct m_tag *succ;
 
 	succ = SLIST_NEXT(t, m_tag_link);
 	return (succ);
 }
 
 /*
  * Insert tag t at the head of an mbuf's packet-header tag list.
  */
 static __inline void
 m_tag_prepend(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_INSERT_HEAD(&(m->m_pkthdr.tags), t, m_tag_link);
 }
 
 /*
  * Remove tag t from an mbuf's packet-header tag list.  The tag itself is
  * only unlinked here, not freed.
  */
 static __inline void
 m_tag_unlink(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_REMOVE(&(m->m_pkthdr.tags), t, m_tag, m_tag_link);
 }
 
 /* These are for OpenBSD compatibility. */
 #define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
 
 /*
  * Compatibility wrapper: m_tag_alloc() with the MTAG_ABI_COMPAT cookie.
  */
 static __inline struct m_tag *
 m_tag_get(int type, int length, int wait)
 {
 	struct m_tag *t;
 
 	t = m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
 	return (t);
 }
 
 /*
  * Compatibility wrapper: m_tag_locate() with the MTAG_ABI_COMPAT cookie.
  * An empty tag list short-circuits to NULL without calling m_tag_locate().
  */
 static __inline struct m_tag *
 m_tag_find(struct mbuf *m, int type, struct m_tag *start)
 {
 
 	if (SLIST_EMPTY(&m->m_pkthdr.tags))
 		return ((struct m_tag *)NULL);
 	return (m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
 /*
  * Acquire a reference on a send tag and hand the same pointer back.
  */
 static inline struct m_snd_tag *
 m_snd_tag_ref(struct m_snd_tag *mst)
 {
 
 	refcount_acquire(&(mst->refcount));
 	return (mst);
 }
 
 /*
  * Release a reference on a send tag; when refcount_release() reports
  * true the tag is passed to m_snd_tag_destroy().
  */
 static inline void
 m_snd_tag_rele(struct m_snd_tag *mst)
 {
 
 	if (!refcount_release(&mst->refcount))
 		return;
 	m_snd_tag_destroy(mst);
 }
 
 /*
  * Free a single mbuf and return the one that followed it on the m_next
  * chain (read before anything is released).
  */
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
 	struct mbuf *next;
 
 	next = m->m_next;
 	MBUF_PROBE1(m__free, m);
 
 	/* Tags are dropped here only when both M_PKTHDR and M_NOFREE are set. */
 	if ((m->m_flags & (M_PKTHDR | M_NOFREE)) == (M_PKTHDR | M_NOFREE))
 		m_tag_delete_chain(m, NULL);
 
 	/* Drop the send tag reference held by the packet header, if any. */
 	if ((m->m_flags & M_PKTHDR) != 0 &&
 	    (m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0)
 		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 
 	if ((m->m_flags & M_EXT) != 0) {
 		mb_free_ext(m);
 	} else if ((m->m_flags & M_NOFREE) == 0) {
 		uma_zfree(zone_mbuf, m);
 	}
 
 	return (next);
 }
 
 /*
  * Return the fibnum stored in an mbuf's packet header.  Asserts that the
  * mbuf has M_PKTHDR set.
  */
 static __inline int
 rt_m_getfib(struct mbuf *m)
 {
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0,
 	    ("Attempt to get FIB from non header mbuf."));
 	return (m->m_pkthdr.fibnum);
 }
 
 #define M_GETFIB(_m)   rt_m_getfib(_m)
 
 /*
  * Store _fib in the packet header's fibnum.  Asserts that the mbuf has
  * M_PKTHDR set.
  */
 #define M_SETFIB(_m, _fib) do {						\
 	KASSERT(((_m)->m_flags & M_PKTHDR) != 0,			\
 	    ("Attempt to set FIB on non header mbuf."));		\
 	(_m)->m_pkthdr.fibnum = (_fib);					\
 } while (0)
 
 /* flags passed as first argument for "m_ether_tcpip_hash()" */
 #define	MBUF_HASHFLAG_L2	(1 << 2)
 #define	MBUF_HASHFLAG_L3	(1 << 3)
 #define	MBUF_HASHFLAG_L4	(1 << 4)
 
 /* mbuf hashing helper routines */
 uint32_t	m_ether_tcpip_hash_init(void);
 uint32_t	m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t);
 
 #ifdef MBUF_PROFILING
  void m_profile(struct mbuf *m);
  #define M_PROFILE(m) m_profile(m)
 #else
  #define M_PROFILE(m)
 #endif
 
 /*
  * A queue of mbufs linked through m_stailqpkt, with a running length and
  * an optional upper bound on that length.
  */
 struct mbufq {
 	STAILQ_HEAD(, mbuf)	mq_head;	/* head of the queued mbufs */
 	int			mq_len;		/* number of mbufs queued */
 	int			mq_maxlen;	/* limit used by mbufq_full();
 						   <= 0 means never full */
 };
 
 /*
  * Initialize an empty queue with the given maximum length.
  */
 static inline void
 mbufq_init(struct mbufq *mq, int maxlen)
 {
 
 	mq->mq_len = 0;
 	mq->mq_maxlen = maxlen;
 	STAILQ_INIT(&mq->mq_head);
 }
 
 /*
  * Empty the queue without freeing anything: the former first mbuf is
  * returned (NULL if the queue was empty) and the queue is reset to
  * length zero.
  */
 static inline struct mbuf *
 mbufq_flush(struct mbufq *mq)
 {
 	struct mbuf *first;
 
 	first = STAILQ_FIRST(&mq->mq_head);
 
 	mq->mq_len = 0;
 	STAILQ_INIT(&mq->mq_head);
 
 	return (first);
 }
 
 /*
  * Flush the queue and pass every mbuf that was on it to m_freem().
  */
 static inline void
 mbufq_drain(struct mbufq *mq)
 {
 	struct mbuf *cur, *next;
 
 	for (cur = mbufq_flush(mq); cur != NULL; cur = next) {
 		/* Fetch the successor before cur is freed. */
 		next = STAILQ_NEXT(cur, m_stailqpkt);
 		m_freem(cur);
 	}
 }
 
 /*
  * Return the first mbuf on the queue without removing it.
  */
 static inline struct mbuf *
 mbufq_first(const struct mbufq *mq)
 {
 	struct mbuf *first;
 
 	first = STAILQ_FIRST(&mq->mq_head);
 	return (first);
 }
 
 /*
  * Return the last mbuf on the queue without removing it.
  */
 static inline struct mbuf *
 mbufq_last(const struct mbufq *mq)
 {
 	struct mbuf *last;
 
 	last = STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt);
 	return (last);
 }
 
 /*
  * Return non-zero when the queue has reached its maximum length.  A
  * maximum of zero or less means the queue is never reported full.
  */
 static inline int
 mbufq_full(const struct mbufq *mq)
 {
 
 	if (mq->mq_maxlen <= 0)
 		return (0);
 	return (mq->mq_len >= mq->mq_maxlen);
 }
 
 /*
  * Return the queue's current length counter.
  */
 static inline int
 mbufq_len(const struct mbufq *mq)
 {
 	int len;
 
 	len = mq->mq_len;
 	return (len);
 }
 
 /*
  * Append m to the tail of the queue.  Returns 0 on success, or ENOBUFS
  * (leaving the queue untouched) when mbufq_full() reports the queue full.
  */
 static inline int
 mbufq_enqueue(struct mbufq *mq, struct mbuf *m)
 {
 
 	if (!mbufq_full(mq)) {
 		STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt);
 		mq->mq_len++;
 		return (0);
 	}
 	return (ENOBUFS);
 }
 
 /*
  * Remove and return the first mbuf on the queue, or NULL if the queue is
  * empty.  The returned mbuf has its m_nextpkt cleared.
  */
 static inline struct mbuf *
 mbufq_dequeue(struct mbufq *mq)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mq->mq_head);
 	if (head == NULL)
 		return (NULL);
 
 	STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt);
 	head->m_nextpkt = NULL;
 	mq->mq_len--;
 	return (head);
 }
 
 /*
  * Insert m at the head of the queue.  The maximum length is not checked
  * here.
  */
 static inline void
 mbufq_prepend(struct mbufq *mq, struct mbuf *m)
 {
 
 	mq->mq_len++;
 	STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt);
 }
 
 /*
  * Move everything queued on mq_src to the tail of mq_dst and transfer the
  * length count along with it.
  *
  * Note: the maximum length of the destination queue is not enforced.
  */
 static inline void
 mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src)
 {
 
 	STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head);
 	mq_dst->mq_len += mq_src->mq_len;
 	mq_src->mq_len = 0;
 }
 
 #ifdef _SYS_TIMESPEC_H_
 /*
  * Split the packet header's rcv_tstmp into a timespec: the quotient by
  * 1000000000 goes to tv_sec and the remainder to tv_nsec.  Asserts that
  * the mbuf has M_PKTHDR and at least one of M_TSTMP / M_TSTMP_LRO set.
  */
 static inline void
 mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)
 {
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m));
 	KASSERT((m->m_flags & (M_TSTMP | M_TSTMP_LRO)) != 0,
 	    ("mbuf %p no M_TSTMP or M_TSTMP_LRO", m));
 
 	ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000;
 	ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
 }
 #endif
 
 #ifdef DEBUGNET
 /* Invoked from the debugnet client code. */
 void	debugnet_mbuf_drain(void);
 void	debugnet_mbuf_start(void);
 void	debugnet_mbuf_finish(void);
 void	debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize);
 #endif
 
 /*
  * Report whether an mbuf carries a TLS session pointer.  Only M_NOMAP
  * mbufs are examined; every other mbuf yields false.  For an M_NOMAP
  * mbuf the result is true exactly when its TLS pointer is non-NULL.
  */
 static inline bool
 mbuf_has_tls_session(struct mbuf *m)
 {
 
 	if (m->m_flags & M_NOMAP) {
 		MBUF_EXT_PGS_ASSERT(m);
-		if (m->m_ext_pgs.tls != NULL) {
+		if (m->m_epg_tls != NULL) {
 			return (true);
 		}
 	}
 	return (false);
 }
 
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */