Index: head/share/man/man9/sglist.9
===================================================================
--- head/share/man/man9/sglist.9	(revision 360573)
+++ head/share/man/man9/sglist.9	(revision 360574)
@@ -1,621 +1,604 @@
 .\"
 .\" Copyright (c) 2009 Hudson River Trading LLC
 .\" Written by: John H. Baldwin <jhb@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 28, 2019
+.Dd April 24, 2020
 .Dt SGLIST 9
 .Os
 .Sh NAME
 .Nm sglist ,
 .Nm sglist_alloc ,
 .Nm sglist_append ,
 .Nm sglist_append_bio ,
-.Nm sglist_append_ext_pgs,
-.Nm sglist_append_mb_ext_pgs,
 .Nm sglist_append_mbuf ,
+.Nm sglist_append_mbuf_epg ,
 .Nm sglist_append_phys ,
 .Nm sglist_append_sglist ,
 .Nm sglist_append_uio ,
 .Nm sglist_append_user ,
 .Nm sglist_append_vmpages ,
 .Nm sglist_build ,
 .Nm sglist_clone ,
 .Nm sglist_consume_uio ,
 .Nm sglist_count ,
-.Nm sglist_count_ext_pgs ,
-.Nm sglist_count_mb_ext_pgs ,
+.Nm sglist_count_mbuf_epg ,
 .Nm sglist_count_vmpages ,
 .Nm sglist_free ,
 .Nm sglist_hold ,
 .Nm sglist_init ,
 .Nm sglist_join ,
 .Nm sglist_length ,
 .Nm sglist_reset ,
 .Nm sglist_slice ,
 .Nm sglist_split
 .Nd manage a scatter/gather list of physical memory addresses
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/sglist.h
 .Ft struct sglist *
 .Fn sglist_alloc "int nsegs" "int mflags"
 .Ft int
 .Fn sglist_append "struct sglist *sg" "void *buf" "size_t len"
 .Ft int
 .Fn sglist_append_bio "struct sglist *sg" "struct bio *bp"
 .Ft int
-.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Fn sglist_append_mbuf_epg "struct sglist *sg" "struct mbuf *m" "size_t offset" "size_t len"
 .Ft int
-.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m"
-.Ft int
 .Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m"
 .Ft int
 .Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len"
 .Ft int
 .Fn sglist_append_sglist "struct sglist *sg" "struct sglist *source" "size_t offset" "size_t len"
 .Ft int
 .Fn sglist_append_uio "struct sglist *sg" "struct uio *uio"
 .Ft int
 .Fn sglist_append_user "struct sglist *sg" "void *buf" "size_t len" "struct thread *td"
 .Ft int
 .Fn sglist_append_vmpages "struct sglist *sg" "vm_page_t *m" "size_t pgoff" "size_t len"
 .Ft struct sglist *
 .Fn sglist_build "void *buf" "size_t len" "int mflags"
 .Ft struct sglist *
 .Fn sglist_clone "struct sglist *sg" "int mflags"
 .Ft int
 .Fn sglist_consume_uio "struct sglist *sg" "struct uio *uio" "size_t resid"
 .Ft int
 .Fn sglist_count "void *buf" "size_t len"
 .Ft int
-.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Fn sglist_count_mbuf_epg "struct mbuf *m" "size_t offset" "size_t len"
 .Ft int
-.Fn sglist_count_mb_ext_pgs "struct mbuf *m"
-.Ft int
 .Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len"
 .Ft void
 .Fn sglist_free "struct sglist *sg"
 .Ft struct sglist *
 .Fn sglist_hold "struct sglist *sg"
 .Ft void
 .Fn sglist_init "struct sglist *sg" "int maxsegs" "struct sglist_seg *segs"
 .Ft int
 .Fn sglist_join "struct sglist *first" "struct sglist *second"
 .Ft size_t
 .Fn sglist_length "struct sglist *sg"
 .Ft void
 .Fn sglist_reset "struct sglist *sg"
 .Ft int
 .Fn sglist_slice "struct sglist *original" "struct sglist **slice" "size_t offset" "size_t length" "int mflags"
 .Ft int
 .Fn sglist_split "struct sglist *original" "struct sglist **head" "size_t length" "int mflags"
 .Sh DESCRIPTION
 The
 .Nm
 API manages physical address ranges.
 Each list contains one or more elements.
 Each element contains a starting physical address and a length.
 Scatter/gather lists are read-only while they are shared.
 If one wishes to alter an existing scatter/gather list and does not hold the
 sole reference to the list,
 then one should create a new list instead of modifying the existing list.
 .Pp
 Each scatter/gather list object contains a reference count.
 New lists are created with a single reference.
 New references are obtained by calling
 .Nm sglist_hold
 and are released by calling
 .Nm sglist_free .
 .Ss Allocating and Initializing Lists
 Each
 .Nm
 object consists of a header structure and a variable-length array of
 scatter/gather list elements.
 The
 .Nm sglist_alloc
 function allocates a new list that contains a header and
 .Fa nsegs
 scatter/gather list elements.
 The
 .Fa mflags
 argument can be set to either
 .Dv M_NOWAIT
 or
 .Dv M_WAITOK .
 .Pp
 The
 .Nm sglist_count
 function returns the number of scatter/gather list elements needed to describe
 the physical address ranges mapped by a single kernel virtual address range.
 The kernel virtual address range starts at
 .Fa buf
 and is
 .Fa len
 bytes long.
 .Pp
 The
-.Nm sglist_count_ext_pgs
+.Nm sglist_count_mbuf_epg
 function returns the number of scatter/gather list elements needed to describe
-the unmapped external mbuf buffer
-.Fa ext_pgs .
+the external multipage mbuf buffer
+.Fa m .
 The range starts at an offset of
 .Fa offset
 relative to the start of the buffer and is
 .Fa len
 bytes long.
-The
-.Nm sglist_count_mb_ext_pgs
-function returns the number of scatter/gather list elements needed to describe
-the physical address ranges of a single unmapped mbuf
-.Fa m .
 .Pp
 The
 .Nm sglist_count_vmpages
 function returns the number of scatter/gather list elements needed to describe
 the physical address ranges of a buffer backed by an array of virtual memory
 pages
 .Fa m .
 The buffer starts at an offset of
 .Fa pgoff
 bytes relative to the first page and is
 .Fa len
 bytes long.
 .Pp
 The
 .Nm sglist_build
 function allocates a new scatter/gather list object that describes the physical
 address ranges mapped by a single kernel virtual address range.
 The kernel virtual address range starts at
 .Fa buf
 and is
 .Fa len
 bytes long.
 The
 .Fa mflags
 argument can be set to either
 .Dv M_NOWAIT
 or
 .Dv M_WAITOK .
 .Pp
 The
 .Nm sglist_clone
 function returns a copy of an existing scatter/gather list object
 .Fa sg .
 The
 .Fa mflags
 argument can be set to either
 .Dv M_NOWAIT
 or
 .Dv M_WAITOK .
 This can be used to obtain a private copy of a scatter/gather list before
 modifying it.
 .Pp
 The
 .Nm sglist_init
 function initializes a scatter/gather list header.
 The header is pointed to by
 .Fa sg
 and is initialized to manage an array of
 .Fa maxsegs
 scatter/gather list elements pointed to by
 .Fa segs .
 This can be used to initialize a scatter/gather list header whose storage
 is not provided by
 .Nm sglist_alloc .
 In that case, the caller should not call
 .Nm sglist_free
 to release its own reference and is responsible for ensuring all other
 references to the list are dropped before it releases the storage for
 .Fa sg
 and
 .Fa segs .
 .Ss Constructing Scatter/Gather Lists
 The
 .Nm
 API provides several routines for building a scatter/gather list to describe
 one or more objects.
 Specifically, the
 .Nm sglist_append
 family of routines can be used to append the physical address ranges described
 by an object to the end of a scatter/gather list.
 All of these routines return 0 on success or an error on failure.
 If a request to append an address range to a scatter/gather list fails,
 the scatter/gather list will remain unchanged.
 .Pp
 The
 .Nm sglist_append
 function appends the physical address ranges described by a single kernel
 virtual address range to the scatter/gather list
 .Fa sg .
 The kernel virtual address range starts at
 .Fa buf
 and is
 .Fa len
 bytes long.
 .Pp
 The
 .Nm sglist_append_bio
 function appends the physical address ranges described by a single bio
 .Fa bp
 to the scatter/gather list
 .Fa sg .
 .Pp
 The
-.Nm sglist_append_ext_pgs
-function appends the physical address ranges described by the unmapped
-external mbuf buffer
+.Nm sglist_append_mbuf_epg
+function appends the physical address ranges described by the
+external multipage
+.Xr mbuf 9
+buffer
 .Fa m
 to the scatter/gather list
 .Fa sg .
 The physical address ranges start at offset
 .Fa offset
 within
 .Fa m
 and continue for
 .Fa len
 bytes.
-.Pp
-The
-.Nm sglist_append_mb_ext_pgs
-function appends the physical address ranges described by the unmapped
-mbuf
-.Fa m
-to the scatter/gather list
-.Fa sg .
 Note that unlike
 .Nm sglist_append_mbuf ,
-.Nm sglist_append_mb_ext_pgs
+.Nm sglist_append_mbuf_epg
 only adds ranges for a single mbuf,
 not an entire mbuf chain.
 .Pp
 The
 .Nm sglist_append_mbuf
 function appends the physical address ranges described by an entire mbuf
 chain
 .Fa m
 to the scatter/gather list
 .Fa sg .
 .Pp
 The
 .Nm sglist_append_phys
 function appends a single physical address range to the scatter/gather list
 .Fa sg .
 The physical address range starts at
 .Fa paddr
 and is
 .Fa len
 bytes long.
 .Pp
 The
 .Nm sglist_append_sglist
 function appends physical address ranges described by the scatter/gather list
 .Fa source
 to the scatter/gather list
 .Fa sg .
 The physical address ranges start at offset
 .Fa offset
 within
 .Fa source
 and continue for
 .Fa len
 bytes.
 .Pp
 The
 .Nm sglist_append_uio
 function appends the physical address ranges described by a
 .Xr uio 9
 object to the scatter/gather list
 .Fa sg .
 Note that it is the caller's responsibility to ensure that the pages backing
 the I/O request are wired for the lifetime of
 .Fa sg .
 Note also that this routine does not modify
 .Fa uio .
 .Pp
 The
 .Nm sglist_append_user
 function appends the physical address ranges described by a single user
 virtual address range to the scatter/gather list
 .Fa sg .
 The user virtual address range is relative to the address space of the thread
 .Fa td .
 It starts at
 .Fa buf
 and is
 .Fa len
 bytes long.
 Note that it is the caller's responsibility to ensure that the pages backing
 the user buffer are wired for the lifetime of
 .Fa sg .
 .Pp
 The
 .Nm sglist_append_vmpages
 function appends the physical address ranges of a buffer backed by an array
 of virtual memory pages
 .Fa m .
 The buffer starts at an offset of
 .Fa pgoff
 bytes relative to the first page and is
 .Fa len
 bytes long.
 .Pp
 The
 .Nm sglist_consume_uio
 function is a variation of
 .Nm sglist_append_uio .
 As with
 .Nm sglist_append_uio ,
 it appends the physical address ranges described by
 .Fa uio
 to the scatter/gather list
 .Fa sg .
 Unlike
 .Nm sglist_append_uio ,
 however,
 .Nm sglist_consume_uio
 modifies the I/O request to indicate that the appended address ranges have
 been processed similar to calling
 .Xr uiomove 9 .
 This routine will only append ranges that describe up to
 .Fa resid
 total bytes in length.
 If the available segments in the scatter/gather list are exhausted before
 .Fa resid
 bytes are processed,
 then the
 .Fa uio
 structure will be updated to reflect the actual number of bytes processed,
 and
 .Nm sglist_consume_uio
 will return zero to indicate success.
 In effect, this function will perform partial reads or writes.
 The caller can compare the
 .Fa uio_resid
 member of
 .Fa uio
 before and after calling
 .Nm sglist_consume_uio
 to determine the actual number of bytes processed.
 .Ss Manipulating Scatter/Gather Lists
 The
 .Nm sglist_join
 function appends physical address ranges from the scatter/gather list
 .Fa second
 onto
 .Fa first
 and then resets
 .Fa second
 to an empty list.
 It returns zero on success or an error on failure.
 .Pp
 The
 .Nm sglist_split
 function splits an existing scatter/gather list into two lists.
 The first
 .Fa length
 bytes described by the list
 .Fa original
 are moved to a new list
 .Fa *head .
 If
 .Fa original
 describes a total address range that is smaller than
 .Fa length
 bytes,
 then all of the address ranges will be moved to the new list at
 .Fa *head
 and
 .Fa original
 will be an empty list.
 The caller may supply an existing scatter/gather list in
 .Fa *head .
 If so, the list must be empty.
 Otherwise, the caller may set
 .Fa *head
 to
 .Dv NULL
 in which case a new scatter/gather list will be allocated.
 In that case,
 .Fa mflags
 may be set to either
 .Dv M_NOWAIT
 or
 .Dv M_WAITOK .
 Note that since the
 .Fa original
 list is modified by this call, it must be a private list with no other
 references.
 The
 .Nm sglist_split
 function returns zero on success or an error on failure.
 .Pp
 The
 .Nm sglist_slice
 function generates a new scatter/gather list from a sub-range of an existing
 scatter/gather list
 .Fa original .
 The sub-range to extract is specified by the
 .Fa offset
 and
 .Fa length
 parameters.
 The new scatter/gather list is stored in
 .Fa *slice .
 As with
 .Fa head
 for
 .Nm sglist_split ,
 the caller may either provide an empty scatter/gather list,
 or it may set
 .Fa *slice
 to
 .Dv NULL
 in which case
 .Nm sglist_slice
 will allocate a new list subject to
 .Fa mflags .
 Unlike
 .Nm sglist_split ,
 .Nm sglist_slice
 does not modify
 .Fa original
 and does not require it to be a private list.
 The
 .Nm sglist_slice
 function returns zero on success or an error on failure.
 .Ss Miscellaneous Routines
 The
 .Nm sglist_reset
 function clears the scatter/gather list
 .Fa sg
 so that it no longer maps any address ranges.
 This can allow reuse of a single scatter/gather list object for multiple
 requests.
 .Pp
 The
 .Nm sglist_length
 function returns the total length of the physical address ranges described
 by the scatter/gather list
 .Fa sg .
 .Sh RETURN VALUES
 The
 .Nm sglist_alloc ,
 .Nm sglist_build ,
 and
 .Nm sglist_clone
 functions return a new scatter/gather list on success or
 .Dv NULL
 on failure.
 .Pp
 The
 .Nm sglist_append
 family of functions and the
 .Nm sglist_consume_uio ,
 .Nm sglist_join ,
 .Nm sglist_slice ,
 and
 .Nm sglist_split
 functions return zero on success or an error on failure.
 .Pp
 The
 .Nm sglist_count
 family of
 functions return a count of scatter/gather list elements.
 .Pp
 The
 .Nm sglist_length
 function returns a count of address space described by a scatter/gather list
 in bytes.
 .Sh ERRORS
 The
 .Nm sglist_append
 functions return the following errors on failure:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The scatter/gather list has zero segments.
 .It Bq Er EFBIG
 There are not enough available segments in the scatter/gather list to append
 the specified physical address ranges.
 .El
 .Pp
 The
 .Nm sglist_consume_uio
 function returns the following error on failure:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The scatter/gather list has zero segments.
 .El
 .Pp
 The
 .Nm sglist_join
 function returns the following error on failure:
 .Bl -tag -width Er
 .It Bq Er EFBIG
 There are not enough available segments in the scatter/gather list
 .Fa first
 to append the physical address ranges from
 .Fa second .
 .El
 .Pp
 The
 .Nm sglist_slice
 function returns the following errors on failure:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The
 .Fa original
 scatter/gather list does not describe enough address space to cover the
 requested sub-range.
 .It Bq Er EINVAL
 The caller-supplied scatter/gather list in
 .Fa *slice
 is not empty.
 .It Bq Er ENOMEM
 An attempt to allocate a new scatter/gather list with
 .Dv M_NOWAIT
 set in
 .Fa mflags
 failed.
 .It Bq Er EFBIG
 There are not enough available segments in the caller-supplied scatter/gather
 list in
 .Fa *slice
 to describe the requested physical address ranges.
 .El
 .Pp
 The
 .Nm sglist_split
 function returns the following errors on failure:
 .Bl -tag -width Er
 .It Bq Er EDOOFUS
 The
 .Fa original
 scatter/gather list has more than one reference.
 .It Bq Er EINVAL
 The caller-supplied scatter/gather list in
 .Fa *head
 is not empty.
 .It Bq Er ENOMEM
 An attempt to allocate a new scatter/gather list with
 .Dv M_NOWAIT
 set in
 .Fa mflags
 failed.
 .It Bq Er EFBIG
 There are not enough available segments in the caller-supplied scatter/gather
 list in
 .Fa *head
 to describe the requested physical address ranges.
 .El
 .Sh SEE ALSO
 .Xr g_bio 9 ,
 .Xr malloc 9 ,
 .Xr mbuf 9 ,
 .Xr uio 9
 .Sh HISTORY
 This API was first introduced in
 .Fx 8.0 .
Index: head/sys/dev/cxgbe/crypto/t4_kern_tls.c
===================================================================
--- head/sys/dev/cxgbe/crypto/t4_kern_tls.c	(revision 360573)
+++ head/sys/dev/cxgbe/crypto/t4_kern_tls.c	(revision 360574)
@@ -1,2408 +1,2408 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018-2019 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/ktls.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp_var.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "t4_l2t.h"
 #include "t4_clip.h"
 #include "t4_mp_ring.h"
 #include "crypto/t4_crypto.h"
 
 #if defined(INET) || defined(INET6)
 
 #define SALT_SIZE		4
 
 #define GCM_TAG_SIZE			16
 #define TLS_HEADER_LENGTH		5
 
 #define	TLS_KEY_CONTEXT_SZ	roundup2(sizeof(struct tls_keyctx), 32)
 
 /*
  * Two big-endian 32-bit command words.  struct tlspcb keeps two instances
  * of this structure (scmd0 and scmd0_short).
  * NOTE(review): presumably these are the "SCMD" words placed in crypto
  * work requests -- confirm against the code that fills them in.
  */
 struct tls_scmd {
 	__be32 seqno_numivs;
 	__be32 ivgen_hdrlen;
 };
 
 /*
  * Packed layout of a key programming request.  As the member comments
  * indicate, it consists of a FW_ULPTX_WR header followed by a master
  * command and a sub-command header.  All multi-byte members are declared
  * big-endian.
  */
 struct tls_key_req {
 	/* FW_ULPTX_WR */
 	__be32 wr_hi;
 	__be32 wr_mid;
         __be32 ftid;
         __u8   reneg_to_write_rx;
         __u8   protocol;
         __be16 mfs;
 	/* master command */
 	__be32 cmd;
 	__be32 len16;             /* command length */
 	__be32 dlen;              /* data length in 32-byte units */
 	__be32 kaddr;
 	/* sub-command */
 	__be32 sc_more;
 	__be32 sc_len;
 }__packed;
 
 /*
  * Key context: a fixed header (struct tx_keyctx_hdr) followed by storage
  * for key material.  TLS_KEY_CONTEXT_SZ is the size of this structure
  * rounded up to a multiple of 32 bytes, and struct tlspcb embeds one
  * instance for use with inline keys.
  */
 struct tls_keyctx {
 	struct tx_keyctx_hdr {
 		__u8   ctxlen;
 		__u8   r2;
 		__be16 dualck_to_txvalid;
 		__u8   txsalt[4];
 		__be64 r5;
 	} txhdr;
         struct keys {
                 __u8   edkey[32];
                 __u8   ipad[64];
                 __u8   opad[64];
         } keys;
 };
 
 /*
  * Bit-field accessors for a key context word.  For each field FOO:
  *   S_FOO     bit position of the field's least significant bit
  *   M_FOO     mask of the field, not yet shifted into position
  *   V_FOO(x)  value x shifted into the field's position
  *   G_FOO(x)  field extracted from the word x
  *   F_FOO     the field set to 1 (defined for single-bit fields only)
  * NOTE(review): presumably these describe the dualck_to_txvalid member of
  * struct tx_keyctx_hdr -- confirm against the code that builds it.
  */
 #define S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 11
 #define M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 0x1
 #define V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT)
 #define G_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) & \
      M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT)
 #define F_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT \
     V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1U)
 
 #define S_TLS_KEYCTX_TX_WR_SALT_PRESENT 10
 #define M_TLS_KEYCTX_TX_WR_SALT_PRESENT 0x1
 #define V_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_SALT_PRESENT)
 #define G_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_SALT_PRESENT) & \
      M_TLS_KEYCTX_TX_WR_SALT_PRESENT)
 #define F_TLS_KEYCTX_TX_WR_SALT_PRESENT \
     V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1U)
 
 #define S_TLS_KEYCTX_TX_WR_TXCK_SIZE 6
 #define M_TLS_KEYCTX_TX_WR_TXCK_SIZE 0xf
 #define V_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXCK_SIZE)
 #define G_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXCK_SIZE) & \
      M_TLS_KEYCTX_TX_WR_TXCK_SIZE)
 
 #define S_TLS_KEYCTX_TX_WR_TXMK_SIZE 2
 #define M_TLS_KEYCTX_TX_WR_TXMK_SIZE 0xf
 #define V_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXMK_SIZE)
 #define G_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXMK_SIZE) & \
      M_TLS_KEYCTX_TX_WR_TXMK_SIZE)
 
 #define S_TLS_KEYCTX_TX_WR_TXVALID   0
 #define M_TLS_KEYCTX_TX_WR_TXVALID   0x1
 #define V_TLS_KEYCTX_TX_WR_TXVALID(x) \
     ((x) << S_TLS_KEYCTX_TX_WR_TXVALID)
 #define G_TLS_KEYCTX_TX_WR_TXVALID(x) \
     (((x) >> S_TLS_KEYCTX_TX_WR_TXVALID) & M_TLS_KEYCTX_TX_WR_TXVALID)
 #define F_TLS_KEYCTX_TX_WR_TXVALID   V_TLS_KEYCTX_TX_WR_TXVALID(1U)
 
 /* Key Context Programming Operation type */
 #define KEY_WRITE_RX			0x1
 #define KEY_WRITE_TX			0x2
 #define KEY_DELETE_RX			0x4
 #define KEY_DELETE_TX			0x8
 
 /*
  * Per-connection state for a TLS send tag.  The embedded cxgbe_snd_tag is
  * the first member; mst_to_tls() converts a send tag pointer back into a
  * pointer to this structure.  alloc_tlspcb() returns the structure zeroed
  * except for the back pointers it fills in and for tid and tx_key_addr,
  * which start out as -1.
  */
 struct tlspcb {
 	struct cxgbe_snd_tag com;
 	struct vi_info *vi;	/* virtual interface */
 	struct adapter *sc;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	int tid;		/* Connection identifier */
 
 	int tx_key_addr;
 	bool inline_key;
 	bool using_timestamps;
 	unsigned char enc_mode;
 
 	struct tls_scmd scmd0;
 	struct tls_scmd scmd0_short;
 
 	unsigned int tx_key_info_size;
 
 	uint32_t prev_seq;
 	uint32_t prev_ack;
 	uint32_t prev_tsecr;
 	uint16_t prev_win;
 	uint16_t prev_mss;
 
 	/* Only used outside of setup and teardown when using inline keys. */
 	struct tls_keyctx keyctx;
 
 	/* Fields only used during setup and teardown. */
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 
 	unsigned char auth_mode;
 	unsigned char hmac_ctrl;
 	unsigned char mac_first;
 	unsigned char iv_size;
 
 	unsigned int frag_size;
 	unsigned int cipher_secret_size;
 	int proto_ver;
 
 	bool open_pending;
 };
 
 static int ktls_setup_keys(struct tlspcb *tlsp,
     const struct ktls_session *tls, struct sge_txq *txq);
 
 /*
  * Convert a generic send tag pointer into the tlspcb it belongs to.  The
  * result of mst_to_cst() is cast directly to struct tlspcb *.
  * NOTE(review): this presumably relies on 'com' being the first member of
  * struct tlspcb -- confirm against the definition of mst_to_cst().
  */
 static inline struct tlspcb *
 mst_to_tls(struct m_snd_tag *t)
 {
 	return ((struct tlspcb *)mst_to_cst(t));
 }
 
 /* XXX: There are similar versions of these two in tom/t4_tls.c. */
 /*
  * Reserve TLS_KEY_CONTEXT_SZ bytes from the adapter's key_map arena without
  * sleeping.  Returns the address handed back by vmem_alloc(), converted to
  * int, or -1 if the allocation fails.
  */
 static int
 get_new_keyid(struct tlspcb *tlsp)
 {
 	vmem_addr_t keyaddr;
 	int rc;
 
 	rc = vmem_alloc(tlsp->sc->key_map, TLS_KEY_CONTEXT_SZ,
 	    M_NOWAIT | M_FIRSTFIT, &keyaddr);
 	return (rc == 0 ? (int)keyaddr : -1);
 }
 
 /*
  * Return the key context region at keyid to the adapter's key_map arena.
  * The size released is TLS_KEY_CONTEXT_SZ, the same size get_new_keyid()
  * requests.  A KTR trace record with the tid and key address is emitted
  * before the region is freed.
  */
 static void
 free_keyid(struct tlspcb *tlsp, int keyid)
 {
 
 	CTR3(KTR_CXGBE, "%s: tid %d key addr %#x", __func__, tlsp->tid, keyid);
 	vmem_free(tlsp->sc->key_map, keyid, TLS_KEY_CONTEXT_SZ);
 }
 
 /*
  * Allocate a zeroed tlspcb for the virtual interface vi and initialize its
  * embedded send tag as an IF_SND_TAG_TYPE_TLS tag on ifp.  'flags' is passed
  * to malloc() together with M_ZERO.  The control queue is the adapter's
  * ctrlq indexed by the port id; tid and tx_key_addr start out as -1.
  * Returns NULL if the allocation fails.
  */
 static struct tlspcb *
 alloc_tlspcb(struct ifnet *ifp, struct vi_info *vi, int flags)
 {
 	struct port_info *port;
 	struct adapter *adap;
 	struct tlspcb *new_tlsp;
 
 	port = vi->pi;
 	adap = port->adapter;
 
 	new_tlsp = malloc(sizeof(*new_tlsp), M_CXGBE, M_ZERO | flags);
 	if (new_tlsp != NULL) {
 		cxgbe_snd_tag_init(&new_tlsp->com, ifp, IF_SND_TAG_TYPE_TLS);
 		new_tlsp->vi = vi;
 		new_tlsp->sc = adap;
 		new_tlsp->ctrlq = &adap->sge.ctrlq[port->port_id];
 		new_tlsp->tid = -1;
 		new_tlsp->tx_key_addr = -1;
 	}
 
 	return (new_tlsp);
 }
 
 /*
  * Derive the per-connection crypto parameters stored in tlsp from the
  * session parameters in tls.
  *
  * The protocol version is TLS 1.1 when tls_vminor is TLS_MINOR_VER_ONE and
  * TLS 1.2 otherwise.  tx_key_info_size starts as the key context header plus
  * the cipher key and grows by GMAC_BLOCK_LEN for AES-GCM, or by twice the
  * MAC key size for the CBC/HMAC suites.  frag_size is taken from
  * max_frame_len.
  */
 static void
 init_ktls_key_params(struct tlspcb *tlsp, const struct ktls_session *tls)
 {
 	int mac_key_size;
 
 	if (tls->params.tls_vminor == TLS_MINOR_VER_ONE)
 		tlsp->proto_ver = SCMD_PROTO_VERSION_TLS_1_1;
 	else
 		tlsp->proto_ver = SCMD_PROTO_VERSION_TLS_1_2;
 	tlsp->cipher_secret_size = tls->params.cipher_key_len;
 	tlsp->tx_key_info_size = sizeof(struct tx_keyctx_hdr) +
 	    tlsp->cipher_secret_size;
 	if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) {
 		tlsp->auth_mode = SCMD_AUTH_MODE_GHASH;
 		tlsp->enc_mode = SCMD_CIPH_MODE_AES_GCM;
 		tlsp->iv_size = 4;
 		tlsp->mac_first = 0;
 		tlsp->hmac_ctrl = SCMD_HMAC_CTRL_NOP;
 		tlsp->tx_key_info_size += GMAC_BLOCK_LEN;
 	} else {
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			mac_key_size = roundup2(SHA1_HASH_LEN, 16);
 			tlsp->auth_mode = SCMD_AUTH_MODE_SHA1;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			mac_key_size = SHA2_256_HASH_LEN;
 			tlsp->auth_mode = SCMD_AUTH_MODE_SHA256;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			/* SHA-384 uses the SHA-512 sized key slot. */
 			mac_key_size = SHA2_512_HASH_LEN;
 			tlsp->auth_mode = SCMD_AUTH_MODE_SHA512_384;
 			break;
 		default:
 			/*
 			 * NOTE(review): presumably callers only pass the
 			 * algorithms handled above -- confirm.  Use zero
 			 * here so that mac_key_size is never read while
 			 * indeterminate; auth_mode is left untouched.
 			 */
 			mac_key_size = 0;
 			break;
 		}
 		tlsp->enc_mode = SCMD_CIPH_MODE_AES_CBC;
 		tlsp->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		tlsp->mac_first = 1;
 		tlsp->hmac_ctrl = SCMD_HMAC_CTRL_NO_TRUNC;
 		tlsp->tx_key_info_size += mac_key_size * 2;
 	}
 
 	tlsp->frag_size = tls->params.max_frame_len;
 }
 
 /*
  * Size in bytes of the active open request: the IPv6 variant
  * (cpl_t6_act_open_req6) when isipv6 is true, otherwise the IPv4 variant
  * (cpl_t6_act_open_req).
  */
 static int
 ktls_act_open_cpl_size(bool isipv6)
 {
 
 	return (isipv6 ? sizeof(struct cpl_t6_act_open_req6) :
 	    sizeof(struct cpl_t6_act_open_req));
 }
 
 /*
  * Fill in an IPv4 CPL_ACT_OPEN_REQ at dst for the connection described by
  * inp.
  *
  * The opcode/TID word combines the firmware queue's absolute id, atid and
  * the CPL_COOKIE_KERN_TLS cookie.  The 4-tuple is copied from inp.  opt0
  * requests TCAM bypass, ULP_MODE_NONE and F_NON_OFFLOAD, and carries the
  * source MAC index and tx channel of vi.  opt2 selects the tx queue for
  * that channel and enables timestamps when the tcpcb has TF_REQ_TSTMP set.
  *
  * The tlsp argument is not used here.
  */
 static void
 mk_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp,
     struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct cpl_t6_act_open_req *cpl6;
 	struct cpl_act_open_req *cpl;
 	uint64_t options;
 	int qid_atid;
 
 	/*
 	 * cpl6 and cpl alias the same buffer.
 	 * NOTE(review): presumably the T6 layout starts with the same
 	 * members as the base structure -- confirm.
 	 */
 	cpl6 = dst;
 	cpl = (struct cpl_act_open_req *)cpl6;
 	INIT_TP_WR(cpl6, 0);
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
 		qid_atid));
 	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
 	    &cpl->peer_ip, &cpl->peer_port);
 
 	options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	options |= F_NON_OFFLOAD;
 	cpl->opt0 = htobe64(options);
 
 	options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		options |= F_TSTAMPS_EN;
 	cpl->opt2 = htobe32(options);
 }
 
 /*
  * IPv6 counterpart of mk_ktls_act_open_req(): fill in a CPL_ACT_OPEN_REQ6
  * at dst for the connection described by inp.
  *
  * Ports are copied from inp as stored, and each 128-bit address is copied
  * as two 64-bit halves (bytes 0-7 and 8-15 of s6_addr).  opt0 and opt2 are
  * built exactly as in the IPv4 variant.
  *
  * The tlsp argument is not used here.
  */
 static void
 mk_ktls_act_open_req6(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct cpl_t6_act_open_req6 *cpl6;
 	struct cpl_act_open_req6 *cpl;
 	uint64_t options;
 	int qid_atid;
 
 	/* cpl6 and cpl alias the same buffer. */
 	cpl6 = dst;
 	cpl = (struct cpl_act_open_req6 *)cpl6;
 	INIT_TP_WR(cpl6, 0);
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
 		qid_atid));
 	cpl->local_port = inp->inp_lport;
 	cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 	cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 	cpl->peer_port = inp->inp_fport;
 	cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 	cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
 
 	options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	options |= F_NON_OFFLOAD;
 	cpl->opt0 = htobe64(options);
 
 	options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		options |= F_TSTAMPS_EN;
 	cpl->opt2 = htobe32(options);
 }
 
 /*
  * Build an active open request for inp and transmit it on tlsp->ctrlq.
  *
  * For IPv6 connections the local address is first looked up with
  * t4_hold_lip() and the result is stored in tlsp->ce.  The request is
  * then allocated with the size reported by ktls_act_open_cpl_size(),
  * filled in by the IPv4 or IPv6 builder, and handed to t4_wrq_tx().
  *
  * Returns 0 on success, ENOENT if t4_hold_lip() returns NULL, or ENOMEM
  * if the work request cannot be allocated.
  */
 static int
 send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid)
 {
 	struct wrqe *wr;
 	bool isipv6;
 
 	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	if (isipv6) {
 		tlsp->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
 		if (tlsp->ce == NULL)
 			return (ENOENT);
 	}
 
 	wr = alloc_wrqe(ktls_act_open_cpl_size(isipv6), tlsp->ctrlq);
 	if (wr == NULL) {
 		/*
 		 * NOTE(review): tlsp->ce stays set on this path; presumably
 		 * it is released when the tlspcb is torn down -- confirm.
 		 */
 		CTR2(KTR_CXGBE, "%s: atid %d failed to alloc WR", __func__,
 		    atid);
 		return (ENOMEM);
 	}
 
 	if (isipv6)
 		mk_ktls_act_open_req6(sc, vi, inp, tlsp, atid, wrtod(wr));
 	else
 		mk_ktls_act_open_req(sc, vi, inp, tlsp, atid, wrtod(wr));
 
 	/* Mark the open as pending before the request is handed off. */
 	tlsp->open_pending = true;
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 /*
  * Handle the reply to an active open request.
  *
  * The atid and status are extracted from the CPL that follows the RSS
  * header, the tlspcb is looked up by atid, and the atid is freed.  When
  * the status is 0 the tid from the reply is recorded in the tlspcb.
  * open_pending is then cleared and any thread sleeping on the tlspcb is
  * woken, both while holding the inpcb write lock.
  *
  * Always returns 0.  The mbuf argument is not used here.
  */
 static int
 ktls_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 	u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
 	struct tlspcb *tlsp = lookup_atid(sc, atid);
 	struct inpcb *inp = tlsp->inp;
 
 	CTR3(KTR_CXGBE, "%s: atid %d status %d", __func__, atid, status);
 	free_atid(sc, atid);
 	if (status == 0)
 		tlsp->tid = GET_TID(cpl);
 
 	INP_WLOCK(inp);
 	tlsp->open_pending = false;
 	wakeup(tlsp);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 _Static_assert((LEN__SET_TCB_FIELD_ULP + sizeof(struct ulptx_idata)) % 16 == 0,
     "CPL_SET_TCB_FIELD ULP command not 16-byte aligned");
 
 /*
  * Write one CPL_SET_TCB_FIELD, wrapped as a ULP command, at dst.
  *
  * The output consists of a ULP_TXPKT header, a ULPTX_IDATA immediate
  * sub-command header, the CPL_SET_TCB_FIELD itself (for tlsp->tid, with
  * F_NO_REPLY set) and a trailing ULPTX_NOOP sub-command header.  'word'
  * selects the TCB word; 'mask' and 'val' are stored big-endian in the CPL.
  *
  * The caller must provide room for LEN__SET_TCB_FIELD_ULP bytes plus the
  * trailing struct ulptx_idata.
  */
 static void
 write_set_tcb_field_ulp(struct tlspcb *tlsp, void *dst, struct sge_txq *txq,
     uint16_t word, uint64_t mask, uint64_t val)
 {
 	struct ulp_txpkt *txpkt;
 	struct ulptx_idata *idata;
 	struct cpl_set_tcb_field_core *cpl;
 
 	/* ULP_TXPKT */
 	txpkt = dst;
 	txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	/* Length is expressed in 16-byte units, rounded up. */
 	txpkt->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	/* ULPTX_IDATA sub-command */
 	idata = (struct ulptx_idata *)(txpkt + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	idata->len = htobe32(sizeof(*cpl));
 
 	/* CPL_SET_TCB_FIELD */
 	cpl = (struct cpl_set_tcb_field_core *)(idata + 1);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tlsp->tid));
 	cpl->reply_ctrl = htobe16(F_NO_REPLY);
 	cpl->word_cookie = htobe16(V_WORD(word));
 	cpl->mask = htobe64(mask);
 	cpl->val = htobe64(val);
 
 	/* ULPTX_NOOP */
 	idata = (struct ulptx_idata *)(cpl + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	idata->len = htobe32(0);
 }
 
 /*
  * Queue a single FW_ULPTX_WR on txq that updates several TCB fields of the
  * connection identified by tlsp->tid.
  *
  * The work request carries three SET_TCB_FIELD commands, plus a fourth
  * when the tcpcb has TF_REQ_TSTMP set:
  *   1. T_FLAGS: set TF_CORE_BYPASS and clear TF_NON_OFFLOAD.
  *   2. SND_UNA_RAW word: zero the SND_NXT_RAW and SND_UNA_RAW fields.
  *   3. SND_MAX_RAW word: zero the SND_MAX_RAW field.
  *   4. (timestamps only) TIMESTAMP_OFFSET: tp->ts_offset >> 28.
  *
  * The mbuf holding the request takes a reference on the send tag.
  * Returns ENOMEM if the mbuf cannot be allocated, otherwise the result of
  * mp_ring_enqueue(); on an enqueue error the mbuf is freed here.
  */
 static int
 ktls_set_tcb_fields(struct tlspcb *tlsp, struct tcpcb *tp, struct sge_txq *txq)
 {
 	struct fw_ulptx_wr *wr;
 	struct mbuf *m;
 	char *dst;
 	void *items[1];
 	int error, len;
 
 	/* Header plus one 16-byte aligned slot per SET_TCB_FIELD command. */
 	len = sizeof(*wr) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	m = alloc_wr_mbuf(len, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com.com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 
 	/* FW_ULPTX_WR */
 	wr = mtod(m, void *);
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(len / 16));
 	wr->cookie = 0;
 	dst = (char *)(wr + 1);
 
 	/* Clear TF_NON_OFFLOAD and set TF_CORE_BYPASS */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_T_FLAGS,
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1) | V_TF_NON_OFFLOAD(1)),
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1)));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/* Clear the SND_UNA_RAW, SND_NXT_RAW, and SND_MAX_RAW offsets. */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_UNA_RAW,
 	    V_TCB_SND_NXT_RAW(M_TCB_SND_NXT_RAW) |
 	    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 	    V_TCB_SND_NXT_RAW(0) | V_TCB_SND_UNA_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_MAX_RAW,
 	    V_TCB_SND_MAX_RAW(M_TCB_SND_MAX_RAW), V_TCB_SND_MAX_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_TIMESTAMP_OFFSET,
 		    V_TCB_TIMESTAMP_OFFSET(M_TCB_TIMESTAMP_OFFSET),
 		    V_TCB_TIMESTAMP_OFFSET(tp->ts_offset >> 28));
 		dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	}
 
 	/* Every slot accounted for in 'len' must have been written. */
 	KASSERT(dst - (char *)wr == len, ("%s: length mismatch", __func__));
 
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	return (error);
 }
 
 /*
  * Allocate a TLS send tag for the session described by 'params'.
  *
  * The session parameters are validated, a tlspcb is allocated, an
  * active open request is sent and waited for, the TCB fields and keys
  * are programmed via work requests on the chosen TX queue, and the
  * SCMD templates for full and partial records are precomputed.
  *
  * On success the new tag is stored in *pt and 0 is returned.  Returns
  * EPROTONOSUPPORT or EINVAL for unsupported or malformed session
  * parameters, ENOMEM if no atid or tid could be obtained, ECONNRESET if
  * the connection is in TIMEWAIT or was dropped, or an error from one of
  * the helper routines.
  */
 int
 cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	const struct ktls_session *tls;
 	struct tlspcb *tlsp;
 	struct adapter *sc;
 	struct vi_info *vi;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct sge_txq *txq;
 	int atid, error, keyid;
 
 	tls = params->tls.tls;
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO)
 		return (EPROTONOSUPPORT);
 
 	/* Sanity check values in *tls. */
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		/* AES-CBC requires one of the supported HMAC algorithms. */
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
 			return (EPROTONOSUPPORT);
 		}
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		/* For AES-GCM the supplied IV must be exactly the salt. */
 		if (tls->params.iv_len != SALT_SIZE)
 			return (EINVAL);
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	vi = ifp->if_softc;
 	sc = vi->pi->adapter;
 
 	/* From here on, failures unwind through the 'failed' label. */
 	tlsp = alloc_tlspcb(ifp, vi, M_WAITOK);
 
 	atid = alloc_atid(sc, tlsp);
 	if (atid < 0) {
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/*
 	 * Use an inline (immediate) key context if the adapter is
 	 * configured for inline keys or if no key id could be obtained.
 	 */
 	if (sc->tlst.inline_keys)
 		keyid = -1;
 	else
 		keyid = get_new_keyid(tlsp);
 	if (keyid < 0) {
 		CTR2(KTR_CXGBE, "%s: atid %d using immediate key ctx", __func__,
 		    atid);
 		tlsp->inline_key = true;
 	} else {
 		tlsp->tx_key_addr = keyid;
 		CTR3(KTR_CXGBE, "%s: atid %d allocated TX key addr %#x",
 		    __func__,
 		    atid, tlsp->tx_key_addr);
 	}
 
 	inp = params->tls.inp;
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 	tlsp->inp = inp;
 
 	tp = inp->inp_ppcb;
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		tlsp->using_timestamps = true;
 		/*
 		 * Only the bits of ts_offset above the low 28 are
 		 * programmed into the TCB (see ktls_set_tcb_fields()),
 		 * so an offset with any of the low 28 bits set is
 		 * rejected.
 		 */
 		if ((tp->ts_offset & 0xfffffff) != 0) {
 			INP_RUNLOCK(inp);
 			error = EINVAL;
 			goto failed;
 		}
 	} else
 		tlsp->using_timestamps = false;
 
 	error = send_ktls_act_open_req(sc, vi, inp, tlsp, atid);
 	if (error) {
 		INP_RUNLOCK(inp);
 		goto failed;
 	}
 
 	/* Wait for reply to active open. */
 	CTR2(KTR_CXGBE, "%s: atid %d sent CPL_ACT_OPEN_REQ", __func__,
 	    atid);
 	while (tlsp->open_pending) {
 		/*
 		 * XXX: PCATCH?  We would then have to discard the PCB
 		 * when the completion CPL arrived.
 		 */
 		error = rw_sleep(tlsp, &inp->inp_lock, 0, "t6tlsop", 0);
 	}
 
 	/*
 	 * The atid must no longer be released by the failure path below.
 	 * NOTE(review): presumably the active open reply handler has
 	 * already released it -- confirm against that handler.
 	 */
 	atid = -1;
 	if (tlsp->tid < 0) {
 		INP_RUNLOCK(inp);
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/* Re-check the connection state after sleeping. */
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 
 	/*
 	 * Pick the TX queue: the VI's first queue, or a queue selected by
 	 * the flow id among the non-reserved queues when a flow id is set.
 	 */
 	txq = &sc->sge.txq[vi->first_txq];
 	if (inp->inp_flowtype != M_HASHTYPE_NONE)
 		txq += ((inp->inp_flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 	tlsp->txq = txq;
 
 	error = ktls_set_tcb_fields(tlsp, tp, txq);
 	INP_RUNLOCK(inp);
 	if (error)
 		goto failed;
 
 	init_ktls_key_params(tlsp, tls);
 
 	error = ktls_setup_keys(tlsp, tls, txq);
 	if (error)
 		goto failed;
 
 	/* The SCMD fields used when encrypting a full TLS record. */
 	tlsp->scmd0.seqno_numivs = htobe32(V_SCMD_SEQ_NO_CTRL(3) |
 	    V_SCMD_PROTO_VERSION(tlsp->proto_ver) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((tlsp->mac_first == 0)) |
 	    V_SCMD_CIPH_MODE(tlsp->enc_mode) |
 	    V_SCMD_AUTH_MODE(tlsp->auth_mode) |
 	    V_SCMD_HMAC_CTRL(tlsp->hmac_ctrl) |
 	    V_SCMD_IV_SIZE(tlsp->iv_size) | V_SCMD_NUM_IVS(1));
 
 	tlsp->scmd0.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0);
 	if (tlsp->inline_key)
 		tlsp->scmd0.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 	tlsp->scmd0.ivgen_hdrlen = htobe32(tlsp->scmd0.ivgen_hdrlen);
 
 	/*
 	 * The SCMD fields used when encrypting a partial TLS record
 	 * (no trailer and possibly a truncated payload).
 	 */
 	tlsp->scmd0_short.seqno_numivs = V_SCMD_SEQ_NO_CTRL(0) |
 	    V_SCMD_PROTO_VERSION(SCMD_PROTO_VERSION_GENERIC) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((tlsp->mac_first == 0)) |
 	    V_SCMD_AUTH_MODE(SCMD_AUTH_MODE_NOP) |
 	    V_SCMD_HMAC_CTRL(SCMD_HMAC_CTRL_NOP) |
 	    V_SCMD_IV_SIZE(AES_BLOCK_LEN / 2) | V_SCMD_NUM_IVS(0);
 	/* Partial AES-GCM records are encrypted in AES-CTR mode. */
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(SCMD_CIPH_MODE_AES_CTR);
 	else
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(tlsp->enc_mode);
 	tlsp->scmd0_short.seqno_numivs =
 	    htobe32(tlsp->scmd0_short.seqno_numivs);
 
 	/*
 	 * Unlike scmd0.ivgen_hdrlen above, this value is left in host
 	 * byte order here.
 	 * NOTE(review): presumably the consumer ORs in more fields before
 	 * byte-swapping -- confirm against ktls_write_tls_wr().
 	 */
 	tlsp->scmd0_short.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0) |
 	    V_SCMD_AADIVDROP(1);
 	if (tlsp->inline_key)
 		tlsp->scmd0_short.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 
 	/* Account for the new session in the per-queue statistics. */
 	TXQ_LOCK(txq);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		txq->kern_tls_gcm++;
 	else
 		txq->kern_tls_cbc++;
 	TXQ_UNLOCK(txq);
 	*pt = &tlsp->com.com;
 	return (0);
 
 failed:
 	/* Release the atid if still held, then drop the tag reference. */
 	if (atid >= 0)
 		free_atid(sc, atid);
 	m_snd_tag_rele(&tlsp->com.com);
 	return (error);
 }
 
 /*
  * Populate the session's key context (tlsp->keyctx) from the keys in
  * 'tls'.  For sessions that do not use an inline key, additionally
  * enqueue a work request on 'txq' that writes a copy of the key context
  * to the address held in tlsp->tx_key_addr.
  *
  * Returns 0 on success, ENOMEM if the work request mbuf cannot be
  * allocated, or the error from mp_ring_enqueue().
  */
 static int
 ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls,
     struct sge_txq *txq)
 {
 	struct auth_hash *axf;
 	int error, keyid, kwrlen, kctxlen, len;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 	void *items[1], *key;
 	struct tx_keyctx_hdr *khdr;
 	unsigned int ck_size, mk_size, partial_digest_len;
 	struct mbuf *m;
 
 	/*
 	 * Store the salt and keys in the key context.  For
 	 * connections with an inline key, this key context is passed
 	 * as immediate data in each work request.  For connections
 	 * storing the key in DDR, a work request is used to store a
 	 * copy of the key context in DDR.
 	 */
 	kctx = &tlsp->keyctx;
 	khdr = &kctx->txhdr;
 
 	/* Map the cipher key length in bytes to the key context encoding. */
 	switch (tlsp->cipher_secret_size) {
 	case 128 / 8:
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_128;
 		break;
 	case 192 / 8:
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_192;
 		break;
 	case 256 / 8:
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256;
 		break;
 	default:
 		panic("bad key size");
 	}
 	/*
 	 * Choose the MAC key size.  For non-GCM sessions also select the
 	 * HMAC transform and the partial digest length handed to
 	 * t4_init_hmac_digest() below.
 	 */
 	axf = NULL;
 	partial_digest_len = 0;
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_512;
 	else {
 		switch (tlsp->auth_mode) {
 		case SCMD_AUTH_MODE_SHA1:
 			axf = &auth_hash_hmac_sha1;
 			mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_160;
 			partial_digest_len = SHA1_HASH_LEN;
 			break;
 		case SCMD_AUTH_MODE_SHA256:
 			axf = &auth_hash_hmac_sha2_256;
 			mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_256;
 			partial_digest_len = SHA2_256_HASH_LEN;
 			break;
 		case SCMD_AUTH_MODE_SHA512_384:
 			axf = &auth_hash_hmac_sha2_384;
 			mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_512;
 			partial_digest_len = SHA2_512_HASH_LEN;
 			break;
 		default:
 			panic("bad auth mode");
 		}
 	}
 
 	/* Key context header; ctxlen is in units of 16 bytes. */
 	khdr->ctxlen = (tlsp->tx_key_info_size >> 4);
 	khdr->dualck_to_txvalid = V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
 	    V_TLS_KEYCTX_TX_WR_TXCK_SIZE(ck_size) |
 	    V_TLS_KEYCTX_TX_WR_TXMK_SIZE(mk_size) |
 	    V_TLS_KEYCTX_TX_WR_TXVALID(1);
 	if (tlsp->enc_mode != SCMD_CIPH_MODE_AES_GCM)
 		khdr->dualck_to_txvalid |= V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1);
 	khdr->dualck_to_txvalid = htobe16(khdr->dualck_to_txvalid);
 	/*
 	 * The cipher key comes first, immediately followed by the GMAC
 	 * hash (AES-GCM) or the HMAC digest state (everything else).
 	 */
 	key = kctx->keys.edkey;
 	memcpy(key, tls->params.cipher_key, tls->params.cipher_key_len);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 		memcpy(khdr->txsalt, tls->params.iv, SALT_SIZE);
 		t4_init_gmac_hash(tls->params.cipher_key,
 		    tls->params.cipher_key_len,
 		    (char *)key + tls->params.cipher_key_len);
 	} else {
 		t4_init_hmac_digest(axf, partial_digest_len,
 		    tls->params.auth_key, tls->params.auth_key_len,
 		    (char *)key + tls->params.cipher_key_len);
 	}
 
 	/* Inline keys travel with every work request; nothing to send. */
 	if (tlsp->inline_key)
 		return (0);
 
 	keyid = tlsp->tx_key_addr;
 
 	/* Populate key work request. */
 	kwrlen = sizeof(*kwr);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = kwrlen + kctxlen;
 
         m = alloc_wr_mbuf(len, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	/* The work request mbuf holds its own reference on the send tag. */
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com.com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	kwr = mtod(m, void *);
 	memset(kwr, 0, len);
 
 	kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) |
 	    F_FW_WR_ATOMIC);
 	kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)));
 	kwr->protocol = tlsp->proto_ver;
 	kwr->mfs = htons(tlsp->frag_size);
 	kwr->reneg_to_write_rx = KEY_WRITE_TX;
 
 	/* master command */
 	kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 	    V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1));
 	/* The data length and key address are in units of 32 bytes. */
 	kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5));
 	kwr->len16 = htobe32((tlsp->tid << 8) |
 	    DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16));
 	kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5));
 
 	/* sub command */
 	kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	kwr->sc_len = htobe32(kctxlen);
 
 	/* The key context follows the request header as immediate data. */
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memcpy(kctx, &tlsp->keyctx, sizeof(*kctx));
 
 	/*
 	 * Place the key work request in the transmit queue.  It
 	 * should be sent to the NIC before any TLS packets using this
 	 * session.
 	 */
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	else
 		CTR2(KTR_CXGBE, "%s: tid %d sent key WR", __func__, tlsp->tid);
 	return (error);
 }
 
 /*
  * Size in bytes of the fixed portion of a TLS work request: all of the
  * headers plus either the inline key context or the commands used to
  * reference a key context stored on the adapter.
  */
 static u_int
 ktls_base_wr_size(struct tlspcb *tlsp)
 {
 	u_int key_len, total;
 
 	/* Key material: inline context, or a memory-read + idata pair. */
 	if (tlsp->inline_key)
 		key_len = tlsp->tx_key_info_size;
 	else
 		key_len = sizeof(struct ulptx_sc_memrd) +
 		    sizeof(struct ulptx_idata);
 
 	total = sizeof(struct fw_ulptx_wr) + sizeof(struct ulp_txpkt) +
 	    sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_sec_pdu);
 	total += key_len;
 	total += sizeof(struct cpl_tx_data);
 	return (total);
 }
 
 /*
  * Number of bytes of TCP payload to send for the TLS record referenced
  * by 'm_tls', measured from the start of the record.
  */
 static u_int
 ktls_tcp_payload_length(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct mbuf_ext_pgs *pgs;
 	struct tls_record_layer *rec;
 	u_int end, record_len, trailer_start;
 
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	pgs = &m_tls->m_ext_pgs;
 	rec = (void *)m_tls->m_epg_hdr;
 	record_len = TLS_HEADER_LENGTH + ntohs(rec->tls_length);
 
 	/* Offset just past the last byte of the record being requested. */
 	end = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 
 	/* A request that reaches the end of the record is sent whole. */
 	if (end == record_len)
 		return (end);
 
 	/*
 	 * A partial trailer cannot be sent, so a request that extends
 	 * into the trailer is cut back to the start of the trailer.
 	 */
 	trailer_start = record_len - pgs->trail_len;
 	if (end > trailer_start)
 		end = trailer_start;
 
 	/* AES-CBC ciphertext is sent in whole cipher blocks only. */
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    end > TLS_HEADER_LENGTH)
 		end = TLS_HEADER_LENGTH +
 		    rounddown(end - TLS_HEADER_LENGTH, AES_BLOCK_LEN);
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d short TLS record (%u vs %u)",
 	    __func__, tlsp->tid, end, record_len);
 #endif
 	return (end);
 }
 
 /*
  * For a "short" TLS record, determine the offset into the TLS record
  * payload to send.  This offset does not include the TLS header, but
  * a non-zero offset implies that a header will not be sent.
  */
 static u_int
 ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct mbuf_ext_pgs *ext_pgs;
 	struct tls_record_layer *hdr;
 	u_int offset, plen;
 #ifdef INVARIANTS
 	u_int mlen;
 #endif
 
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	ext_pgs = &m_tls->m_ext_pgs;
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = ntohs(hdr->tls_length);
 #ifdef INVARIANTS
 	mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 	MPASS(mlen < TLS_HEADER_LENGTH + plen);
 #endif
 	if (mtod(m_tls, vm_offset_t) <= ext_pgs->hdr_len)
 		return (0);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 		/*
 		 * Always send something.  This function is only called
 		 * if we aren't sending the tag at all, but if the
 		 * request starts in the tag then we are in an odd
 		 * state where would effectively send nothing.  Cap
 		 * the offset at the last byte of the record payload
 		 * to send the last cipher block.
 		 */
 		offset = min(mtod(m_tls, vm_offset_t) - ext_pgs->hdr_len,
 		    (plen - TLS_HEADER_LENGTH - ext_pgs->trail_len) - 1);
 		return (rounddown(offset, AES_BLOCK_LEN));
 	}
 	return (0);
 }
 
 /*
  * Bytes occupied by a ULPTX scatter/gather list describing 'nsegs'
  * segments.
  */
 static u_int
 ktls_sgl_size(u_int nsegs)
 {
 	/* The first segment lives inside struct ulptx_sgl itself. */
 	u_int extra = nsegs - 1;
 	u_int size = sizeof(struct ulptx_sgl);
 
 	/* Remaining segments: 3 flits per pair, 2 for an odd one out. */
 	size += 8 * ((3 * extra) / 2 + (extra & 1));
 	return (size);
 }
 
 /*
  * Compute the length in bytes of the work request needed to transmit
  * the TLS record referenced by 'm_tls'; 'm' is the header mbuf at the
  * front of the chain.  The number of scatter/gather segments needed for
  * the record payload is stored in *nsegsp (a placeholder value of 1 is
  * stored for header-only requests).
  */
 static int
 ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struct mbuf *m_tls,
     int *nsegsp)
 {
 	struct mbuf_ext_pgs *ext_pgs;
 	struct tls_record_layer *hdr;
 	u_int imm_len, offset, plen, wr_len, tlen;
 
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	ext_pgs = &m_tls->m_ext_pgs;
 
 	/*
 	 * Determine the size of the TLS record payload to send
 	 * excluding header and trailer.
 	 */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
 	if (tlen <= ext_pgs->hdr_len) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) +
 		    roundup2(m->m_len + m_tls->m_len, 16);
 		/* Only traced here; the oversized length is still returned. */
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 		    "%s: tid %d TLS header-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len + m_tls->m_len);
 		}
 
 		/* This should always be the last TLS record in a chain. */
 		MPASS(m_tls->m_next == NULL);
 
 		/*
 		 * XXX: Set a bogus 'nsegs' value to avoid tripping an
 		 * assertion in mbuf_nsegs() in t4_sge.c.
 		 */
 		*nsegsp = 1;
 		return (wr_len);
 	}
 
 	/*
 	 * 'plen' is the record length without the trailer.  A request
 	 * that stops short of that is a "short" record, which may start
 	 * at a non-zero payload offset.
 	 */
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - ext_pgs->trail_len;
 	if (tlen < plen) {
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 	} else
 		offset = 0;
 
 	/* Calculate the size of the work request. */
 	wr_len = ktls_base_wr_size(tlsp);
 
 	/*
 	 * Full records and short records with an offset of 0 include
 	 * the TLS header as immediate data.  Short records include a
 	 * raw AES IV as immediate data.
 	 */
 	imm_len = 0;
 	if (offset == 0)
 		imm_len += ext_pgs->hdr_len;
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	wr_len += roundup2(imm_len, 16);
 
 	/* TLS record payload via DSGL. */
-	*nsegsp = sglist_count_ext_pgs(m_tls, ext_pgs->hdr_len + offset,
+	*nsegsp = sglist_count_mbuf_epg(m_tls, ext_pgs->hdr_len + offset,
 	    plen - (ext_pgs->hdr_len + offset));
 	wr_len += ktls_sgl_size(*nsegsp);
 
 	/* Work requests are sized in multiples of 16 bytes. */
 	wr_len = roundup2(wr_len, 16);
 	return (wr_len);
 }
 
 /*
  * Report whether the TCP header carries any option other than NOP or
  * timestamp, i.e. an option that needs a dedicated options-only packet.
  * Parsing stops at an end-of-list option or at a malformed option.
  */
 static int
 ktls_has_tcp_options(struct tcphdr *tcp)
 {
 	u_char *opts;
 	int kind, remaining, size;
 
 	opts = (u_char *)(tcp + 1);
 	remaining = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (remaining > 0) {
 		kind = opts[0];
 		if (kind == TCPOPT_EOL)
 			return (0);
 		if (kind == TCPOPT_NOP)
 			size = 1;
 		else if (remaining < 2)
 			return (0);
 		else {
 			size = opts[1];
 			if (size < 2 || size > remaining)
 				return (0);
 		}
 
 		/* Anything besides NOP and timestamps needs its own packet. */
 		if (kind != TCPOPT_NOP && kind != TCPOPT_TIMESTAMP)
 			return (1);
 
 		remaining -= size;
 		opts += size;
 	}
 	return (0);
 }
 
 /*
  * Locate the TCP timestamp option.  Returns a pointer to the option's
  * data (just past its kind and length bytes), or NULL if the header has
  * no well-formed timestamp option.
  */
 static void *
 ktls_find_tcp_timestamps(struct tcphdr *tcp)
 {
 	u_char *opts;
 	int kind, remaining, size;
 
 	opts = (u_char *)(tcp + 1);
 	remaining = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (remaining > 0) {
 		kind = opts[0];
 		if (kind == TCPOPT_EOL)
 			return (NULL);
 		if (kind == TCPOPT_NOP)
 			size = 1;
 		else if (remaining < 2)
 			return (NULL);
 		else {
 			size = opts[1];
 			if (size < 2 || size > remaining)
 				return (NULL);
 		}
 
 		if (kind == TCPOPT_TIMESTAMP && size == TCPOLEN_TIMESTAMP)
 			return (opts + 2);
 
 		remaining -= size;
 		opts += size;
 	}
 	return (NULL);
 }
 
 /*
  * Validate a KTLS transmit chain and compute the descriptor space it
  * will need.  'm' must be a packet header mbuf holding exactly the
  * Ethernet, IP/IPv6 and TCP headers, followed by one mbuf per TLS
  * record.
  *
  * The L2/L3/L4 header lengths are recorded in m's packet header.  On
  * success *nsegsp receives the segment count of the first TLS record
  * and *len16p the total length, in 16-byte units, reserved for all work
  * requests that may be generated for the chain.
  *
  * Returns 0 on success, EINVAL if the headers are not acceptable, or
  * EFBIG if a TLS record needs too large a work request or too many
  * segments.
  */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	struct tlspcb *tlsp;
 	struct ether_header *eh;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	int nsegs;
 	u_int wr_len, tot_len;
 
 	/*
 	 * Locate headers in initial mbuf.
 	 *
 	 * XXX: This assumes all of the headers are in the initial mbuf.
 	 * Could perhaps use m_advance() like parse_pkt() if that turns
 	 * out to not be true.
 	 */
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	if (m->m_len <= sizeof(*eh) + sizeof(*ip)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short", __func__,
 		    tlsp->tid);
 		return (EINVAL);
 	}
 	eh = mtod(m, struct ether_header *);
 	if (ntohs(eh->ether_type) != ETHERTYPE_IP &&
 	    ntohs(eh->ether_type) != ETHERTYPE_IPV6) {
 		CTR2(KTR_CXGBE, "%s: tid %d mbuf not ETHERTYPE_IP{,V6}",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	m->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* XXX: Reject unsupported IP options? */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (struct ip *)(eh + 1);
 		if (ip->ip_p != IPPROTO_TCP) {
 			CTR2(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP",
 			    __func__, tlsp->tid);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = ip->ip_hl * 4;
 	} else {
 		/* TCP must directly follow the fixed IPv6 header. */
 		ip6 = (struct ip6_hdr *)(eh + 1);
 		if (ip6->ip6_nxt != IPPROTO_TCP) {
 			CTR3(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP (%u)",
 			    __func__, tlsp->tid, ip6->ip6_nxt);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
 	}
 	/* The fixed TCP header must be present before it is read. */
 	if (m->m_len < m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    sizeof(*tcp)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short (2)",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	tcp = (struct tcphdr *)((char *)(eh + 1) + m->m_pkthdr.l3hlen);
 	m->m_pkthdr.l4hlen = tcp->th_off * 4;
 
 	/* Bail if there is TCP payload before the TLS record. */
 	if (m->m_len != m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    m->m_pkthdr.l4hlen) {
 		CTR6(KTR_CXGBE,
 		    "%s: tid %d header mbuf bad length (%d + %d + %d != %d)",
 		    __func__, tlsp->tid, m->m_pkthdr.l2hlen,
 		    m->m_pkthdr.l3hlen, m->m_pkthdr.l4hlen, m->m_len);
 		return (EINVAL);
 	}
 
 	/* Assume all headers are in 'm' for now. */
 	MPASS(m->m_next != NULL);
 	MPASS(m->m_next->m_flags & M_NOMAP);
 
 	/* Running total, in bytes, of descriptor space to reserve. */
 	tot_len = 0;
 
 	/*
 	 * Each of the remaining mbufs in the chain should reference a
 	 * TLS record.
 	 */
 	*nsegsp = 0;
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_NOMAP);
 
 		wr_len = ktls_wr_len(tlsp, m, m_tls, &nsegs);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d wr_len %d nsegs %d", __func__,
 		    tlsp->tid, wr_len, nsegs);
 #endif
 		if (wr_len > SGE_MAX_WR_LEN || nsegs > TX_SGL_SEGS)
 			return (EFBIG);
 		/* Each work request takes a whole number of descriptors. */
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 
 		/*
 		 * Store 'nsegs' for the first TLS record in the
 		 * header mbuf's metadata.
 		 */
 		if (*nsegsp == 0)
 			*nsegsp = nsegs;
 	}
 
 	MPASS(tot_len != 0);
 
 	/*
 	 * See if we have any TCP options or a FIN requiring a
 	 * dedicated packet.
 	 */
 	if ((tcp->th_flags & TH_FIN) != 0 || ktls_has_tcp_options(tcp)) {
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len, 16);
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 			    "%s: tid %d options-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len);
 			return (EINVAL);
 		}
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 	}
 
 	/* Include room for a TP work request to program an L2T entry. */
 	tot_len += EQ_ESIZE;
 
 	/*
 	 * Include room for a ULPTX work request including up to 5
 	 * CPL_SET_TCB_FIELD commands before the first TLS work
 	 * request.
 	 */
 	wr_len = sizeof(struct fw_ulptx_wr) +
 	    5 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/*
 	 * If timestamps are present, reserve 1 more command for
 	 * setting the echoed timestamp.
 	 */
 	if (tlsp->using_timestamps)
 		wr_len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	tot_len += roundup2(wr_len, EQ_ESIZE);
 
 	/* Report the reservation in 16-byte units. */
 	*len16p = tot_len / 16;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d len16 %d nsegs %d", __func__,
 	    tlsp->tid, *len16p, *nsegsp);
 #endif
 	return (0);
 }
 
 /*
  * Encode the scatter/gather list 'gl' as a ULPTX DSGL at 'to', which
  * must be 16-byte aligned.  If the encoded list would end on an address
  * that is not 16-byte aligned, a zero-filled flit is appended.
  */
 static void
 write_gl_to_buf(struct sglist *gl, caddr_t to)
 {
 	struct ulptx_sgl *sgl = (void *)to;
 	struct sglist_seg *segs = &gl->sg_segs[0];
 	__be64 *end;
 	int idx, nflits, nsegs;
 
 	KASSERT(((uintptr_t)to & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, to));
 
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/* Two flits for the header + first segment, then the rest. */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 
 	/* The first segment is stored in the ulptx_sgl header itself. */
 	sgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	sgl->len0 = htobe32(segs[0].ss_len);
 	sgl->addr0 = htobe64(segs[0].ss_paddr);
 
 	/* Remaining segments are packed two to an entry. */
 	for (idx = 0; idx < nsegs - 1; idx++) {
 		sgl->sge[idx / 2].len[idx & 1] = htobe32(segs[idx + 1].ss_len);
 		sgl->sge[idx / 2].addr[idx & 1] =
 		    htobe64(segs[idx + 1].ss_paddr);
 	}
 	/* Zero the unused length slot of a half-filled final entry. */
 	if (idx & 1)
 		sgl->sge[idx / 2].len[1] = htobe32(0);
 
 	end = (__be64 *)to + nflits;
 	if (nflits & 1) {
 		MPASS(((uintptr_t)end) & 0xf);
 		*end++ = 0;
 	}
 
 	MPASS((((uintptr_t)end) & 0xf) == 0);
 }
 
 /*
  * Copy 'len' bytes from 'from' into the descriptor ring of 'eq' at
  * '*to', wrapping to the start of the ring when its end is reached, and
  * advance '*to' past the copied bytes.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	const uintptr_t ring_end = (uintptr_t)&eq->desc[eq->sidx];
 	int head, tail;
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < ring_end);
 
 	if (__predict_false((uintptr_t)(*to) + len > ring_end)) {
 		/* Split the copy: up to the ring end, then from its start. */
 		head = ring_end - (uintptr_t)(*to);
 		tail = len - head;
 		bcopy(from, *to, head);
 		bcopy(from + head, (void *)eq->desc, tail);
 		(*to) = (caddr_t)eq->desc + tail;
 		return;
 	}
 
 	bcopy(from, *to, len);
 	(*to) += len;
 	/* Landing exactly on the ring end wraps to the first descriptor. */
 	if ((uintptr_t)(*to) == ring_end)
 		(*to) = (caddr_t)eq->desc;
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that sends only the headers held in
  * 'm' (Ethernet, IP/IPv6, TCP and its options) as immediate data.  The
  * IP length field is rewritten to cover just this packet and the PUSH
  * and FIN flags are cleared in the copied TCP header.
  *
  * The software descriptor at 'pidx' records the descriptors used and no
  * mbuf.  Returns the number of hardware descriptors consumed, which
  * must not exceed 'available'.
  */
 static int
 ktls_write_tcp_options(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	wr = dst;
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		/*
 		 * Unlike ip_len, the IPv6 payload length does not count
 		 * the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Clear PUSH and FIN in the TCP header if present. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_flags &= ~(TH_PUSH | TH_FIN);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, pktlen -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_options++;
 
 	/* No mbuf is attached to this software descriptor. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = NULL;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that sends the headers held in 'm'
  * followed by the requested portion of the TLS header of 'm_tls', all
  * as immediate data.  The IP length field is rewritten to cover this
  * packet and the TCP sequence number is set to 'tcp_seqno' plus the
  * offset of the request within the TLS record.
  *
  * The software descriptor at 'pidx' records 'm' and the descriptors
  * used.  Returns the number of hardware descriptors consumed, which
  * must not exceed 'available'.
  */
 static int
 ktls_write_tunnel_packet(struct sge_txq *txq, void *dst, struct mbuf *m,
     struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	struct mbuf_ext_pgs *ext_pgs;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	/* Locate the template TLS header. */
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	ext_pgs = &m_tls->m_ext_pgs;
 
 	/* This should always be the last TLS record in a chain. */
 	MPASS(m_tls->m_next == NULL);
 
 	wr = dst;
 	pktlen = m->m_len + m_tls->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		/*
 		 * Unlike ip_len, the IPv6 payload length does not count
 		 * the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno + mtod(m_tls, vm_offset_t));
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of TCP header. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 
 	/* Copy the subset of the TLS header requested. */
 	copy_to_txd(&txq->eq, (char *)m_tls->m_epg_hdr +
 	    mtod(m_tls, vm_offset_t), &out, m_tls->m_len);
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_header++;
 
 	/* Attach the mbuf chain to this software descriptor. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /* A standalone CPL_SET_TCB_FIELD has to fit in one TX descriptor. */
 _Static_assert(sizeof(struct cpl_set_tcb_field) <= EQ_ESIZE,
     "CPL_SET_TCB_FIELD must be smaller than a single TX descriptor");
 /*
  * SND_UNA_RAW and SND_NXT_RAW are updated together by a single
  * CPL_SET_TCB_FIELD, which addresses one TCB word.
  */
 _Static_assert(W_TCB_SND_UNA_RAW == W_TCB_SND_NXT_RAW,
     "SND_NXT_RAW and SND_UNA_RAW are in different words");
 
 /*
  * Write the work request(s) needed to transmit one TLS record mbuf
  * ('m_tls') for session 'tlsp' on 'txq'.  'dst' points at the
  * descriptor with index 'pidx'.
  *
  * Any required TCB field updates (L2T_IX, T_RTSEQ_RECENT, TX_MAX,
  * SND_UNA_RAW, RCV_NXT, RCV_WND) are built first and are either folded
  * into the same FW_ULPTX_WR as the TLS request or emitted as their own
  * work request ahead of it.  Requests that cover no more than the TLS
  * header are handed to ktls_write_tunnel_packet() instead.
  *
  * 'nsegs' is the cached SGL segment count for the payload, or 0 to
  * have it recomputed here.  'available' is the number of descriptors
  * the caller has room for; it is only checked via MPASS().
  *
  * Updates tlsp->prev_seq unconditionally and tlsp->prev_tsecr,
  * prev_ack, prev_win and prev_mss when the corresponding value
  * changes.  Returns the number of descriptors consumed.
  */
 static int
 ktls_write_tls_wr(struct tlspcb *tlsp, struct sge_txq *txq,
     void *dst, struct mbuf *m, struct tcphdr *tcp, struct mbuf *m_tls,
     u_int nsegs, u_int available, tcp_seq tcp_seqno, uint32_t *tsopt,
     u_int pidx, bool set_l2t_idx)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct fw_ulptx_wr *wr;
 	struct ulp_txpkt *txpkt;
 	struct ulptx_sc_memrd *memrd;
 	struct ulptx_idata *idata;
 	struct cpl_tx_sec_pdu *sec_pdu;
 	struct cpl_tx_data *tx_data;
 	struct mbuf_ext_pgs *ext_pgs;
 	struct tls_record_layer *hdr;
 	char *iv, *out;
 	u_int aad_start, aad_stop;
 	u_int auth_start, auth_stop, auth_insert;
 	u_int cipher_start, cipher_stop, iv_offset;
 	u_int imm_len, mss, ndesc, offset, plen, tlen, twr_len, wr_len;
 	u_int fields, tx_max_offset, tx_max;
 	bool first_wr, last_wr, using_scratch;
 
 	ndesc = 0;
 	MPASS(tlsp->txq == txq);
 
 	/*
 	 * The cached sequence, ack and window values are all still
 	 * zero; 'first_wr' forces the TX_MAX, RCV_NXT and RCV_WND
 	 * updates below.
 	 */
 	first_wr = (tlsp->prev_seq == 0 && tlsp->prev_ack == 0 &&
 	    tlsp->prev_win == 0);
 
 	/*
 	 * Use the per-txq scratch pad if near the end of the ring to
 	 * simplify handling of wrap-around.  This uses a simple but
 	 * not quite perfect test of using the scratch buffer if we
 	 * can't fit a maximal work request in without wrapping.
 	 */
 	using_scratch = (eq->sidx - pidx < SGE_MAX_WR_LEN / EQ_ESIZE);
 
 	/* Locate the TLS header. */
 	MBUF_EXT_PGS_ASSERT(m_tls);
 	ext_pgs = &m_tls->m_ext_pgs;
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - ext_pgs->trail_len;
 
 	/* Determine how much of the TLS record to send. */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
 	if (tlen <= ext_pgs->hdr_len) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d header-only TLS record %u",
 		    __func__, tlsp->tid, (u_int)ext_pgs->seqno);
 #endif
 		return (ktls_write_tunnel_packet(txq, dst, m, m_tls, available,
 		    tcp_seqno, pidx));
 	}
 	if (tlen < plen) {
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d short TLS record %u with offset %u",
 		    __func__, tlsp->tid, (u_int)ext_pgs->seqno, offset);
 #endif
 		if (m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) != 0) {
 			txq->kern_tls_fin_short++;
 #ifdef INVARIANTS
 			panic("%s: FIN on short TLS record", __func__);
 #endif
 		}
 	} else
 		offset = 0;
 
 	/*
 	 * This is the last work request for a given TLS mbuf chain if
 	 * it is the last mbuf in the chain and FIN is not set.  If
 	 * FIN is set, then ktls_write_tcp_fin() will write out the
 	 * last work request.
 	 */
 	last_wr = m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) == 0;
 
 	/*
 	 * The host stack may ask us to not send part of the start of
 	 * a TLS record.  (For example, the stack might have
 	 * previously sent a "short" TLS record and might later send
 	 * down an mbuf that requests to send the remainder of the TLS
 	 * record.)  The crypto engine must process a TLS record from
 	 * the beginning if computing a GCM tag or HMAC, so we always
 	 * send the TLS record from the beginning as input to the
 	 * crypto engine and via CPL_TX_DATA to TP.  However, TP will
 	 * drop individual packets after they have been chopped up
 	 * into MSS-sized chunks if the entire sequence range of those
 	 * packets is less than SND_UNA.  SND_UNA is computed as
 	 * TX_MAX - SND_UNA_RAW.  Thus, use the offset stored in
 	 * m_data to set TX_MAX to the first byte in the TCP sequence
 	 * space the host actually wants us to send and set
 	 * SND_UNA_RAW to 0.
 	 *
 	 * If the host sends us back to back requests that span the
 	 * trailer of a single TLS record (first request ends "in" the
 	 * trailer and second request starts at the next byte but
 	 * still "in" the trailer), the initial bytes of the trailer
 	 * that the first request drops will not be retransmitted.  If
 	 * the host uses the same requests when retransmitting the
 	 * connection will hang.  To handle this, always transmit the
 	 * full trailer for a request that begins "in" the trailer
 	 * (the second request in the example above).  This should
 	 * also help to avoid retransmits for the common case.
 	 *
 	 * A similar condition exists when using CBC for back to back
 	 * requests that span a single AES block.  The first request
 	 * will be truncated to end at the end of the previous AES
 	 * block.  To handle this, always begin transmission at the
 	 * start of the current AES block.
 	 */
 	tx_max_offset = mtod(m_tls, vm_offset_t);
 	if (tx_max_offset > TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
 	    ext_pgs->trail_len) {
 		/* Always send the full trailer. */
 		tx_max_offset = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
 		    ext_pgs->trail_len;
 	}
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    tx_max_offset > TLS_HEADER_LENGTH) {
 		/* Always send all of the first AES block. */
 		tx_max_offset = TLS_HEADER_LENGTH +
 		    rounddown(tx_max_offset - TLS_HEADER_LENGTH,
 		    AES_BLOCK_LEN);
 	}
 	tx_max = tcp_seqno + tx_max_offset;
 
 	/*
 	 * Update TCB fields.  Reserve space for the FW_ULPTX_WR header
 	 * but don't populate it until we know how many field updates
 	 * are required.
 	 */
 	if (using_scratch)
 		wr = (void *)txq->ss;
 	else
 		wr = dst;
 	out = (void *)(wr + 1);
 	fields = 0;
 	if (set_l2t_idx) {
 		KASSERT(nsegs != 0,
 		    ("trying to set L2T_IX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d set L2T_IX to %d", __func__,
 		    tlsp->tid, tlsp->l2te->idx);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_L2T_IX,
 		    V_TCB_L2T_IX(M_TCB_L2T_IX), V_TCB_L2T_IX(tlsp->l2te->idx));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 	if (tsopt != NULL && tlsp->prev_tsecr != ntohl(tsopt[1])) {
 		KASSERT(nsegs != 0,
 		    ("trying to set T_RTSEQ_RECENT for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote updated T_RTSEQ_RECENT",
 		    __func__, tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_T_RTSEQ_RECENT,
 		    V_TCB_T_RTSEQ_RECENT(M_TCB_T_RTSEQ_RECENT),
 		    V_TCB_T_RTSEQ_RECENT(ntohl(tsopt[1])));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_tsecr = ntohl(tsopt[1]);
 	}
 
 	if (first_wr || tlsp->prev_seq != tx_max) {
 		KASSERT(nsegs != 0,
 		    ("trying to set TX_MAX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE,
 		    "%s: tid %d setting TX_MAX to %u (tcp_seqno %u)",
 		    __func__, tlsp->tid, tx_max, tcp_seqno);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_TX_MAX,
 		    V_TCB_TX_MAX(M_TCB_TX_MAX), V_TCB_TX_MAX(tx_max));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * If there is data to drop at the beginning of this TLS
 	 * record or if this is a retransmit,
 	 * reset SND_UNA_RAW to 0 so that SND_UNA == TX_MAX.
 	 */
 	if (tlsp->prev_seq != tx_max || mtod(m_tls, vm_offset_t) != 0) {
 		KASSERT(nsegs != 0,
 		    ("trying to clear SND_UNA_RAW for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d clearing SND_UNA_RAW", __func__,
 		    tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_SND_UNA_RAW,
 		    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 		    V_TCB_SND_UNA_RAW(0));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * Store the expected sequence number of the next byte after
 	 * this record.
 	 */
 	tlsp->prev_seq = tcp_seqno + tlen;
 
 	if (first_wr || tlsp->prev_ack != ntohl(tcp->th_ack)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_NXT for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_NXT,
 		    V_TCB_RCV_NXT(M_TCB_RCV_NXT),
 		    V_TCB_RCV_NXT(ntohl(tcp->th_ack)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_ack = ntohl(tcp->th_ack);
 	}
 
 	if (first_wr || tlsp->prev_win != ntohs(tcp->th_win)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_WND for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_WND,
 		    V_TCB_RCV_WND(M_TCB_RCV_WND),
 		    V_TCB_RCV_WND(ntohs(tcp->th_win)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_win = ntohs(tcp->th_win);
 	}
 
 	/* Recalculate 'nsegs' if cached value is not available. */
 	if (nsegs == 0)
-		nsegs = sglist_count_ext_pgs(m_tls, ext_pgs->hdr_len +
+		nsegs = sglist_count_mbuf_epg(m_tls, ext_pgs->hdr_len +
 		    offset, plen - (ext_pgs->hdr_len + offset));
 
 	/* Calculate the size of the TLS work request. */
 	twr_len = ktls_base_wr_size(tlsp);
 
 	/*
 	 * Immediate data: the TLS header when sending from the start
 	 * of the record, plus an explicit IV for a short record.
 	 */
 	imm_len = 0;
 	if (offset == 0)
 		imm_len += ext_pgs->hdr_len;
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	twr_len += roundup2(imm_len, 16);
 	twr_len += ktls_sgl_size(nsegs);
 
 	/*
 	 * If any field updates were required, determine if they can
 	 * be included in the TLS work request.  If not, use the
 	 * FW_ULPTX_WR work request header at 'wr' as a dedicated work
 	 * request for the field updates and start a new work request
 	 * for the TLS work request afterward.
 	 */
 	if (fields != 0) {
 		wr_len = fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		if (twr_len + wr_len <= SGE_MAX_WR_LEN &&
 		    tlsp->sc->tlst.combo_wrs) {
 			wr_len += twr_len;
 			txpkt = (void *)out;
 		} else {
 			wr_len += sizeof(*wr);
 			wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 			wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 			    V_FW_WR_LEN16(wr_len / 16));
 			wr->cookie = 0;
 
 			/*
 			 * If we were using scratch space, copy the
 			 * field updates work request to the ring.
 			 */
 			if (using_scratch) {
 				out = dst;
 				copy_to_txd(eq, txq->ss, &out, wr_len);
 			}
 
 			ndesc = howmany(wr_len, EQ_ESIZE);
 			MPASS(ndesc <= available);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 
 			/*
 			 * Determine if we should use scratch space
 			 * for the TLS work request based on the
 			 * available space after advancing pidx for
 			 * the field updates work request.
 			 */
 			wr_len = twr_len;
 			using_scratch = (eq->sidx - pidx <
 			    howmany(wr_len, EQ_ESIZE));
 			if (using_scratch)
 				wr = (void *)txq->ss;
 			else
 				wr = dst;
 			txpkt = (void *)(wr + 1);
 		}
 	} else {
 		wr_len = twr_len;
 		txpkt = (void *)out;
 	}
 
 	wr_len = roundup2(wr_len, 16);
 	MPASS(ndesc + howmany(wr_len, EQ_ESIZE) <= available);
 
 	/* FW_ULPTX_WR */
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(wr_len / 16));
 	wr->cookie = 0;
 
 	/* ULP_TXPKT */
 	txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	txpkt->len = htobe32(howmany(twr_len - sizeof(*wr), 16));
 
 	/* ULPTX_IDATA sub-command */
 	idata = (void *)(txpkt + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 	    V_ULP_TX_SC_MORE(1));
 	idata->len = sizeof(struct cpl_tx_sec_pdu);
 
 	/*
 	 * The key context, CPL_TX_DATA, and immediate data are part
 	 * of this ULPTX_IDATA when using an inline key.  When reading
 	 * the key from memory, the CPL_TX_DATA and immediate data are
 	 * part of a separate ULPTX_IDATA.
 	 */
 	if (tlsp->inline_key)
 		idata->len += tlsp->tx_key_info_size +
 		    sizeof(struct cpl_tx_data) + imm_len;
 	idata->len = htobe32(idata->len);
 
 	/* CPL_TX_SEC_PDU */
 	sec_pdu = (void *)(idata + 1);
 
 	/*
 	 * For short records, AAD is counted as header data in SCMD0,
 	 * the IV is next followed by a cipher region for the payload.
 	 */
 	if (plen == tlen) {
 		aad_start = 0;
 		aad_stop = 0;
 		iv_offset = 1;
 		auth_start = 0;
 		auth_stop = 0;
 		auth_insert = 0;
 		cipher_start = AES_BLOCK_LEN + 1;
 		cipher_stop = 0;
 
 		sec_pdu->pldlen = htobe32(16 + plen -
 		    (ext_pgs->hdr_len + offset));
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0_short.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = htobe32(
 		    tlsp->scmd0_short.ivgen_hdrlen |
 		    V_SCMD_HDR_LEN(offset == 0 ? ext_pgs->hdr_len : 0));
 
 		txq->kern_tls_short++;
 	} else {
 		/*
 		 * AAD is TLS header.  IV is after AAD.  The cipher region
 		 * starts after the IV.  See comments in ccr_authenc() and
 		 * ccr_gmac() in t4_crypto.c regarding cipher and auth
 		 * start/stop values.
 		 */
 		aad_start = 1;
 		aad_stop = TLS_HEADER_LENGTH;
 		iv_offset = TLS_HEADER_LENGTH + 1;
 		cipher_start = ext_pgs->hdr_len + 1;
 		/* Both branches currently assign the same values. */
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		} else {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		}
 
 		sec_pdu->pldlen = htobe32(plen);
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = tlsp->scmd0.ivgen_hdrlen;
 
 		if (mtod(m_tls, vm_offset_t) == 0)
 			txq->kern_tls_full++;
 		else
 			txq->kern_tls_partial++;
 	}
 	sec_pdu->op_ivinsrtofst = htobe32(
 	    V_CPL_TX_SEC_PDU_OPCODE(CPL_TX_SEC_PDU) |
 	    V_CPL_TX_SEC_PDU_CPLLEN(2) | V_CPL_TX_SEC_PDU_PLACEHOLDER(0) |
 	    V_CPL_TX_SEC_PDU_IVINSRTOFST(iv_offset));
 	sec_pdu->aadstart_cipherstop_hi = htobe32(
 	    V_CPL_TX_SEC_PDU_AADSTART(aad_start) |
 	    V_CPL_TX_SEC_PDU_AADSTOP(aad_stop) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTART(cipher_start) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_HI(cipher_stop >> 4));
 	sec_pdu->cipherstop_lo_authinsert = htobe32(
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_LO(cipher_stop & 0xf) |
 	    V_CPL_TX_SEC_PDU_AUTHSTART(auth_start) |
 	    V_CPL_TX_SEC_PDU_AUTHSTOP(auth_stop) |
 	    V_CPL_TX_SEC_PDU_AUTHINSERT(auth_insert));
 
 	sec_pdu->scmd1 = htobe64(ext_pgs->seqno);
 
 	/* Key context */
 	out = (void *)(sec_pdu + 1);
 	if (tlsp->inline_key) {
 		memcpy(out, &tlsp->keyctx, tlsp->tx_key_info_size);
 		out += tlsp->tx_key_info_size;
 	} else {
 		/* ULPTX_SC_MEMRD to read key context. */
 		memrd = (void *)out;
 		memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tlsp->tx_key_info_size >> 4));
 		memrd->addr = htobe32(tlsp->tx_key_addr >> 5);
 
 		/* ULPTX_IDATA for CPL_TX_DATA and TLS header. */
 		idata = (void *)(memrd + 1);
 		idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 		    V_ULP_TX_SC_MORE(1));
 		idata->len = htobe32(sizeof(struct cpl_tx_data) + imm_len);
 
 		out = (void *)(idata + 1);
 	}
 
 	/* CPL_TX_DATA */
 	tx_data = (void *)out;
 	OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tlsp->tid));
 	/*
 	 * MSS: prefer the TSO segment size (and remember it), then the
 	 * last remembered value, then one derived from the interface MTU.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 		mss = m->m_pkthdr.tso_segsz;
 		tlsp->prev_mss = mss;
 	} else if (tlsp->prev_mss != 0)
 		mss = tlsp->prev_mss;
 	else
 		mss = tlsp->vi->ifp->if_mtu -
 		    (m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen);
 	if (offset == 0) {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen));
 		tx_data->rsvd = htobe32(tcp_seqno);
 	} else {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) |
 		    V_TX_LENGTH(tlen - (ext_pgs->hdr_len + offset)));
 		tx_data->rsvd = htobe32(tcp_seqno + ext_pgs->hdr_len + offset);
 	}
 	tx_data->flags = htobe32(F_TX_BYPASS);
 	if (last_wr && tcp->th_flags & TH_PUSH)
 		tx_data->flags |= htobe32(F_TX_PUSH | F_TX_SHOVE);
 
 	/* Populate the TLS header */
 	out = (void *)(tx_data + 1);
 	if (offset == 0) {
 		memcpy(out, m_tls->m_epg_hdr, ext_pgs->hdr_len);
 		out += ext_pgs->hdr_len;
 	}
 
 	/* AES IV for a short record. */
 	if (plen == tlen) {
 		iv = out;
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			memcpy(iv, tlsp->keyctx.txhdr.txsalt, SALT_SIZE);
 			memcpy(iv + 4, hdr + 1, 8);
 			*(uint32_t *)(iv + 12) = htobe32(2 +
 			    offset / AES_BLOCK_LEN);
 		} else
 			memcpy(iv, hdr + 1, AES_BLOCK_LEN);
 		out += AES_BLOCK_LEN;
 	}
 
 	if (imm_len % 16 != 0) {
 		/* Zero pad to an 8-byte boundary. */
 		memset(out, 0, 8 - (imm_len % 8));
 		out += 8 - (imm_len % 8);
 
 		/*
 		 * Insert a ULP_TX_SC_NOOP if needed so the SGL is
 		 * 16-byte aligned.
 		 */
 		if (imm_len % 16 <= 8) {
 			idata = (void *)out;
 			idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 			idata->len = htobe32(0);
 			out = (void *)(idata + 1);
 		}
 	}
 
 	/* SGL for record payload */
 	sglist_reset(txq->gl);
-	if (sglist_append_ext_pgs(txq->gl, m_tls, ext_pgs->hdr_len + offset,
+	if (sglist_append_mbuf_epg(txq->gl, m_tls, ext_pgs->hdr_len + offset,
 	    plen - (ext_pgs->hdr_len + offset)) != 0) {
 #ifdef INVARIANTS
 		panic("%s: failed to append sglist", __func__);
 #endif
 	}
 	write_gl_to_buf(txq->gl, out);
 
 	if (using_scratch) {
 		out = dst;
 		copy_to_txd(eq, txq->ss, &out, wr_len);
 	}
 
 	ndesc += howmany(wr_len, EQ_ESIZE);
 	MPASS(ndesc <= available);
 
 	txq->kern_tls_records++;
 	txq->kern_tls_octets += tlen - mtod(m_tls, vm_offset_t);
 	if (mtod(m_tls, vm_offset_t) != 0) {
 		if (offset == 0)
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t);
 		else
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t) -
 			    (ext_pgs->hdr_len + offset);
 	}
 
 	/*
 	 * Record only this work request's descriptors here; a separate
 	 * field-update work request, if one was written, already filled
 	 * in its own tx_sdesc above.  'm' is attached only to the last
 	 * work request of the chain.
 	 */
 	txsd = &txq->sdesc[pidx];
 	if (last_wr)
 		txsd->m = m;
 	else
 		txsd->m = NULL;
 	txsd->desc_used = howmany(wr_len, EQ_ESIZE);
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that carries the first m->m_len
  * bytes of 'm' (Ethernet, IP/IPv6 and TCP headers plus whatever follows
  * them in that mbuf) as immediate data.  The IP length / IPv6 payload
  * length is rewritten from m->m_len and the TCP sequence number is
  * replaced with 'tcp_seqno'.  t6_ktls_write_wr() calls this to send the
  * FIN after the TLS record work requests.
  *
  * Records 'm' in the tx_sdesc at 'pidx' and returns the number of
  * descriptors used; 'available' is only checked via MPASS().
  */
 static int
 ktls_write_tcp_fin(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	wr = dst;
 	pktlen = m->m_len;
 	/* 'ctrl' first holds the immediate data length, then LEN16. */
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		/* Copy any IP options that follow the fixed header. */
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		/* Any other ether_type is handled as IPv6. */
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_fin++;
 
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write every work request needed for the TLS packet chain headed by
  * 'm' to 'txq', starting at descriptor 'dst'.  In order, this may emit:
  * a TCP options packet (only when FIN is not set), an L2T work request
  * when the cached L2T entry is missing or no longer matches the VLAN
  * tag / destination MAC, one or more work requests per TLS record mbuf
  * following 'm', and finally a FIN packet.
  *
  * 'nsegs' is passed through for the first TLS record only.  Returns the
  * total number of descriptors used, asserted not to exceed 'available'.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct tlspcb *tlsp;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	struct ether_header *eh;
 	tcp_seq tcp_seqno;
 	u_int ndesc, pidx, totdesc;
 	uint16_t vlan_tag;
 	bool has_fin, set_l2t_idx;
 	void *tsopt;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	totdesc = 0;
 	eh = mtod(m, struct ether_header *);
 	tcp = (struct tcphdr *)((char *)eh + m->m_pkthdr.l2hlen +
 	    m->m_pkthdr.l3hlen);
 	pidx = eq->pidx;
 	has_fin = (tcp->th_flags & TH_FIN) != 0;
 
 	/*
 	 * If this TLS record has a FIN, then we will send any
 	 * requested options as part of the FIN packet.
 	 */
 	if (!has_fin && ktls_has_tcp_options(tcp)) {
 		ndesc = ktls_write_tcp_options(txq, dst, m, available, pidx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote TCP options packet", __func__,
 		    tlsp->tid);
 #endif
 	}
 
 	/*
 	 * Allocate a new L2T entry if necessary.  This may write out
 	 * a work request to the txq.
 	 */
 	if (m->m_flags & M_VLANTAG)
 		vlan_tag = m->m_pkthdr.ether_vtag;
 	else
 		vlan_tag = 0xfff;
 	set_l2t_idx = false;
 	if (tlsp->l2te == NULL || tlsp->l2te->vlan != vlan_tag ||
 	    memcmp(tlsp->l2te->dmac, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
 		set_l2t_idx = true;
 		if (tlsp->l2te)
 			t4_l2t_release(tlsp->l2te);
 		tlsp->l2te = t4_l2t_alloc_tls(tlsp->sc, txq, dst, &ndesc,
 		    vlan_tag, tlsp->vi->pi->lport, eh->ether_dhost);
 		if (tlsp->l2te == NULL)
 			CXGBE_UNIMPLEMENTED("failed to allocate TLS L2TE");
 		if (ndesc != 0) {
 			MPASS(ndesc <= available - totdesc);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			totdesc += ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 		}
 	}
 
 	/*
 	 * Iterate over each TLS record constructing a work request
 	 * for that record.
 	 */
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_NOMAP);
 
 		/*
 		 * Determine the initial TCP sequence number for this
 		 * record.
 		 */
 		tsopt = NULL;
 		if (m_tls == m->m_next) {
 			tcp_seqno = ntohl(tcp->th_seq) -
 			    mtod(m_tls, vm_offset_t);
 			if (tlsp->using_timestamps)
 				tsopt = ktls_find_tcp_timestamps(tcp);
 		} else {
 			MPASS(mtod(m_tls, vm_offset_t) == 0);
 			tcp_seqno = tlsp->prev_seq;
 		}
 
 		ndesc = ktls_write_tls_wr(tlsp, txq, dst, m, tcp, m_tls,
 		    nsegs, available - totdesc, tcp_seqno, tsopt, pidx,
 		    set_l2t_idx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 
 		/*
 		 * The value of nsegs from the header mbuf's metadata
 		 * is only valid for the first TLS record.
 		 */
 		nsegs = 0;
 
 		/* Only need to set the L2T index once. */
 		set_l2t_idx = false;
 	}
 
 	if (has_fin) {
 		/*
 		 * If the TCP header for this chain has FIN sent, then
 		 * explicitly send a packet that has FIN set.  This
 		 * will also have PUSH set if requested.  This assumes
 		 * we sent at least one TLS record work request and
 		 * uses the TCP sequence number after that request as
 		 * the sequence number for the FIN packet.
 		 *
 		 * NOTE(review): this passes 'available' rather than
 		 * 'available - totdesc' as the other writers above do;
 		 * the callee only uses it in an MPASS(), and the final
 		 * MPASS() below still checks the total.
 		 */
 		ndesc = ktls_write_tcp_fin(txq, dst, m, available,
 		    tlsp->prev_seq, pidx);
 		totdesc += ndesc;
 	}
 
 	MPASS(totdesc <= available);
 	return (totdesc);
 }
 
 /*
  * Release everything held by the TLS send tag 'mst' and free it: the
  * L2T entry, the tid (when tid >= 0), the 'ce' entry via
  * t4_release_lip(), and the key id (when tx_key_addr >= 0).  The key
  * context is zeroed before the structure is freed.
  */
 void
 cxgbe_tls_tag_free(struct m_snd_tag *mst)
 {
 	struct adapter *sc;
 	struct tlspcb *tlsp;
 
 	tlsp = mst_to_tls(mst);
 	sc = tlsp->sc;
 
 	CTR2(KTR_CXGBE, "%s: tid %d", __func__, tlsp->tid);
 
 	if (tlsp->l2te)
 		t4_l2t_release(tlsp->l2te);
 	if (tlsp->tid >= 0)
 		release_tid(sc, tlsp->tid, tlsp->ctrlq);
 	if (tlsp->ce)
 		t4_release_lip(sc, tlsp->ce);
 	if (tlsp->tx_key_addr >= 0)
 		free_keyid(tlsp, tlsp->tx_key_addr);
 
 	/*
 	 * Scrub the entire key context.  The size must be taken from the
 	 * object itself; sizeof(&tlsp->keyctx) is only the size of a
 	 * pointer and would leave most of the key material in memory.
 	 */
 	explicit_bzero(&tlsp->keyctx, sizeof(tlsp->keyctx));
 	free(tlsp, M_CXGBE);
 }
 
 /*
  * Register ktls_act_open_rpl as the shared CPL_ACT_OPEN_RPL handler
  * for the CPL_COOKIE_KERN_TLS cookie.
  */
 void
 t6_ktls_modload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, ktls_act_open_rpl,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 /*
  * Counterpart of t6_ktls_modload(): passes a NULL handler for
  * CPL_ACT_OPEN_RPL / CPL_COOKIE_KERN_TLS, presumably clearing the
  * registration (confirm against t4_register_shared_cpl_handler()).
  */
 void
 t6_ktls_modunload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 #else
 
 /*
  * Stub for the #else branch (the controlling #if is not visible
  * here): always fails with ENXIO and leaves '*pt' untouched.
  */
 int
 cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	return (ENXIO);
 }
 
 /*
  * Stub for the #else branch: always returns EINVAL without touching
  * 'nsegsp' or 'len16p'.
  */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	return (EINVAL);
 }
 
 /*
  * Stub for the #else branch: panics if it is ever called.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	panic("can't happen");
 }
 
 /*
  * Stub for the #else branch: panics if it is ever called.
  */
 void
 cxgbe_tls_tag_free(struct m_snd_tag *mst)
 {
 	panic("can't happen");
 }
 
 /* Stub for the #else branch: nothing to register. */
 void
 t6_ktls_modload(void)
 {
 }
 
 /* Stub for the #else branch: nothing to unregister. */
 void
 t6_ktls_modunload(void)
 {
 }
 
 #endif
Index: head/sys/dev/cxgbe/t4_sge.c
===================================================================
--- head/sys/dev/cxgbe/t4_sge.c	(revision 360573)
+++ head/sys/dev/cxgbe/t4_sge.c	(revision 360574)
@@ -1,6067 +1,6068 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/socketvar.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 /*
  * RX_COPY_THRESHOLD is MINCLSIZE, or 8 bytes less than that when
  * T4_PKT_TIMESTAMP is defined.  In this file RX_COPY_THRESHOLD - 1 (limited
  * to M_INDICATESIZE) is the indicate size written to and verified in
  * TP_PARA_REG5.
  */
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /* Internal mbuf flags stored in PH_loc.eight[1]. */
 #define	MC_NOMAP		0x01
 #define	MC_RAW_WR		0x02
 #define	MC_TLS			0x04
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values; t4_sge_modload() complains about anything else and
  * resets it to 0.
  */
 static int fl_pktshift = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
     "payload DMA offset in rx buffer (bytes)");
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  *
  * The value is checked in setup_pad_and_pack_boundaries(), which picks a
  * boundary of its own when this one is unusable (and complains only if the
  * rejected value was greater than 0).  The tunable itself is never
  * rewritten; hwsz_ok() only tests whether it is non-zero.
  */
 int fl_pad = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
     "payload pad boundary (bytes)");
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  *
  * t4_sge_modload() replaces any other value with 64, or with 128 on
  * i386/amd64 when cpu_clflush_line_size is greater than 64.
  */
 static int spg_len = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
     "status page size (bytes)");
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  *
  * t4_sge_modload() resets any value outside [-1, 1] to 0.
  */
 static int cong_drop = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
     "Congestion control for RX queues (0 = backpressure, 1 = drop)");
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  *
  * Any non-zero value (-1 included) also makes
  * setup_pad_and_pack_boundaries() fall back to a 64 byte pad boundary on T4
  * when fl_pad is unusable.
  */
 static int buffer_packing = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
     0, "Enable buffer packing");
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  *
  * Validation and the fallback computation live in
  * setup_pad_and_pack_boundaries(); the tunable itself is never rewritten.
  */
 static int fl_pack = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
     "payload pack boundary (bytes)");
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  * Defaults to MJUM16BYTES.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
     &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  * t4_read_chip_settings() records the index of the software buffer size that
  * equals this value in sge.safe_zidx (left at -1 if none matches).
  */
 static int safest_rx_cluster = PAGE_SIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
     &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
 
 #ifdef RATELIMIT
 /*
  * Knob to control TCP timestamp rewriting, and the granularity of the tick used
  * for rewriting.  -1 and 0-3 are all valid values.
  * -1: hardware should leave the TCP timestamps alone.
  * 0: 1ms
  * 1: 100us
  * 2: 10us
  * 3: 1us
  */
 static int tsclk = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
     "Control TCP timestamp rewriting when using pacing");
 
 /*
  * Maximum backlog of ratelimited data allowed per flow; defaults to
  * 1024 * 1024 (presumably bytes -- confirm at the point of use).
  */
 static int eo_max_backlog = 1024 * 1024;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
     0, "Maximum backlog of ratelimited data per flow");
 #endif
 
 /*
  * The interrupt holdoff timers are multiplied by this value on T6+.
  * 1 and 3-17 (both inclusive) are legal values.
  *
  * t4_sge_modload() resets illegal values to 1.  t4_tweak_chip_settings()
  * programs the TSCALE field with 0 when this is 1 and with tscale - 2
  * otherwise.
  */
 static int tscale = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
     "Interrupt holdoff timer scale on T6+");
 
 /*
  * Number of LRO entries in the lro_ctrl structure per rx queue.
  * Defaults to TCP_LRO_ENTRIES.
  */
 static int lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
     "Number of LRO entries per RX queue");
 
 /*
  * This enables presorting of frames before they're fed into tcp_lro_rx.
  * Off (0) by default.
  */
 static int lro_mbufs = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
     "Enable presorting of LRO frames");
 
 /*
  * Bookkeeping for a single work request that carries several packets.
  * NOTE(review): presumably maintained by try_txpkts()/add_to_txpkts() and
  * consumed by write_txpkts_wr(), all of which take a struct txpkts * --
  * confirm against their definitions.
  */
 struct txpkts {
 	u_int wr_type;		/* type 0 or type 1 */
 	u_int npkt;		/* # of packets in this work request */
 	u_int plen;		/* total payload (sum of all packets) */
 	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /*
  * A packet's SGL.  This + m_pkthdr has all info needed for tx.
  * The sglist header is followed directly by storage for TX_SGL_SEGS
  * segments (presumably the array the header is pointed at -- confirm where
  * the sglist is initialized).
  */
 struct sgl {
 	struct sglist sg;
 	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
 static int service_iq_fl(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *,
     u_int);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     uint16_t, char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_iq *);
 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_ctrlq(struct adapter *, struct sge_wrq *, int,
     struct sysctl_oid *);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct vi_info *, struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 #endif
 #ifdef DEV_NETMAP
 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
     struct sysctl_oid *);
 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
     struct sysctl_oid *);
 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq(struct adapter *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_oid *);
 static int free_wrq(struct adapter *, struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
     struct sysctl_oid *);
 static int free_txq(struct vi_info *, struct sge_txq *);
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
 static int find_refill_source(struct adapter *, int, bool);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, u_int);
 static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
 static u_int write_txpkt_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
 static u_int write_txpkts_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 #endif
 
 /*
  * Reference/release counters.  Both are allocated and zeroed in
  * t4_sge_modload() and freed in t4_sge_modunload(); t4_sge_extfree_refs()
  * reports the difference refs - rels.
  */
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 /*
  * Handler dispatch tables.  Slots are filled with release stores by the
  * t4_register_*() functions below.
  *
  * t4_fw_msg_handler is indexed by firmware message type and t4_cpl_handler
  * by CPL opcode.  The remaining tables are indexed by CPL cookie and are
  * consulted by the shared per-opcode dispatchers (set_tcb_rpl_handler() and
  * friends), which pick the cookie and forward the message.
  */
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
 
 /*
  * Store h in t4_an_handler with release semantics; pass NULL to clear it.
  * Installing a handler while another one is still registered is asserted
  * against.
  */
 void
 t4_register_an_handler(an_handler_t h)
 {
 	uintptr_t *slot;
 
 	/* Either we are clearing the slot or it must currently be empty. */
 	MPASS(h == NULL || t4_an_handler == NULL);
 
 	slot = (uintptr_t *)&t4_an_handler;
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Store h as the handler for firmware messages of the given type (release
  * store); pass NULL to clear the slot.  The type must index
  * t4_fw_msg_handler and a registered handler may not be overwritten by
  * another one.
  */
 void
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	uintptr_t *slot;
 
 	MPASS(type < nitems(t4_fw_msg_handler));
 	MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
 
 	/*
 	 * The RSSCPL subtypes are routed by the FW{4|6}_CPL_MSG handler
 	 * through the CPL handler dispatch table, so nobody may claim them
 	 * here.
 	 */
 	MPASS(type != FW_TYPE_RSSCPL);
 	MPASS(type != FW6_TYPE_RSSCPL);
 
 	slot = (uintptr_t *)&t4_fw_msg_handler[type];
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Store h as the handler for the given CPL opcode (release store); pass
  * NULL to clear the slot.  The opcode must index t4_cpl_handler and a
  * registered handler may not be overwritten by another one.
  */
 void
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	uintptr_t *slot;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);
 
 	slot = (uintptr_t *)&t4_cpl_handler[opcode];
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Shared CPL_SET_TCB_RPL dispatcher: works out which cookie the reply
  * belongs to and forwards it to that cookie's entry in
  * set_tcb_rpl_handlers.  Returns whatever the selected handler returns.
  */
 static int
 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1);
 	u_int tid;
 	int cookie;
 
 	MPASS(m == NULL);
 
 	tid = GET_TID(rpl);
 	if (!is_hpftid(iq->adapter, tid) && !is_ftid(iq->adapter, tid)) {
 		/* Not a filter tid: the cookie field is usable as is. */
 		cookie = G_COOKIE(rpl->cookie);
 	} else {
 		/*
 		 * For a filter-write the cookie field carries the return
 		 * code instead, so the hardware tid is the only way to tell
 		 * that this is the reply to a filter request.
 		 */
 		cookie = CPL_COOKIE_FILTER;
 	}
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < nitems(set_tcb_rpl_handlers));
 
 	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Shared CPL_L2T_WRITE_RPL dispatcher.  Replies whose tid has F_SYNC_WR set
  * go to the CPL_COOKIE_TOM handler, all others to the CPL_COOKIE_FILTER
  * handler.
  */
 static int
 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_l2t_write_rpl *cpl = (const void *)(rss + 1);
 	unsigned int cookie;
 
 	MPASS(m == NULL);
 
 	if (GET_TID(cpl) & F_SYNC_WR)
 		cookie = CPL_COOKIE_TOM;
 	else
 		cookie = CPL_COOKIE_FILTER;
 
 	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Shared CPL_ACT_OPEN_RPL dispatcher.  The cookie is extracted from the
  * atid carried in the reply's atid_status field and selects the entry of
  * act_open_rpl_handlers that receives the message.
  */
 static int
 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_act_open_rpl *rpl = (const void *)(rss + 1);
 	u_int atid = G_AOPEN_ATID(be32toh(rpl->atid_status));
 	u_int cookie = G_TID_COOKIE(atid);
 
 	MPASS(m == NULL);
 	MPASS(cookie != CPL_COOKIE_RESERVED);
 
 	return (act_open_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Shared CPL_ABORT_RPL_RSS dispatcher.  The reply goes to the
  * CPL_COOKIE_HASHFILTER handler when is_hashfilter() is true for the
  * adapter and to the CPL_COOKIE_TOM handler otherwise.
  */
 static int
 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	u_int cookie;
 
 	MPASS(m == NULL);
 
 	cookie = is_hashfilter(sc) ? CPL_COOKIE_HASHFILTER : CPL_COOKIE_TOM;
 
 	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
 }
 
 /*
  * Shared CPL_FW4_ACK dispatcher.  Acks for a flow id that is_etid()
  * accepts go to the CPL_COOKIE_ETHOFLD handler, everything else to the
  * CPL_COOKIE_TOM handler.
  */
 static int
 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *ack = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(ack)));
 	u_int cookie;
 
 	MPASS(m == NULL);
 
 	cookie = is_etid(sc, tid) ? CPL_COOKIE_ETHOFLD : CPL_COOKIE_TOM;
 
 	return (fw4_ack_handlers[cookie](iq, rss, m));
 }
 
 static void
 t4_init_shared_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
 	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
 	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
 	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
 }
 
 /*
  * Store h (release store) as the handler that the shared dispatcher for
  * 'opcode' calls for 'cookie'; pass NULL to clear the slot.
  *
  * Only the opcodes that have a per-cookie table are accepted; any other
  * opcode trips an assertion and is otherwise ignored.  The shared
  * dispatcher for the opcode must already be registered, and a registered
  * handler may not be overwritten by another one.
  */
 void
 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
 {
 	cpl_handler_t *table;
 	uintptr_t *slot;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < NUM_CPL_COOKIES);
 	MPASS(t4_cpl_handler[opcode] != NULL);
 
 	/* Pick the per-cookie table that belongs to this opcode. */
 	switch (opcode) {
 	case CPL_SET_TCB_RPL:
 		table = set_tcb_rpl_handlers;
 		break;
 	case CPL_L2T_WRITE_RPL:
 		table = l2t_write_rpl_handlers;
 		break;
 	case CPL_ACT_OPEN_RPL:
 		table = act_open_rpl_handlers;
 		break;
 	case CPL_ABORT_RPL_RSS:
 		table = abort_rpl_rss_handlers;
 		break;
 	case CPL_FW4_ACK:
 		table = fw4_ack_handlers;
 		break;
 	default:
 		MPASS(0);
 		return;
 	}
 
 	slot = (uintptr_t *)&table[cookie];
 	MPASS(h == NULL || *slot == (uintptr_t)NULL);
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Called on MOD_LOAD.  Sanitizes the SGE tunables (each bad value is
  * reported and replaced by its default), sets up the extfree counters and
  * registers the CPL and firmware message handlers owned by this file.
  */
 void
 t4_sge_modload(void)
 {
 
 	/* fl_pktshift: only 0 through 7 are accepted. */
 	if (!(fl_pktshift >= 0 && fl_pktshift <= 7)) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 0 instead.\n", fl_pktshift);
 		fl_pktshift = 0;
 	}
 
 	/* spg_len: 64 or 128; anything else is replaced, -1 silently. */
 	if (!(spg_len == 64 || spg_len == 128)) {
 		int dflt = 64;
 
 #if defined(__i386__) || defined(__amd64__)
 		if (cpu_clflush_line_size > 64)
 			dflt = 128;
 #endif
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, dflt);
 		}
 		spg_len = dflt;
 	}
 
 	/* cong_drop: -1, 0 or 1. */
 	if (!(cong_drop >= -1 && cong_drop <= 1)) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	/* tscale: 1, or 3 through 17. */
 	if (tscale != 1 && !(tscale >= 3 && tscale <= 17)) {
 		printf("Invalid hw.cxgbe.tscale value (%d),"
 		    " using 1 instead.\n", tscale);
 		tscale = 1;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	counter_u64_zero(extfree_rels);
 
 	/*
 	 * The shared dispatchers go in first: registering a per-cookie
 	 * handler asserts that the dispatcher for its opcode is present.
 	 */
 	t4_init_shared_cpl_handlers();
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 #ifdef RATELIMIT
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
 	    CPL_COOKIE_ETHOFLD);
 #endif
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 /*
  * Releases the two extfree counters that t4_sge_modload() allocated.
  */
 void
 t4_sge_modunload(void)
 {
 	/* Free in the same order in which they were allocated. */
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 }
 
 /*
  * Returns the difference between the extfree reference and release
  * counters.
  */
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	uint64_t released, referenced;
 
 	/* The release counter is sampled first, the reference counter second. */
 	released = counter_u64_fetch(extfree_rels);
 	referenced = counter_u64_fetch(extfree_refs);
 
 	return (referenced - released);
 }
 
 /*
  * Upper limit for the pack boundary that the driver picks on its own from
  * the PCIe max payload size.  Must not exceed 4096.
  */
 #define MAX_PACK_BOUNDARY 512
 
 /*
  * Program the ingress pad boundary (SGE_CONTROL) and, on chips other than
  * T4, the ingress pack boundary (SGE_CONTROL2).
  *
  * The fl_pad and fl_pack tunables are used when they are valid for the chip.
  * Otherwise the driver chooses a value and, unless the tunable was left at
  * its "driver decides" setting, reports the substitution.
  */
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	uint32_t v, m;
 	int pad, pack, pad_shift;
 
 	/* The encoding of the pad boundary field changes after T5. */
 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
 	    X_INGPADBOUNDARY_SHIFT;
 	pad = fl_pad;
 	if (fl_pad < (1 << pad_shift) ||
 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
 	    !powerof2(fl_pad)) {
 		/*
 		 * If there is any chance that we might use buffer packing and
 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
 		 * it to the minimum allowed in all other cases.
 		 */
 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
 
 		/*
 		 * For fl_pad = 0 we'll still write a reasonable value to the
 		 * register but all the freelists will opt out of padding.
 		 * We'll complain here only if the user tried to set it to a
 		 * value greater than 0 that was invalid.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	/* The register field holds log2(pad) relative to pad_shift. */
 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	/* T4 has no separate pack boundary: it always equals the pad. */
 	if (is_t4(sc)) {
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	pack = fl_pack;
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack)) {
 		/*
 		 * Derive a default from the PCIe max payload size, capped at
 		 * MAX_PACK_BOUNDARY and no smaller than a cache line, then
 		 * force it into the set of legal values (16, or 64..4096).
 		 */
 		if (sc->params.pci.mps > MAX_PACK_BOUNDARY)
 			pack = MAX_PACK_BOUNDARY;
 		else
 			pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		if (pack == 32)
 			pack = 64;
 		if (pack > 4096)
 			pack = 4096;
 		/* -1 means "driver decides", so stay quiet in that case. */
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 	/* Field encoding: 0 selects 16 bytes, otherwise log2(pack) - 5. */
 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	if (pack == 16)
 		v = V_INGPACKBOUNDARY(0);
 	else
 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
 }
 
 /*
  * Program the chip-wide SGE, TP and ULP_RX settings that this driver relies
  * on, using the module tunables validated in t4_sge_modload().
  *
  * adap->params.vpd.cclk must be set up before this is called.  The caller
  * must be the master PF (asserted).
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i, reg;
 	uint32_t v, m;
 	/* Holdoff timer values, in the unit us_to_core_ticks() expects. */
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	/* Largest timer value that fits in a TIMERVALUE field. */
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	/* Packet shift, rx CPL mode and egress status page size. */
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	/* Same host page size for all eight PFs. */
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	/*
 	 * Freelist buffer sizes.  Slots 0 and 1 get fixed values; after that
 	 * every software buffer size takes two consecutive slots: the full
 	 * size and the size less CL_METADATA_SIZE.
 	 */
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
 	reg = A_SGE_FL_BUFFER_SIZE2;
 	for (i = 0; i < nitems(sw_buf_sizes); i++) {
 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
 		t4_write_reg(sc, reg, sw_buf_sizes[i]);
 		reg += 4;
 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
 		t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE);
 		reg += 4;
 	}
 
 	/* Interrupt packet-count thresholds. */
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	/*
 	 * Make every holdoff timer fit within timer_max.  A value that is
 	 * too large is repeatedly averaged with its predecessor until it
 	 * fits; the last entry is simply clamped to timer_max.
 	 */
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		/* Timer scale: field is 0 for tscale 1, else tscale - 2. */
 		m = V_TSCALE(M_TSCALE);
 		if (tscale == 1)
 			v = 0;
 		else
 			v = V_TSCALE(tscale - 2);
 		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
 
 		/* Read-modify-write of TP_CMM_CONFIG, debug flag only. */
 		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
 			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(M_WRTHRTHRESH);
 			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 			v &= ~m;
 			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(16);
 			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 		}
 	}
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	/* Indicate size plus the two DDP offset flags. */
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * Tells whether a hardware freelist buffer size is usable by the driver.
  *
  * SGE wants the buffer to be at least 64B and then a multiple of 16.  Its
  * address must be 16B aligned.  If padding is in use the buffer's start and
  * end need to be aligned to the pad boundary as well.  Only the size is
  * checked here (it has to be a multiple of the pad boundary); aligning the
  * start of the buffer is the job of the buffer allocation code.
  *
  * Returns 1 when hwsz is acceptable and 0 when it is not.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int align;
 
 	if (fl_pad)
 		align = sc->params.sge.pad_boundary;
 	else
 		align = 16;
 
 	if (hwsz < 64)
 		return (0);
 
 	return ((hwsz & (align - 1)) == 0);
 }
 
 /*
  * Verify that the chip settings match what the driver expects and work out
  * which hardware freelist buffer sizes back each software buffer size.
  *
  * Fills in sc->sge.rx_buf_info[] and sc->sge.safe_zidx, and sets
  * BUF_PACKING_OK in sc->flags if any software size has a usable packing
  * size.  Returns 0 on success, or EINVAL if a checked setting is wrong
  * (every mismatch is reported, the function does not stop at the first).
  *
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct rx_buf_info *rxb;
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sc->params.sge.sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 *
 	 * NOTE(review): r still holds the SGE_CONTROL value read above, so
 	 * the message below prints that instead of anything related to
 	 * SGE_HOST_PAGE_SIZE.  Consider printing sp->page_shift.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * For every software buffer size look for:
 	 *  hwidx1/size1: the first usable hardware size that is exactly
 	 *                equal to the software size.
 	 *  hwidx2/size2: a usable hardware size that still leaves room for
 	 *                CL_METADATA_SIZE in the same buffer (for packing).
 	 *                An exact fit ends the search.
 	 */
 	s->safe_zidx = -1;
 	rxb = &s->rx_buf_info[0];
 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
 		rxb->size1 = sw_buf_sizes[i];
 		rxb->zone = m_getzone(rxb->size1);
 		rxb->type = m_gettype(rxb->size1);
 		rxb->size2 = 0;
 		rxb->hwidx1 = -1;
 		rxb->hwidx2 = -1;
 		for (j = 0; j < SGE_FLBUF_SIZES; j++) {
 			int hwsize = sp->sge_fl_buffer_size[j];
 
 			if (!hwsz_ok(sc, hwsize))
 				continue;
 
 			/* hwidx for size1 */
 			if (rxb->hwidx1 == -1 && rxb->size1 == hwsize)
 				rxb->hwidx1 = j;
 
 			/* hwidx for size2 (buffer packing) */
 			if (rxb->size1 - CL_METADATA_SIZE < hwsize)
 				continue;
 			/* n: bytes left over after hwsize and the metadata. */
 			n = rxb->size1 - hwsize - CL_METADATA_SIZE;
 			if (n == 0) {
 				rxb->hwidx2 = j;
 				rxb->size2 = hwsize;
 				break;	/* stop looking */
 			}
 			if (rxb->hwidx2 != -1) {
 				if (n < sp->sge_fl_buffer_size[rxb->hwidx2] -
 				    hwsize - CL_METADATA_SIZE) {
 					rxb->hwidx2 = j;
 					rxb->size2 = hwsize;
 				}
 			} else if (n <= 2 * CL_METADATA_SIZE) {
 				rxb->hwidx2 = j;
 				rxb->size2 = hwsize;
 			}
 		}
 		if (rxb->hwidx2 != -1)
 			sc->flags |= BUF_PACKING_OK;
 		if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster)
 			s->safe_zidx = i;
 	}
 
 	/*
 	 * The remaining checks read registers and are skipped for a VF.
 	 * NOTE(review): this returns 0 even if rc was set to EINVAL by one of
 	 * the checks above -- confirm that this is intended.
 	 */
 	if (sc->flags & IS_VF)
 		return (0);
 
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * Only F_TDDPTAGTCB is verified here even though
 	 * t4_tweak_chip_settings() also sets F_ISCSITAGTCB.
 	 */
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	t4_init_tp_params(sc, 1);
 
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Add the SGE related read-only sysctls under 'children': the freelist
  * buffer sizes (via sysctl_bufsizes) and the values of fl_pktshift, the pad
  * boundary, the status page length, cong_drop and the pack boundary.
  */
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, sc, 0,
 	    sysctl_bufsizes, "A", "freelist buffer sizes");
 
 	/* The integer nodes below export a value, not a pointer. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, sc->params.sge.fl_pktshift,
 	    "payload DMA offset in rx buffer (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
 	    NULL, sc->params.sge.pad_boundary,
 	    "payload pad boundary (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
 	    NULL, sc->params.sge.spg_len, "status page size (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop, "congestion drop setting");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
 	    NULL, sc->params.sge.pack_boundary,
 	    "payload pack boundary (bytes)");
 }
 
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 	if (sc->dmat)
 		bus_dma_tag_destroy(sc->dmat);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the queues owned by the adapter itself: the
  * firmware event queue and, except in the VF driver, one control queue per
  * port.
  *
  * Returns 0 on success and an errno on failure.  Nothing is rolled back on
  * failure: whatever was allocated up to that point stays allocated and the
  * caller is responsible for the cleanup.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	struct sysctl_oid_list *children;
 	struct sysctl_oid *ctrlq_oid;
 	int i, rc;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	sysctl_ctx_init(&sc->ctx);
 	sc->flags |= ADAP_SYSCTL_CTX;
 
 	/*
 	 * The firmware event queue comes first.  It is all the VF driver
 	 * needs, so stop here for a VF as well as on failure.
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0 || (sc->flags & IS_VF))
 		return (rc);
 
 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
 
 	/*
 	 * XXX: General purpose rx queues, one per port.
 	 */
 
 	/*
 	 * Control queues, one per port, all under a common "ctrlq" node.
 	 */
 	ctrlq_oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "ctrlq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues");
 	for_each_port(sc, i) {
 		rc = alloc_ctrlq(sc, &sc->sge.ctrlq[i], i, ctrlq_oid);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 	int i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* Do this before freeing the queue */
 	if (sc->flags & ADAP_SYSCTL_CTX) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	if (!(sc->flags & IS_VF)) {
 		for_each_port(sc, i)
 			free_wrq(sc, &sc->sge.ctrlq[i]);
 	}
 	free_fwq(sc);
 
 	return (0);
 }
 
 /* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
 mtu_to_max_payload(struct adapter *sc, int mtu)
 {
 	int maxp;
 
 	/*
 	 * The VLAN encapsulation length is always included so that the result
 	 * is large enough even when hw VLAN extraction is disabled.
 	 */
 	maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
 	    ETHER_VLAN_ENCAP_LEN + mtu;
 
 	return (maxp);
 }
 
 /*
  * Allocate all the queues of a virtual interface and hang a sysctl node for
  * each queue group off the VI's sysctl tree.  The groups are created in this
  * order: netmap rx and tx queues (DEV_NETMAP, only if the ifnet advertises
  * IFCAP_NETMAP), NIC rx queues, offload rx queues (TCP_OFFLOAD), NIC tx queues,
  * and offload tx queues (TCP_OFFLOAD or RATELIMIT).
  *
  * Returns 0 on success.  On the first allocation failure it calls
  * t4_teardown_vi_queues() and returns that allocation's error code.
  */
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, intr_idx, iqidx;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	char name[16];
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int maxp, mtu = ifp->if_mtu;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = vi->first_intr;
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(!forwarding_intr_to_fwq(sc));
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues");
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues");
 		for_each_nm_txq(vi, i, nm_txq) {
 			/* netmap tx queues are spread round-robin over the netmap rx queues. */
 			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * Allocate rx queues first because a default iqid is required when
 	 * creating a tx queue.
 	 */
 	maxp = mtu_to_max_payload(sc, mtu);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues");
 	for_each_rxq(vi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		/* -1 is passed as the vector when interrupts are forwarded to the fwq. */
 		rc = alloc_rxq(vi, rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #ifdef DEV_NETMAP
 	/* Continue past whichever of the two rx queue sets used more vectors. */
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues for offloaded TCP connections");
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
 		    vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		rc = alloc_ofld_rxq(vi, ofld_rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #endif
 
 	/*
 	 * Now the tx queues.
 	 */
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues");
 	for_each_txq(vi, i, txq) {
 		/* Each tx queue is paired with one of the VI's rx queues, round-robin. */
 		iqidx = vi->first_rxq + (i % vi->nrxq);
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
 		    sc->sge.rxq[iqidx].iq.cntxt_id, name);
 
 		rc = alloc_txq(vi, txq, i, oid);
 		if (rc != 0)
 			goto done;
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues for TOE/ETHOFLD");
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		struct sysctl_oid *oid2;
 
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), i);
 		/*
 		 * Pair with an offload rx queue when the VI has any, otherwise
 		 * with one of the regular rx queues.
 		 */
 		if (vi->nofldrxq > 0) {
 			iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.ofld_rxq[iqidx].iq.cntxt_id,
 			    name);
 		} else {
 			iqidx = vi->first_rxq + (i % vi->nrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name);
 		}
 
 		/* 'name' is reused here for the per-queue sysctl node name. */
 		snprintf(name, sizeof(name), "%d", i);
 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue");
 
 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
 		if (rc != 0)
 			goto done;
 	}
 #endif
 done:
 	/* Any failure above releases everything set up so far. */
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 
 	/* Do this before freeing the queues */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 #ifdef DEV_NETMAP
 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * Take down all the tx queues first, as they reference the rx queues
 	 * (for egress updates, etc.).
 	 */
 
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Handler for the unusual case where the driver has exactly one interrupt.
  * It runs the error handler and then services the firmware event queue, which
  * is taking interrupts for all other queues.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *sc = arg;
 
 	MPASS(sc->intr_count == 1);
 
 	/* With INTx, 0 is written to this PF's PCIE_PF_CLI register first. */
 	if (sc->intr_type == INTR_INTX)
 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 
 	t4_intr_err(sc);
 	t4_intr_evt(&sc->sge.fwq);
 }
 
 /*
  * Error interrupt handler.  Installed directly when multiple interrupts are in
  * use, and called from t4_intr_all otherwise.  Does nothing once ADAP_ERR is
  * set on the adapter.
  */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *sc = arg;
 	const bool chatty = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
 	uint32_t cause;
 
 	if ((sc->flags & ADAP_ERR) != 0)
 		return;
 
 	/* Count a software interrupt and write the cause value back. */
 	cause = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
 	if ((cause & F_PFSW) != 0) {
 		sc->swintr++;
 		t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), cause);
 	}
 
 	t4_slow_intr_handler(sc, chatty);
 }
 
 /*
  * Interrupt handler for queues that have an iq but no freelist.  Right now the
  * firmware event queue is the only one.
  */
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	/* Nothing to do unless this call moves the iq from IDLE to BUSY. */
 	if (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq(iq, 0);
 	(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 }
 
 /*
  * Interrupt handler for queues that have both an iq and a freelist.
  */
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	/* Nothing to do unless this call moves the iq from IDLE to BUSY. */
 	if (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY))
 		return;
 
 	service_iq_fl(iq, 0);
 	(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 }
 
 #ifdef DEV_NETMAP
 /*
  * Interrupt handler for a netmap rx queue.  The queue is serviced only if
  * this call can move it from NM_ON to NM_BUSY.
  */
 void
 t4_nm_intr(void *arg)
 {
 	struct sge_nm_rxq *nm_rxq = arg;
 
 	if (!atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY))
 		return;
 
 	service_nm_rxq(nm_rxq);
 	(void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON);
 }
 
 /*
  * Interrupt handler for a vector that a NIC rx queue and a netmap rx queue
  * share.  The netmap queue is handled first, then the NIC queue.
  */
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *irq;
 
 	irq = arg;
 
 	/* netmap rx queue */
 	MPASS(irq->nm_rxq != NULL);
 	t4_nm_intr(irq->nm_rxq);
 
 	/* NIC rx queue */
 	MPASS(irq->rxq != NULL);
 	t4_intr(irq->rxq);
 }
 #endif
 
 /*
  * Deals with interrupts on an iq-only (no freelist) queue.
  *
  * Walks the descriptor ring for as long as the generation bit of the current
  * descriptor matches iq->gen and dispatches each response by type.  Forwarded
  * interrupts for other queues are serviced via service_iq_fl(); a queue that
  * does not finish within its budget is parked on a local list and retried
  * after the ring has been drained.
  *
  * 'budget' of 0 means no limit; the consumer index is then pushed to the
  * hardware every iq->qsize / 16 descriptors.  With a nonzero budget the
  * function returns EINPROGRESS as soon as that many descriptors have been
  * processed.  Otherwise it returns 0 after writing the final cidx update with
  * the queue's own interrupt parameters.
  *
  * The caller must have already put the iq in the IQS_BUSY state.
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type;
 	uint32_t lq;
 	/* Queues that still had work left after their budget ran out. */
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	KASSERT((iq->flags & IQ_HAS_FL) == 0,
 	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
 	    iq->flags));
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
 
 	limit = budget ? budget : iq->qsize / 16;
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			/* Read barrier between the gen check and the reads of the descriptor. */
 			rmb();
 
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 				panic("%s: data for an iq (%p) with no freelist",
 				    __func__, iq);
 
 				/* NOTREACHED */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (__predict_true(lq >= 1024)) {
 					t4_an_handler(iq, &d->rsp);
 					break;
 				}
 
 				/* Look up the queue that the interrupt was forwarded for. */
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq_fl(q, q->qsize / 16) == 0) {
 						(void) atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						/* Left BUSY; finished later from the list. */
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			/* Advance; the expected gen bit flips when the ring wraps. */
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			if (__predict_false(++ndescs == limit)) {
 				/* Return the processed descriptors to the hardware. */
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 				if (budget) {
 					return (EINPROGRESS);
 				}
 			}
 		}
 
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq_fl(q, q->qsize / 8) == 0)
 			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 	/* Final cidx update, written with the queue's own interrupt parameters. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	return (0);
 }
 
 /*
  * Returns 1 if lro_mbuf_max is nonzero in the LRO context and 0 otherwise.
  * Callers use this to decide whether received mbufs are queued for sorting
  * before LRO.
  */
 static inline int
 sort_before_lro(struct lro_ctrl *lro)
 {
 
 	if (lro->lro_mbuf_max == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Scale the low 60 bits of the big-endian value 'lf' by 1000000 / cclk.
  */
 static inline uint64_t
 last_flit_to_ns(struct adapter *sc, uint64_t lf)
 {
 	const uint64_t stamp = be64toh(lf) & 0xfffffffffffffff;	/* 60b, not 64b. */
 
 	/* Divide first when multiplying first would overflow 64 bits. */
 	if (stamp > UINT64_MAX / 1000000)
 		return (stamp / sc->params.vpd.cclk * 1000000);
 
 	return (stamp * 1000000 / sc->params.vpd.cclk);
 }
 
 /*
  * Step the freelist's consumer index to the next rx buffer and reset the
  * offset within the buffer.  Each time the index reaches a multiple of 8 the
  * hw_cidx is updated to cidx / 8, and both wrap to 0 when that value reaches
  * fl->sidx.
  */
 static inline void
 move_to_next_rxbuf(struct sge_fl *fl)
 {
 	uint16_t hw;
 
 	fl->rx_offset = 0;
 	fl->cidx++;
 	if (__predict_true((fl->cidx & 7) != 0))
 		return;
 
 	hw = fl->cidx >> 3;
 	if (__predict_false(hw == fl->sidx)) {
 		hw = 0;
 		fl->cidx = 0;
 	}
 	fl->hw_cidx = hw;
 }
 
 /*
  * Deals with interrupts on an iq+fl queue.
  *
  * Processes descriptors for as long as the generation bit of the current
  * descriptor matches iq->gen.  Freelist payload is either handed to eth_rx()
  * (CPL_RX_PKT) or assembled with get_fl_payload() and passed to the CPL
  * handler for the opcode.
  *
  * 'budget' of 0 means no limit; the consumer index is then pushed to the
  * hardware every iq->qsize / 16 descriptors.  With a nonzero budget the
  * function returns EINPROGRESS once that many descriptors have been processed.
  * In all other cases, including when eth_rx() or get_fl_payload() fails, it
  * flushes or holds back LRO state, writes the final cidx update, refills the
  * freelist, and returns 0.
  *
  * The caller must have already put the iq in the IQS_BUSY state.
  */
 static int
 service_iq_fl(struct sge_iq *iq, int budget)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct sge_fl *fl;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs, limit;
 	int rsp_type, starved;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	MPASS(iq->flags & IQ_HAS_FL);
 
 	ndescs = 0;
 #if defined(INET) || defined(INET6)
 	/*
 	 * A previous call held back one credit (see the end of this function).
 	 * If nothing new has arrived, flush LRO and return that credit now;
 	 * otherwise count it along with the descriptors processed below.
 	 */
 	if (iq->flags & IQ_ADJ_CREDIT) {
 		MPASS(sort_before_lro(lro));
 		iq->flags &= ~IQ_ADJ_CREDIT;
 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
 			tcp_lro_flush_all(lro);
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
 			    V_INGRESSQID((u32)iq->cntxt_id) |
 			    V_SEINTARM(iq->intr_params));
 			return (0);
 		}
 		ndescs = 1;
 	}
 #else
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 #endif
 
 	limit = budget ? budget : iq->qsize / 16;
 	fl = &rxq->fl;
 	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 		/* Read barrier between the gen check and the reads of the descriptor. */
 		rmb();
 
 		m0 = NULL;
 		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 		lq = be32toh(d->rsp.pldbuflen_qid);
 
 		switch (rsp_type) {
 		case X_RSPD_TYPE_FLBUF:
 			/*
 			 * NEWBUF set: the payload begins in a fresh buffer, so
 			 * leave a partially used one behind, and reduce lq to
 			 * just the length.
 			 */
 			if (lq & F_RSPD_NEWBUF) {
 				if (fl->rx_offset > 0)
 					move_to_next_rxbuf(fl);
 				lq = G_RSPD_LEN(lq);
 			}
 			/* Refill once hw_cidx has moved more than 4 since the snapshot. */
 			if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 64);
 				FL_UNLOCK(fl);
 				fl_hw_cidx = fl->hw_cidx;
 			}
 
 			if (d->rss.opcode == CPL_RX_PKT) {
 				if (__predict_true(eth_rx(sc, rxq, d, lq) == 0))
 					break;
 				goto out;
 			}
 			m0 = get_fl_payload(sc, fl, lq);
 			if (__predict_false(m0 == NULL))
 				goto out;
 
 			/* fall through */
 
 		case X_RSPD_TYPE_CPL:
 			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
 			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 			break;
 
 		case X_RSPD_TYPE_INTR:
 
 			/*
 			 * There are 1K interrupt-capable queues (qids 0
 			 * through 1023).  A response type indicating a
 			 * forwarded interrupt with a qid >= 1K is an
 			 * iWARP async notification.  That is the only
 			 * acceptable indirect interrupt on this queue.
 			 */
 			if (__predict_false(lq < 1024)) {
 				panic("%s: indirect interrupt on iq_fl %p "
 				    "with qid %u", __func__, iq, lq);
 			}
 
 			t4_an_handler(iq, &d->rsp);
 			break;
 
 		default:
 			KASSERT(0, ("%s: illegal response type %d on iq %p",
 			    __func__, rsp_type, iq));
 			log(LOG_ERR, "%s: illegal response type %d on iq %p",
 			    device_get_nameunit(sc->dev), rsp_type, iq);
 			break;
 		}
 
 		/* Advance; the expected gen bit flips when the ring wraps. */
 		d++;
 		if (__predict_false(++iq->cidx == iq->sidx)) {
 			iq->cidx = 0;
 			iq->gen ^= F_RSPD_GEN;
 			d = &iq->desc[0];
 		}
 		if (__predict_false(++ndescs == limit)) {
 			/* Return the processed descriptors to the hardware. */
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 			    V_INGRESSQID(iq->cntxt_id) |
 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 
 #if defined(INET) || defined(INET6)
 			if (iq->flags & IQ_LRO_ENABLED &&
 			    !sort_before_lro(lro) &&
 			    sc->lro_timeout != 0) {
 				tcp_lro_flush_inactive(lro, &lro_timeout);
 			}
 #endif
 			if (budget)
 				return (EINPROGRESS);
 			ndescs = 0;
 		}
 	}
 out:
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
 			MPASS(sort_before_lro(lro));
 			/* hold back one credit and don't flush LRO state */
 			iq->flags |= IQ_ADJ_CREDIT;
 			ndescs--;
 		} else {
 			tcp_lro_flush_all(lro);
 		}
 	}
 #endif
 
 	/* Final cidx update, written with the queue's own interrupt parameters. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	/* Top up the freelist; a nonzero result puts it on the starving list. */
 	FL_LOCK(fl);
 	starved = refill_fl(sc, fl, 64);
 	FL_UNLOCK(fl);
 	if (__predict_false(starved != 0))
 		add_fl_to_sfl(sc, fl);
 
 	return (0);
 }
 
 /* The cluster metadata is located sd->moff bytes into the cluster. */
 static inline struct cluster_metadata *
 cl_metadata(struct fl_sdesc *sd)
 {
 	void *md;
 
 	md = (void *)(sd->cl + sd->moff);
 
 	return (md);
 }
 
 /*
  * Free routine passed to m_extaddref for rx buffers.  ext_arg1 of the mbuf is
  * the cluster metadata; the cluster goes back to the zone recorded there and
  * the extfree_rels counter is incremented.
  */
 static void
 rxb_free(struct mbuf *m)
 {
 	struct cluster_metadata *md;
 
 	md = m->m_ext.ext_arg1;
 	uma_zfree(md->zone, md->cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * Builds one mbuf for the part of a frame that lies in the freelist's current
  * hw buffer.  'fr_offset' is the number of bytes of the frame that precede
  * this segment and 'remaining' is the number of bytes still to be collected.
  * A packet header mbuf (with m_pkthdr.len set to 'remaining') is allocated
  * when fr_offset is 0 and a plain mbuf otherwise.  Returns NULL if the mbuf
  * allocation fails, in which case the freelist state is left untouched.
  *
  * The mbuf returned comes from zone_mbuf and carries the payload in one of
  * these ways
  * a) complete frame inside the mbuf
  * b) m_cljset (for clusters without metadata)
  * c) m_extaddref (cluster with metadata)
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	struct cluster_metadata *clm;
 	int len, blen;
 	caddr_t payload;
 
 	/*
 	 * len is the number of payload bytes taken from this buffer and blen
 	 * is how much of the buffer gets used up, padding included.
 	 */
 	if (fl->flags & FL_BUF_PACKING) {
 		u_int l, pad;
 
 		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
 		len = min(remaining, blen);
 		payload = sd->cl + fl->rx_offset;
 
 		/* Padding that rounds the end of this segment up to buf_boundary. */
 		l = fr_offset + len;
 		pad = roundup2(l, fl->buf_boundary) - l;
 		if (fl->rx_offset + len + pad < rxb->size2)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= rxb->size2);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 		blen = rxb->size1;
 		len = min(remaining, blen);
 		payload = sd->cl;
 	}
 
 	if (fr_offset == 0) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (__predict_false(m == NULL))
 			return (NULL);
 		m->m_pkthdr.len = remaining;
 	} else {
 		m = m_get(M_NOWAIT, MT_DATA);
 		if (__predict_false(m == NULL))
 			return (NULL);
 	}
 	m->m_len = len;
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->rx_offset += blen;
 			MPASS(fl->rx_offset <= rxb->size2);
 			if (fl->rx_offset < rxb->size2)
 				return (m);	/* without advancing the cidx */
 		}
 	} else if (fl->flags & FL_BUF_PACKING) {
 		/* The first mbuf to reference this cluster sets up its metadata. */
 		clm = cl_metadata(sd);
 		if (sd->nmbuf++ == 0) {
 			clm->refcount = 1;
 			clm->zone = rxb->zone;
 			clm->cl = sd->cl;
 			counter_u64_add(extfree_refs, 1);
 		}
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm,
 		    NULL);
 
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= rxb->size2);
 		if (fl->rx_offset < rxb->size2)
 			return (m);	/* without advancing the cidx */
 	} else {
 		m_cljset(m, sd->cl, rxb->type);
 		sd->cl = NULL;	/* consumed, not a recycle candidate */
 	}
 
 	/* Done with this hw buffer. */
 	move_to_next_rxbuf(fl);
 
 	return (m);
 }
 
 /*
  * Assembles an mbuf chain for a payload of 'plen' bytes out of the freelist,
  * one segment per hw buffer that the payload touches.  Returns the head of the
  * chain (a packet header mbuf), or NULL if an mbuf could not be allocated.
  *
  * If the allocation fails after the first segment, the partial chain and the
  * progress made are stashed in the fl and FL_BUF_RESUME is set.  The next call
  * is then expected to be for the same payload and picks up where this one
  * stopped.
  */
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == plen);
 		MPASS(fl->remaining < plen);
 
 		/* Restore the saved state and jump back into the loop below. */
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		goto get_segment;
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'plen' and it may span multiple hw buffers.
 	 */
 
 	m0 = get_scatter_segment(sc, fl, 0, plen);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = plen - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, plen - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			/* Save enough state to resume on the next call. */
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 /*
  * Advance the freelist past the part of a frame that lies in the current hw
  * buffer, without building an mbuf for it.  'fr_offset' is the number of bytes
  * of the frame before this segment and 'remaining' is the number of bytes
  * still to be skipped.  Returns the number of payload bytes skipped.
  */
 static int
 skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	int len, blen;
 	u_int l, pad;
 
 	if ((fl->flags & FL_BUF_PACKING) == 0) {
 		/* One buffer per segment; it is always used up entirely. */
 		MPASS(fl->rx_offset == 0);	/* not packing */
 		blen = rxb->size1;
 		len = min(remaining, blen);
 		move_to_next_rxbuf(fl);
 		return (len);
 	}
 
 	blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
 	len = min(remaining, blen);
 
 	/* Account for the padding up to buf_boundary if it fits in the buffer. */
 	l = fr_offset + len;
 	pad = roundup2(l, fl->buf_boundary) - l;
 	if (fl->rx_offset + len + pad < rxb->size2)
 		blen = len + pad;
 	fl->rx_offset += blen;
 	MPASS(fl->rx_offset <= rxb->size2);
 
 	/* The cidx advances only when the buffer has been used up. */
 	if (fl->rx_offset >= rxb->size2)
 		move_to_next_rxbuf(fl);
 
 	return (len);
 }
 
 /*
  * Discard a payload of 'plen' bytes from the freelist, one segment at a time.
  */
 static inline void
 skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen)
 {
 	int skipped, left, n;
 
 	for (skipped = 0, left = plen; left > 0; skipped += n, left -= n)
 		n = skip_scatter_segment(sc, fl, skipped, left);
 }
 
 /*
  * Number of bytes of a 'plen' byte payload that are available in the
  * freelist's current hw buffer.
  */
 static inline int
 get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen)
 {
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	int room;
 
 	if ((fl->flags & FL_BUF_PACKING) == 0)
 		room = rxb->size1;
 	else
 		room = rxb->size2 - fl->rx_offset;
 
 	return (min(plen, room));
 }
 
 /*
  * Receives one Ethernet frame described by iq descriptor 'd'.  'plen' is the
  * payload length in the freelist, including the fl_pktshift bytes that precede
  * the frame.
  *
  * If pfil hooks are active on the VI, they are run on the frame in place (in
  * the rx buffer) before any mbuf is allocated.  The frame is then converted to
  * an mbuf chain, tagged with the hash, checksum, VLAN and timestamp
  * information from the descriptor, and given to LRO or to if_input.
  *
  * Returns 0 if the frame was dealt with (delivered, queued for LRO, or dropped
  * or consumed by pfil) and ENOMEM if get_fl_payload() could not build the
  * mbuf chain.
  */
 static int
 eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d,
     u_int plen)
 {
 	struct mbuf *m0;
 	struct ifnet *ifp = rxq->ifp;
 	struct sge_fl *fl = &rxq->fl;
 	struct vi_info *vi = ifp->if_softc;
 	const struct cpl_rx_pkt *cpl;
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	/* Indexed by the descriptor's rss.hash_type and rss.ipv6 fields. */
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 
 	MPASS(plen > sc->params.sge.fl_pktshift);
 	/* The hooks are not run when resuming a partially assembled payload. */
 	if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
 	    __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
 		struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 		caddr_t frame;
 		int rc, slen;
 
 		/* Only the part of the frame in the current buffer is shown to pfil. */
 		slen = get_segment_len(sc, fl, plen) -
 		    sc->params.sge.fl_pktshift;
 		frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift;
 		CURVNET_SET_QUIET(ifp->if_vnet);
 		rc = pfil_run_hooks(vi->pfil, frame, ifp,
 		    slen | PFIL_MEMPTR | PFIL_IN, NULL);
 		CURVNET_RESTORE();
 		if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) {
 			skip_fl_payload(sc, fl, plen);
 			return (0);
 		}
 		if (rc == PFIL_REALLOCED) {
 			/* pfil made an mbuf of its own; the rx buffers are not needed. */
 			skip_fl_payload(sc, fl, plen);
 			m0 = pfil_mem2mbuf(frame);
 			goto have_mbuf;
 		}
 	}
 
 	m0 = get_fl_payload(sc, fl, plen);
 	if (__predict_false(m0 == NULL))
 		return (ENOMEM);
 
 	/* Trim the fl_pktshift bytes off the front of the chain. */
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 have_mbuf:
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]);
 	m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
 
 	/* The CPL_RX_PKT message is located right after the RSS header. */
 	cpl = (const void *)(&d->rss + 1);
 	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
 		if (ifp->if_capenable & IFCAP_RXCSUM &&
 		    cpl->l2info & htobe32(F_RXF_IP)) {
 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 		    cpl->l2info & htobe32(F_RXF_IP6)) {
 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
 			    CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		}
 
 		if (__predict_false(cpl->ip_frag))
 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		else
 			m0->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_ex) {
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 	if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
 		/*
 		 * Fill up rcv_tstmp but do not set M_TSTMP.
 		 * rcv_tstmp is not in the format that the
 		 * kernel expects and we don't want to mislead
 		 * it.  For now this is only for custom code
 		 * that knows how to interpret cxgbe's stamp.
 		 */
 		m0->m_pkthdr.rcv_tstmp =
 		    last_flit_to_ns(sc, d->rsp.u.last_flit);
 #ifdef notyet
 		m0->m_flags |= M_TSTMP;
 #endif
 	}
 
 #ifdef NUMA
 	m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
 #endif
 #if defined(INET) || defined(INET6)
 	/* Only frames with a TCP hash type are candidates for LRO. */
 	if (rxq->iq.flags & IQ_LRO_ENABLED &&
 	    (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
 	    M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
 		if (sort_before_lro(lro)) {
 			tcp_lro_queue_mbuf(lro, m0);
 			return (0); /* queued for sort, then LRO */
 		}
 		if (tcp_lro_rx(lro, m0, 0) == 0)
 			return (0); /* queued for LRO */
 	}
 #endif
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Must drain the wrq or make sure that someone else will.  'arg' is the wrq
  * and 'n' is not used.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *wrq = arg;
 	struct sge_eq *eq = &wrq->eq;
 
 	EQ_LOCK(eq);
 	/* Drain only if there is work pending and no WR is being built. */
 	if (!STAILQ_EMPTY(&wrq->wr_list) &&
 	    TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		drain_wrq_wr_list(wrq->adapter, wrq);
 	}
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Copies the work requests waiting on wrq->wr_list into the descriptor ring of
  * the wrq's egress queue, in order, for as long as the ring has room for the
  * next one.  Each work request that is copied is removed from the list and
  * freed.  The doorbell is rung whenever 16 or more descriptors have
  * accumulated, and once more at the end for any that are left over.
  *
  * The eq lock must be held, the list must not be empty, and there must be no
  * incomplete work requests on the wrq.
  */
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	dbdiff = 0;
 
 	do {
 		/* One descriptor is always left unused. */
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		n = howmany(wr->wr_len, EQ_ESIZE);
 		if (available < n)
 			break;
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			/* Fill up to the end of the ring and put the rest at the start. */
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		/*
 		 * When fewer than a quarter of the descriptors are available,
 		 * set the EQUIQ and EQUEQ flags in the work request, unless
 		 * eq->equiq is already set.
 		 */
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 		}
 
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		/* The work request has been copied out and is no longer needed. */
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Queue a work request for transmission on the wrq.  This does not fail; a
  * work request that cannot be sent right away stays on the wrq's list.  The
  * eq lock must be held.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);
 
 	/* Append to the pending list and update the list's accounting. */
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 	wrq->nwr_pending++;
 
 	/*
 	 * If there are incomplete work requests then commit_wrq_wr will drain
 	 * wr_list as well, so the list is drained here only when there are
 	 * none.
 	 */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		drain_wrq_wr_list(sc, wrq);
 
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(eq->pidx == eq->dbidx);
 	}
 }
 
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	struct sge_fl *fl;
 	int i, maxp, mtu = ifp->if_mtu;
 
 	maxp = mtu_to_max_payload(sc, mtu);
 	for_each_rxq(vi, i, rxq) {
 		fl = &rxq->fl;
 
 		FL_LOCK(fl);
 		fl->zidx = find_refill_source(sc, maxp,
 		    fl->flags & FL_BUF_PACKING);
 		FL_UNLOCK(fl);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		fl = &ofld_rxq->fl;
 
 		FL_LOCK(fl);
 		fl->zidx = find_refill_source(sc, maxp,
 		    fl->flags & FL_BUF_PACKING);
 		FL_UNLOCK(fl);
 	}
 #endif
 }
 
 /* Segment count previously stored in the packet header by set_mbuf_nsegs. */
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 	int nsegs;
 
 	M_ASSERTPKTHDR(m);
 	nsegs = m->m_pkthdr.l5hlen;
 	KASSERT(nsegs > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
 	return (nsegs);
 }
 
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.l5hlen = nsegs;
 }
 
 /* Driver flags, kept in byte 4 of the packet header's PH_loc storage. */
 static inline int
 mbuf_cflags(struct mbuf *m)
 {
 	int cflags;
 
 	M_ASSERTPKTHDR(m);
 	cflags = m->m_pkthdr.PH_loc.eight[4];
 
 	return (cflags);
 }
 
 static inline void
 set_mbuf_cflags(struct mbuf *m, uint8_t flags)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[4] = flags;
 }
 
 /*
  * The len16 value kept in byte 0 of the packet header's PH_loc storage.  Its
  * range is asserted unless the mbuf has the MC_TLS flag.
  */
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[0];
 	if ((mbuf_cflags(m) & MC_TLS) == 0) {
 		MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 	}
 
 	return (n);
 }
 
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
 #ifdef RATELIMIT
 /* The eo segment count, kept in byte 1 of the packet header's PH_loc storage. */
 static inline int
 mbuf_eo_nsegs(struct mbuf *m)
 {
 	int nsegs;
 
 	M_ASSERTPKTHDR(m);
 	nsegs = m->m_pkthdr.PH_loc.eight[1];
 
 	return (nsegs);
 }
 
 static inline void
 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[1] = nsegs;
 }
 
 /*
  * The eo len16 value kept in byte 2 of the packet header's PH_loc storage.
  * Its range is asserted.
  */
 static inline int
 mbuf_eo_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 
 	n = m->m_pkthdr.PH_loc.eight[2];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 	return (n);
 }
 
 static inline void
 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[2] = len16;
 }
 
 /* The tsclk_tsoff value, kept in byte 3 of the packet header's PH_loc storage. */
 static inline int
 mbuf_eo_tsclk_tsoff(struct mbuf *m)
 {
 	int v;
 
 	M_ASSERTPKTHDR(m);
 	v = m->m_pkthdr.PH_loc.eight[3];
 
 	return (v);
 }
 
 static inline void
 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
 }
 
 /* True if there is a send tag and it is of the rate limit type. */
 static inline int
 needs_eo(struct cxgbe_snd_tag *cst)
 {
 
 	if (cst == NULL)
 		return (0);
 
 	return (cst->type == IF_SND_TAG_TYPE_RATE_LIMIT);
 }
 #endif
 
 /*
  * Allocate a single mbuf (never a chain, to keep the work request easy to
  * construct) that can hold a raw work request of 'len' bytes.  The mbuf is
  * marked MC_RAW_WR and its len16 is set.  Returns NULL if 'len' is larger than
  * MCLBYTES or if the allocation fails.
  */
 struct mbuf *
 alloc_wr_mbuf(int len, int how)
 {
 	struct mbuf *m;
 
 	if (len <= MHLEN) {
 		m = m_gethdr(how, MT_DATA);
 	} else if (len <= MCLBYTES) {
 		m = m_getcl(how, MT_DATA, M_PKTHDR);
 	} else {
 		/* Does not fit in one mbuf, even with a cluster. */
 		return (NULL);
 	}
 	if (m == NULL)
 		return (NULL);
 
 	m->m_len = len;
 	m->m_pkthdr.len = len;
 	set_mbuf_len16(m, howmany(len, 16));
 	set_mbuf_cflags(m, MC_RAW_WR);
 
 	return (m);
 }
 
 /* Nonzero if any of the IP, TCP, UDP or TSO checksum flags is set. */
 static inline int
 needs_hwcsum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_TSO));
 }
 
 /*
  * Non-zero if any CSUM_TSO bit is set in the packet header's csum_flags.
  */
 static inline int
 needs_tso(struct mbuf *m)
 {
 	int wanted;
 
 	M_ASSERTPKTHDR(m);
 
 	wanted = m->m_pkthdr.csum_flags & CSUM_TSO;
 
 	return (wanted);
 }
 
 /*
  * Non-zero if CSUM_IP or any CSUM_TSO bit is set in the packet header's
  * csum_flags.
  */
 static inline int
 needs_l3_csum(struct mbuf *m)
 {
 	int wanted;
 
 	M_ASSERTPKTHDR(m);
 
 	wanted = m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO);
 
 	return (wanted);
 }
 
 /*
  * Non-zero if CSUM_TCP, CSUM_TCP_IPV6, or any CSUM_TSO bit is set in the
  * packet header's csum_flags.
  */
 static inline int
 needs_tcp_csum(struct mbuf *m)
 {
 	int wanted;
 
 	M_ASSERTPKTHDR(m);
 	wanted = m->m_pkthdr.csum_flags &
 	    (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO);
 
 	return (wanted);
 }
 
 #ifdef RATELIMIT
 /*
  * Non-zero if any TCP or UDP checksum flag (IPv4 or IPv6 variant) or any
  * CSUM_TSO bit is set in the packet header's csum_flags.
  */
 static inline int
 needs_l4_csum(struct mbuf *m)
 {
 	int wanted;
 
 	M_ASSERTPKTHDR(m);
 
 	wanted = m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO);
 
 	return (wanted);
 }
 
 /*
  * Non-zero if CSUM_UDP or CSUM_UDP_IPV6 is set in the packet header's
  * csum_flags.
  */
 static inline int
 needs_udp_csum(struct mbuf *m)
 {
 	int wanted;
 
 	M_ASSERTPKTHDR(m);
 	wanted = m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6);
 
 	return (wanted);
 }
 #endif
 
 /*
  * Non-zero if the mbuf has M_VLANTAG set in m_flags.
  */
 static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 	int tagged;
 
 	M_ASSERTPKTHDR(m);
 
 	tagged = m->m_flags & M_VLANTAG;
 
 	return (tagged);
 }
 
 /*
  * Move the position described by (*pm, *poffset) forward by len bytes along
  * the mbuf chain.  On return *pm and *poffset describe the new position and
  * the return value is the address of the data at that position.
  *
  * len must be positive and the chain must be long enough; both are only
  * checked by assertions.
  */
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *cur = *pm;
 	int pos = *poffset;
 	uintptr_t addr;
 
 	MPASS(len > 0);
 
 	/*
 	 * Consume whole mbufs for as long as the target lies at or beyond the
 	 * end of the current one.
 	 */
 	while (pos + len >= cur->m_len) {
 		len -= cur->m_len - pos;
 		cur = cur->m_next;
 		pos = 0;
 		MPASS(cur != NULL);
 	}
 
 	/* The target is strictly inside the current mbuf. */
 	pos += len;
 	addr = mtod(cur, uintptr_t) + pos;
 
 	*poffset = pos;
 	*pm = cur;
 	return ((void *)addr);
 }
 
 /*
  * Count the gather list segments needed for the data of an unmapped
  * (MBUF_EXT_PGS) mbuf, ignoring the first 'skip' bytes of its data.
  *
  * The mbuf's header area, its pages, and its trailer area are visited in
  * that order.  A piece starts a new segment only if its physical address
  * differs from *nextaddr, the address just past the previously counted piece;
  * *nextaddr is updated after every piece so that the caller can carry
  * contiguity across mbufs.
  *
  * Returns the number of new segments.
  */
 static inline int
 count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
 {
-	struct mbuf_ext_pgs *ext_pgs;
 	vm_paddr_t paddr;
 	int i, len, off, pglen, pgoff, seglen, segoff;
 	int nsegs = 0;
 
 	MBUF_EXT_PGS_ASSERT(m);
-	ext_pgs = &m->m_ext_pgs;
 	/* off/len: starting offset and byte count still to be accounted for. */
 	off = mtod(m, vm_offset_t);
 	len = m->m_len;
 	off += skip;
 	len -= skip;
 
 	/* Header area: either skipped entirely or partially counted. */
-	if (ext_pgs->hdr_len != 0) {
-		if (off >= ext_pgs->hdr_len) {
-			off -= ext_pgs->hdr_len;
+	if (m->m_ext_pgs.hdr_len != 0) {
+		if (off >= m->m_ext_pgs.hdr_len) {
+			off -= m->m_ext_pgs.hdr_len;
 		} else {
-			seglen = ext_pgs->hdr_len - off;
+			seglen = m->m_ext_pgs.hdr_len - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			paddr = pmap_kextract(
 			    (vm_offset_t)&m->m_epg_hdr[segoff]);
 			if (*nextaddr != paddr)
 				nsegs++;
 			*nextaddr = paddr + seglen;
 		}
 	}
 	/* Pages: only the first page can start at a non-zero page offset. */
-	pgoff = ext_pgs->first_pg_off;
-	for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
-		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+	pgoff = m->m_ext_pgs.first_pg_off;
+	for (i = 0; i < m->m_ext_pgs.npgs && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(&m->m_ext_pgs, i, pgoff);
 		if (off >= pglen) {
 			/* This page lies entirely before the start offset. */
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		paddr = m->m_epg_pa[i] + segoff;
 		if (*nextaddr != paddr)
 			nsegs++;
 		*nextaddr = paddr + seglen;
 		pgoff = 0;
 	};
 	/* Whatever is left is in the trailer area. */
 	if (len != 0) {
-		seglen = min(len, ext_pgs->trail_len - off);
+		seglen = min(len, m->m_ext_pgs.trail_len - off);
 		len -= seglen;
 		paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]);
 		if (*nextaddr != paddr)
 			nsegs++;
 		*nextaddr = paddr + seglen;
 	}
 
 	return (nsegs);
 }
 
 
 /*
  * Count the gather list segments needed for an mbuf chain, ignoring the
  * first 'skip' bytes of the chain.
  *
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.  It is possible for this
  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
  *
  * MC_NOMAP is or'ed into *cflags if a counted mbuf has M_NOMAP set; *cflags
  * is never cleared here.
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
 {
 	vm_paddr_t nextaddr, paddr;
 	vm_offset_t va;
 	int len, nsegs;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len > 0);
 	MPASS(m->m_pkthdr.len >= skip);
 
 	nsegs = 0;
 	/* Physical address just past the last byte counted so far. */
 	nextaddr = 0;
 	for (; m; m = m->m_next) {
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;
 		/* This mbuf is skipped in its entirety. */
 		if (skip >= len) {
 			skip -= len;
 			continue;
 		}
 		if ((m->m_flags & M_NOMAP) != 0) {
 			*cflags |= MC_NOMAP;
 			nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
 			skip = 0;
 			continue;
 		}
 		va = mtod(m, vm_offset_t) + skip;
 		len -= skip;
 		skip = 0;
 		paddr = pmap_kextract(va);
 		nsegs += sglist_count((void *)(uintptr_t)va, len);
 		/*
 		 * The first segment of this mbuf continues the previous
 		 * mbuf's last segment, so it is not a new one.
 		 */
 		if (paddr == nextaddr)
 			nsegs--;
 		nextaddr = pmap_kextract(va + len - 1) + 1;
 	}
 
 	return (nsegs);
 }
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  *
  * On success 0 is returned and *mp points at the (possibly replaced) mbuf,
  * with the segment count, cflags, and WR length cached in its packet header.
  * On failure an errno is returned (EINVAL or EFBIG here, or whatever
  * t6_ktls_parse_pkt returned), the mbuf has been freed, and *mp is NULL.
  */
 int
 parse_pkt(struct adapter *sc, struct mbuf **mp)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	struct cxgbe_snd_tag *cst;
 #endif
 	uint16_t eh_type;
 	uint8_t cflags;
 
 	cflags = 0;
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 		/* Common error exit: every failure path below jumps here. */
 fail:
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0, 0, &cflags);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		cst = mst_to_cst(m0->m_pkthdr.snd_tag);
 	else
 		cst = NULL;
 #endif
 #ifdef KERN_TLS
 	/* Packets with a TLS send tag are handed to t6_ktls_parse_pkt. */
 	if (cst != NULL && cst->type == IF_SND_TAG_TYPE_TLS) {
 		int len16;
 
 		cflags |= MC_TLS;
 		set_mbuf_cflags(m0, cflags);
 		rc = t6_ktls_parse_pkt(m0, &nsegs, &len16);
 		if (rc != 0)
 			goto fail;
 		set_mbuf_nsegs(m0, nsegs);
 		set_mbuf_len16(m0, len16);
 		return (0);
 	}
 #endif
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		/* Only one defrag attempt is made. */
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	/*
 	 * A small packet spread over more than 2 segments is pulled up into
 	 * the first mbuf (not attempted for unmapped mbufs).
 	 */
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
 	    !(cflags & MC_NOMAP))) {
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	set_mbuf_cflags(m0, cflags);
 	/* VFs use a different WR length calculation. */
 	if (sc->flags & IS_VF)
 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
 #ifdef RATELIMIT
 	/*
 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
 	 * checksumming is enabled.  needs_l4_csum happens to check for all the
 	 * right things.
 	 */
 	if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) {
 		/* Drop the send tag and treat the packet as a regular one. */
 		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
 		m0->m_pkthdr.snd_tag = NULL;
 		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		cst = NULL;
 	}
 #endif
 
 	/* The header lengths are needed only for hw checksum/TSO or ethofld. */
 	if (!needs_hwcsum(m0)
 #ifdef RATELIMIT
    		 && !needs_eo(cst)
 #endif
 	)
 		return (0);
 
 	/* Work out the L2 header length and the ethertype. */
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	/* Work out the L3 header length. */
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = l3hdr;
 
 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
 
 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
 		break;
 	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
 		break;
 	}
 #endif
 	default:
 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
 		    " with the same INET/INET6 options as the kernel.",
 		    __func__, eh_type);
 	}
 
 #if defined(INET) || defined(INET6)
 	/* Work out the L4 header length. */
 	if (needs_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
 		/*
 		 * NOTE(review): 0x0101080a looks like NOP, NOP, timestamp
 		 * option (kind 8, length 10) right after the fixed TCP header
 		 * -- confirm.
 		 */
 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
 			set_mbuf_eo_tsclk_tsoff(m0,
 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
 		} else
 			set_mbuf_eo_tsclk_tsoff(m0, 0);
 	} else if (needs_udp_csum(m0)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
 	}
 #ifdef RATELIMIT
 	if (needs_eo(cst)) {
 		u_int immhdrs;
 
 		/* EO WRs have the headers in the WR and not the GL. */
 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
 		    m0->m_pkthdr.l4hlen;
 		/* Recount the segments with the headers skipped. */
 		cflags = 0;
 		nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
 		MPASS(cflags == mbuf_cflags(m0));
 		set_mbuf_eo_nsegs(m0, nsegs);
 		set_mbuf_eo_len16(m0,
 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
 	}
 #endif
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 /*
  * Reserve room for a work request of len16 16-byte units on a work request
  * queue and return a pointer at which the caller can build it.  The pointer
  * refers to one of three places:
  * - the hardware descriptor ring itself, when enough descriptors are free
  *   and the WR does not run past the end of the ring;
  * - the wrq's spill buffer (wrq->ss), when the WR would run past the end of
  *   the ring;
  * - the payload of a freshly allocated wrqe (cookie->pidx is set to -1),
  *   when WRs are already queued on wr_list or the ring is short on space.
  *
  * The cookie is filled in here.  Returns NULL if the wrqe allocation fails.
  */
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	/* Try to push out queued WRs first if nothing is in flight. */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 		/* Also reached via goto when the ring is short on space. */
 slowpath:
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		/* pidx of -1 marks a WR that lives in a wrqe. */
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	/* Reserve the descriptors and track the WR as incomplete. */
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	/* The WR would run past the end of the ring: use the spill buffer. */
 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 /*
  * Commit a work request that was built at the location handed out by
  * start_wrq_wr.  'w' and 'cookie' must be the values from that call.
  *
  * A WR held in a wrqe (cookie->pidx == -1) is passed to t4_wrq_tx.  A WR
  * built in the spill buffer is first copied into the descriptor ring in two
  * pieces.  The doorbell is rung only when this WR is the oldest incomplete
  * one; otherwise its descriptors are added to a neighbouring cookie so that
  * they are accounted for when that neighbour commits.
  */
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	if (__predict_false(w == &wrq->ss[0])) {
 		/* n: bytes from ss_pidx to the end of the ring. */
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		/* This is the oldest incomplete WR. */
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16) {
 			int available;
 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 			/*
 			 * Note that the WR via which we'll request tx updates
 			 * is at pidx and not eq->pidx, which has moved on
 			 * already.
 			 */
 			dst = (void *)&eq->desc[pidx];
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 			if (available < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 			}
 
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		} else {
 			/* Hand these descriptors to the next WR in line. */
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		/* Hand these descriptors to the previous WR in line. */
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Returns 1 when more than an eighth of the tx queue's descriptors (sidx / 8)
  * are available according to total_available_tx_desc, 0 otherwise.
  */
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *eq = r->cookie;
 
 	if (total_available_tx_desc(eq) > eq->sidx / 8)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Returns 1 for packets that need TSO and for mbufs flagged MC_RAW_WR or
  * MC_TLS, 0 for everything else.
  */
 static inline int
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
 
 	if (needs_tso(m))
 		return (1);
 
 	return ((mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0);
 }
 
 /*
  * Returns 0 only when EQ_ENABLED is set and EQ_QFLUSH is clear in the eq's
  * flags; returns 1 in every other case.
  */
 static inline int
 discard_tx(struct sge_eq *eq)
 {
 
 	if ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) == EQ_ENABLED)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Returns 1 if the work request's opcode is one of the types listed below,
  * 0 for any other opcode.
  */
 static inline int
 wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
 {
 	int can_update;
 
 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
 	case FW_ULPTX_WR:
 	case FW_ETH_TX_PKT_WR:
 	case FW_ETH_TX_PKTS_WR:
 	case FW_ETH_TX_PKTS2_WR:
 	case FW_ETH_TX_PKT_VM_WR:
 		can_update = 1;
 		break;
 	default:
 		can_update = 0;
 		break;
 	}
 
 	return (can_update);
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  *
  * Runs with the txq lock held for its whole duration.  If the eq is not in a
  * state to transmit (see discard_tx) every queued mbuf is freed and reported
  * as consumed.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
 {
 	struct sge_txq *txq = r->cookie;
 	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n, next_cidx;
 	struct mbuf *m0, *tail;
 	struct txpkts txp;
 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	MPASS(remaining > 0);	/* Must not be called without work to do. */
 	total = 0;
 
 	TXQ_LOCK(txq);
 	if (__predict_false(discard_tx(eq))) {
 		/* Drop everything that is queued and claim it as consumed. */
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, 2048);
 		total = remaining;
 		goto done;
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	/* Descriptors written but not yet announced via the doorbell. */
 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
 
 	while (remaining > 0) {
 
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		/* Try to reclaim descriptors if this packet does not fit. */
 		if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) {
 			MPASS(howmany(mbuf_len16(m0), EQ_ESIZE / 16) <= 64);
 			available += reclaim_tx_descs(txq, 64);
 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
 				break;	/* out of descriptors */
 		}
 
 		next_cidx = cidx + 1;
 		if (__predict_false(next_cidx == r->size))
 			next_cidx = 0;
 
 		/*
 		 * Pick the WR writer for this packet; each one returns the
 		 * number of descriptors it used.
 		 */
 		wr = (void *)&eq->desc[eq->pidx];
 		if (mbuf_cflags(m0) & MC_RAW_WR) {
 			total++;
 			remaining--;
 			n = write_raw_wr(txq, (void *)wr, m0, available);
 #ifdef KERN_TLS
 		} else if (mbuf_cflags(m0) & MC_TLS) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = t6_ktls_write_wr(txq,(void *)wr, m0,
 			    mbuf_nsegs(m0), available);
 #endif
 		} else if (sc->flags & IS_VF) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
 			    available);
 		} else if (remaining > 1 &&
 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
 
 			/* pkts at cidx, next_cidx should both be in txp. */
 			MPASS(txp.npkt == 2);
 			tail = r->items[next_cidx];
 			MPASS(tail->m_nextpkt == NULL);
 			ETHER_BPF_MTAP(ifp, m0);
 			ETHER_BPF_MTAP(ifp, tail);
 			m0->m_nextpkt = tail;
 
 			if (__predict_false(++next_cidx == r->size))
 				next_cidx = 0;
 
 			/*
 			 * Keep adding packets to the same WR, chaining them
 			 * via m_nextpkt, until one is refused or the ring
 			 * is exhausted.
 			 */
 			while (next_cidx != pidx) {
 				if (add_to_txpkts(r->items[next_cidx], &txp,
 				    available) != 0)
 					break;
 				tail->m_nextpkt = r->items[next_cidx];
 				tail = tail->m_nextpkt;
 				ETHER_BPF_MTAP(ifp, tail);
 				if (__predict_false(++next_cidx == r->size))
 					next_cidx = 0;
 			}
 
 			n = write_txpkts_wr(sc, txq, wr, m0, &txp, available);
 			total += txp.npkt;
 			remaining -= txp.npkt;
 		} else {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_wr(sc, txq, (void *)wr, m0, available);
 		}
 		MPASS(n >= 1 && n <= available);
 		if (!(mbuf_cflags(m0) & MC_TLS))
 			MPASS(n <= SGE_MAX_WR_NDESC);
 
 		available -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		/*
 		 * Set EQUIQ and EQUEQ in the WR when fewer than a quarter of
 		 * the descriptors are available (and eq->equiq was not
 		 * already set), or just EQUEQ once pidx is at least 32
 		 * descriptors past equeqidx.
 		 */
 		if (wr_can_update_eq(wr)) {
 			if (total_available_tx_desc(eq) < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >=
 			    32) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			}
 		}
 
 		/* Ring the doorbell mid-loop only if plenty of work remains. */
 		if (dbdiff >= 16 && remaining >= 4) {
 			ring_eq_db(sc, eq, dbdiff);
 			available += reclaim_tx_descs(txq, 4 * dbdiff);
 			dbdiff = 0;
 		}
 
 		cidx = next_cidx;
 	}
 	/* Announce whatever has not been announced yet. */
 	if (dbdiff != 0) {
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	}
 done:
 	TXQ_UNLOCK(txq);
 
 	return (total);
 }
 
 /*
  * Fill in the software state of an ingress queue before it is allocated.
  *
  * tmr_idx selects the interrupt holdoff timer and must be a valid index.
  * pktc_idx selects the packet count threshold; a negative value means the
  * counter is not used.  qsize is rounded up to a multiple of 16.
  */
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 
 	iq->adapter = sc;
 	iq->flags = 0;
 
 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 	if (pktc_idx >= 0) {
 		iq->intr_params |= F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	} else {
 		/* Counter not in use; park the index on the last counter. */
 		iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 	}
 
 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 /*
  * Fill in the software state of a freelist before it is allocated: its size,
  * lock name, whether buffer packing is used, and the refill zone indices.
  *
  * Buffer packing is considered only if the adapter has BUF_PACKING_OK set,
  * and then depends on the buffer_packing setting and the chip generation.
  */
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 
 	if ((sc->flags & BUF_PACKING_OK) != 0) {
 		/*
 		 * T4: disabled unless buffer_packing is 1.
 		 * T5+: enabled unless buffer_packing is 0.
 		 */
 		if (is_t4(sc) ? buffer_packing == 1 : buffer_packing != 0)
 			fl->flags |= FL_BUF_PACKING;
 	}
 
 	fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING);
 	fl->safe_zidx = sc->sge.safe_zidx;
 }
 
 /*
  * Fill in the software state of an egress queue before it is allocated: its
  * type (stored in the flags), tx channel, associated ingress queue id, usable
  * size, and lock name.
  */
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, uint16_t iqid, char *name)
 {
 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
 
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	eq->iqid = iqid;
 	eq->tx_chan = tx_chan;
 	eq->flags = eqtype & EQ_TYPEMASK;
 }
 
 /*
  * Create a DMA tag, allocate zeroed DMA memory of 'len' bytes in a single
  * segment with 512 byte alignment, and load it.  On success *tag, *map, *pa,
  * and *va describe the ring.
  *
  * Returns 0 on success or an errno.  On failure everything acquired here is
  * released and *tag, *map, *pa, and *va are cleared, so that a later
  * free_ring() on the same variables (for example from the caller's cleanup
  * path) does not release the same resources a second time.
  *
  * As before, the caller is expected to pass in zeroed *tag, *map, *pa, and
  * *va: they are handed to free_ring() if the very first step fails.
  */
 static int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
 		goto done;
 	}
 done:
 	if (rc != 0) {
 		free_ring(sc, *tag, *map, *pa, *va);
 
 		/* Do not leave stale handles behind for the caller. */
 		*tag = NULL;
 		*map = NULL;
 		*pa = 0;
 		*va = NULL;
 	}
 
 	return (rc);
 }
 
 /*
  * Release a descriptor ring.  Each step is performed only if the matching
  * argument is set: the map is unloaded if pa is non-zero, the memory is freed
  * if va is non-NULL, and the tag is destroyed if it is non-NULL.
  *
  * Always returns 0.
  */
 static int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 	if (pa != 0)
 		bus_dmamap_unload(tag, map);
 
 	if (va != NULL)
 		bus_dmamem_free(tag, va, map);
 
 	if (tag != NULL)
 		bus_dma_tag_destroy(tag);
 
 	return (0);
 }
 
 /*
  * Allocates the ring for an ingress queue and an optional freelist.  If the
  * freelist is specified it will be allocated and then associated with the
  * ingress queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  *
  * If the ingress queue will take interrupts directly then the intr_idx
  * specifies the vector, starting from 0.  -1 means the interrupts for this
  * queue should be forwarded to the fwq.
  *
  * A cong value >= 0 enables the congestion related settings below; a negative
  * value leaves them off.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     int intr_idx, int cong)
 {
 	int rc, i, cntxt_id;
 	size_t len;
 	struct fw_iq_cmd c;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = iq->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	__be32 v = 0;
 
 	/* Descriptor ring for the ingress queue. */
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	/* Build the FW_IQ_CMD that creates the queue(s) in the firmware. */
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (intr_idx < 0) {
 		/* Forwarded interrupts, all headed to fwq */
 		v |= F_FW_IQ_CMD_IQANDST;
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
 	} else {
 		KASSERT(intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
 	}
 
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 		/* Descriptor ring for the freelist. */
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc)
 			return (rc);
 
 		/* Allocate space for one software descriptor per buffer. */
 		rc = alloc_fl_sdesc(fl);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to setup fl software descriptors: %d\n",
 			    rc);
 			return (rc);
 		}
 
 		/* Starvation threshold and buffer boundary depend on packing. */
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 			fl->buf_boundary = sp->pack_boundary;
 		} else {
 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 			fl->buf_boundary = 16;
 		}
 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 			fl->buf_boundary = sp->pad_boundary;
 
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		/* Fetch burst sizes differ between T5 and earlier, and T6. */
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	/* Send the command; the reply is written back into c. */
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	/* Record the ids from the reply and reset the queue's software state. */
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->intr_next = iq->intr_params;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 	iq->flags |= IQ_ALLOCATED;
 
 	/* Make the iq discoverable by its context id. */
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 
 		iq->flags |= IQ_HAS_FL;
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = 0;
 
 		/* The freelist is tracked in the egress queue map. */
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.neq) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.neq - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		/*
 		 * Work out the doorbell for this freelist.  With user
 		 * doorbells the address is derived from the queue id, and the
 		 * qid written with the doorbell is 0 if the queue has a
 		 * doorbell segment of its own within the page.
 		 */
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	/* T5 and later PFs: program the congestion manager context. */
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			/* Bit i of cong sets bit 4 * i of val. */
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	return (0);
 }
 
 /*
  * Tear down an ingress queue and, if fl is not NULL, its freelist.
  *
  * Nothing is done if the iq was never initialized (iq->adapter is NULL).
  * If the queue was allocated in the firmware it is freed there first; a
  * failure at that step is returned without releasing anything else.
  * Otherwise the rings, software descriptors, and the freelist lock are
  * released and both structures are zeroed.
  *
  * vi may be NULL, in which case messages are logged against the adapter's
  * device.
  */
 static int
 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	struct adapter *sc = iq->adapter;
 	device_t dev;
 	int rc;
 
 	if (sc == NULL)
 		return (0);	/* nothing to do */
 
 	dev = (vi != NULL) ? vi->dev : sc->dev;
 
 	if ((iq->flags & IQ_ALLOCATED) != 0) {
 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
 		    (fl != NULL) ? fl->cntxt_id : 0xffff, 0xffff);
 		if (rc != 0) {
 			device_printf(dev,
 			    "failed to free queue %p: %d\n", iq, rc);
 			return (rc);
 		}
 		iq->flags &= ~IQ_ALLOCATED;
 	}
 
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 	bzero(iq, sizeof(*iq));
 
 	if (fl == NULL)
 		return (0);
 
 	free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc);
 	if (fl->sdesc != NULL)
 		free_fl_sdesc(sc, fl);
 	if (mtx_initialized(&fl->fl_lock))
 		mtx_destroy(&fl->fl_lock);
 	bzero(fl, sizeof(*fl));
 
 	return (0);
 }
 
 /*
  * Add read-only sysctls describing an ingress queue under 'oid': the bus
  * address and size of its descriptor ring, its absolute and SGE context ids,
  * and its consumer index.
  */
 static void
 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_iq *iq)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
 	    "bus address of descriptor ring");
 	/* Constant value captured at the time of this call. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	/* The next three are exported through the sysctl_uint16 handler. */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &iq->abs_id, 0,
 	    sysctl_uint16, "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &iq->cntxt_id, 0,
 	    sysctl_uint16, "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &iq->cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 }
 
 /*
  * Create an "fl" node under 'oid' and add read-only sysctls describing the
  * freelist to it: ring address and size, context id, padding and packing
  * settings, producer and consumer indices, and cluster usage counters.
  */
 static void
 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fl->ba, "bus address of descriptor ring");
 	/* dmalen, padding, and packing are constants captured at this call. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &fl->cntxt_id, 0,
 	    sysctl_uint16, "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	/* rx_offset is exported only when buffer packing is in use. */
 	if (fl->flags & FL_BUF_PACKING) {
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 /*
  * Set up the firmware event queue: an ingress queue without a freelist that
  * takes interrupts directly, plus a "fwq" sysctl node describing it.
  *
  * Returns 0 on success or the errno from alloc_iq_fl.
  */
 static int
 alloc_fwq(struct adapter *sc)
 {
 	struct sge_iq *fwq = &sc->sge.fwq;
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int rc, intr_idx;
 
 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
 
 	/* Vector 1 is used only by a PF that has more than one vector. */
 	intr_idx = 0;
 	if (!(sc->flags & IS_VF) && sc->intr_count > 1)
 		intr_idx = 1;
 
 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create firmware event queue: %d\n", rc);
 		return (rc);
 	}
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "firmware event queue");
 	add_iq_sysctls(&sc->ctx, oid, fwq);
 
 	return (0);
 }
 
 /*
  * Tear down the firmware event queue.  Returns whatever free_iq_fl returns.
  */
 static int
 free_fwq(struct adapter *sc)
 {
 	struct sge_iq *fwq = &sc->sge.fwq;
 
 	return (free_iq_fl(NULL, fwq, NULL));
 }
 
 /*
  * Set up control queue number 'idx': initialize its egress queue (using the
  * tx channel of port idx and the firmware event queue as its ingress queue),
  * create a sysctl node named after idx under 'oid', and allocate the queue.
  *
  * Returns whatever alloc_wrq returns.
  */
 static int
 alloc_ctrlq(struct adapter *sc, struct sge_wrq *ctrlq, int idx,
     struct sysctl_oid *oid)
 {
 	struct sysctl_oid_list *children;
 	char lockname[16];
 	char nodename[16];
 
 	snprintf(lockname, sizeof(lockname), "%s ctrlq%d",
 	    device_get_nameunit(sc->dev), idx);
 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan,
 	    sc->sge.fwq.cntxt_id, lockname);
 
 	children = SYSCTL_CHILDREN(oid);
 	snprintf(nodename, sizeof(nodename), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, nodename,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ctrl queue");
 
 	return (alloc_wrq(sc, NULL, ctrlq, oid));
 }
 
 /*
  * Translate a 'drop' setting into the congestion value for a port's queues:
  * -1 maps to -1, 1 maps to 0, and anything else maps to the port's
  * rx_e_chan_map.
  */
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	switch (drop) {
 	case -1:
 		return (-1);
 	case 1:
 		return (0);
 	default:
 		return (pi->rx_e_chan_map);
 	}
 }
 
 /*
  * Set up rx queue number 'idx' of a VI: allocate its ingress queue and
  * freelist, top up the freelist, initialize LRO (if INET or INET6 is
  * compiled in), and add the queue's sysctls under 'oid'.
  *
  * Returns 0 on success or an errno.  Nothing is released here on failure.
  */
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * iq_base is the difference between absolute and context ids.  It is
 	 * established by queue 0 and must be the same for all other queues.
 	 */
 	if (idx == 0)
 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 	else
 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 		    ("iq_base mismatch"));
 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero iq_base"));
 
 	/*
 	 * The freelist is just barely above the starvation threshold right now,
 	 * fill it up a bit more.
 	 */
 	FL_LOCK(&rxq->fl);
 	refill_fl(sc, &rxq->fl, 128);
 	FL_UNLOCK(&rxq->fl);
 
 #if defined(INET) || defined(INET6)
 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
 	if (rc != 0)
 		return (rc);
 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
 
 	if (vi->ifp->if_capenable & IFCAP_LRO)
 		rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 	if (vi->ifp->if_capenable & IFCAP_HWRXTSTMP)
 		rxq->iq.flags |= IQ_RX_TIMESTAMP;
 	rxq->ifp = vi->ifp;
 
 	children = SYSCTL_CHILDREN(oid);
 
 	/* The queue's sysctl node is named after its index. */
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	add_iq_sysctls(&vi->ctx, oid, &rxq->iq);
 #if defined(INET) || defined(INET6)
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
 
 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
 	return (rc);
 }
 
 /*
  * Tears down an rx queue: frees the LRO state if it was initialized (a
  * non-NULL lro.ifp marks that), then releases the ingress queue and freelist.
  * The rxq structure is zeroed only when free_iq_fl succeeds.
  *
  * Returns 0 or the error from free_iq_fl.
  */
 static int
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	int rc;
 
 #if defined(INET) || defined(INET6)
 	if (rxq->lro.ifp != NULL) {
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
 	if (rc != 0)
 		return (rc);
 
 	bzero(rxq, sizeof(*rxq));
 	return (0);
 }
 
 #ifdef TCP_OFFLOAD
 /*
  * Allocates offload rx queue 'idx' of a VI: the ingress queue plus freelist
  * (alloc_iq_fl, with 0 as its last argument) and the sysctl nodes for both
  * under a child of 'oid' named after the index.
  *
  * Returns 0 on success or the error from alloc_iq_fl.
  */
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid *qoid;
 	char name[16];
 	int rc;
 
 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
 	if (rc != 0)
 		return (rc);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	qoid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
 	add_iq_sysctls(&vi->ctx, qoid, &ofld_rxq->iq);
 	add_fl_sysctls(sc, &vi->ctx, qoid, &ofld_rxq->fl);
 
 	return (0);
 }
 
 /*
  * Releases an offload rx queue's ingress queue and freelist.  The structure
  * is zeroed only if free_iq_fl succeeds; its error is returned otherwise.
  */
 static int
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	int rc;
 
 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 	if (rc != 0)
 		return (rc);
 
 	bzero(ofld_rxq, sizeof(*ofld_rxq));
 	return (0);
 }
 #endif
 
 #ifdef DEV_NETMAP
 /*
  * Prepares netmap rx queue 'idx' of a VI: allocates the descriptor rings for
  * the ingress queue and its freelist, initializes the software state in
  * nm_rxq, and creates the sysctl nodes for the queue under 'oid'.  The
  * hardware context id is left at INVALID_NM_RXQ_CNTXT_ID here.
  *
  * Returns 0 on success or the error from alloc_ring.
  * NOTE(review): if the second alloc_ring fails the first ring is not released
  * here; presumably the caller cleans up with free_nm_rxq -- confirm.
  */
 static int
 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
     int idx, struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_oid_list *children;
 	struct sysctl_ctx_list *ctx;
 	char name[16];
 	size_t len;
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 
 	MPASS(na != NULL);
 
 	/* Ingress queue ring: qsize_rxq entries of IQ_ESIZE bytes each. */
 	len = vi->qsize_rxq * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
 	if (rc != 0)
 		return (rc);
 
 	/* Freelist ring: one entry per netmap rx descriptor plus spg_len. */
 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
 	if (rc != 0)
 		return (rc);
 
 	nm_rxq->vi = vi;
 	nm_rxq->nid = idx;
 	nm_rxq->iq_cidx = 0;
 	/* iq_sidx leaves out the spg_len bytes' worth of entries. */
 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	nm_rxq->fl_sidx = na->num_rx_desc;
 	nm_rxq->fl_sidx2 = nm_rxq->fl_sidx;	/* copy for rxsync cacheline */
 	nm_rxq->intr_idx = intr_idx;
 	nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID;
 
 	ctx = &vi->ctx;
 	children = SYSCTL_CHILDREN(oid);
 
 	/* Per-queue node, named after the queue index. */
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->iq_abs_id,
 	    0, sysctl_uint16, "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->iq_cntxt_id,
 	    0, sysctl_uint16, "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->iq_cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 
 	/* Freelist values live in an "fl" node below the queue's node. */
 	children = SYSCTL_CHILDREN(oid);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_rxq->fl_cntxt_id,
 	    0, sysctl_uint16, "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &nm_rxq->fl_cidx, 0, "consumer index");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &nm_rxq->fl_pidx, 0, "producer index");
 
 	/* rc is 0 here: both alloc_ring failures returned early. */
 	return (rc);
 }
 
 
 /*
  * Releases the two descriptor rings (ingress queue and freelist) of a netmap
  * rx queue.  Asserts that the queue's context id is
  * INVALID_NM_RXQ_CNTXT_ID when the VI has VI_INIT_DONE set, and 0 otherwise.
  * Always returns 0.
  */
 static int
 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	MPASS(nm_rxq->iq_cntxt_id ==
 	    ((vi->flags & VI_INIT_DONE) ? INVALID_NM_RXQ_CNTXT_ID : 0));
 
 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
 	    nm_rxq->iq_desc);
 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
 	    nm_rxq->fl_desc);
 
 	return (0);
 }
 
 /*
  * Prepares netmap tx queue 'idx' of a VI: allocates its descriptor ring,
  * initializes the software state (including the precomputed cpl_ctrl0 and
  * op_pkd words), and creates the sysctl nodes for the queue under 'oid'.  The
  * hardware context id is left at INVALID_NM_TXQ_CNTXT_ID here.
  *
  * Returns 0 on success or the error from alloc_ring.
  */
 static int
 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	/* One EQ_ESIZE entry per netmap tx descriptor, plus spg_len bytes. */
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
 	    &nm_txq->ba, (void **)&nm_txq->desc);
 	if (rc)
 		return (rc);
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	nm_txq->sidx = na->num_tx_desc;
 	nm_txq->nid = idx;
 	nm_txq->iqidx = iqidx;
 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	/* The work request opcode depends on the firmware version. */
 	if (sc->params.fw_vers >= FW_VERSION32(1, 24, 11, 0))
 		nm_txq->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS2_WR));
 	else
 		nm_txq->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID;
 
 	/* Per-queue node, named after the queue index. */
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_txq->cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &nm_txq->pidx, 0,
 	    sysctl_uint16, "I", "producer index");
 
 	/* rc is 0 here: the alloc_ring failure returned early. */
 	return (rc);
 }
 
 /*
  * Releases the descriptor ring of a netmap tx queue.  Asserts that the
  * queue's context id is INVALID_NM_TXQ_CNTXT_ID when the VI has VI_INIT_DONE
  * set, and 0 otherwise.  Always returns 0.
  */
 static int
 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	MPASS(nm_txq->cntxt_id ==
 	    ((vi->flags & VI_INIT_DONE) ? INVALID_NM_TXQ_CNTXT_ID : 0));
 
 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
 	    nm_txq->desc);
 
 	return (0);
 }
 #endif
 
 /*
  * Picks an automatic cidx flush threshold for a queue of 'qsize' entries:
  * ilog2 of qsize after bumping it up to a power of 2, capped at
  * X_CIDXFLUSHTHRESH_128.
  */
 static u_int
 qsize_to_fthresh(int qsize)
 {
 	u_int fthresh;
 
 	/* Bump qsize until it is a power of 2. */
 	for (; !powerof2(qsize); qsize++)
 		continue;
 
 	fthresh = ilog2(qsize);
 
 	return (fthresh > X_CIDXFLUSHTHRESH_128 ? X_CIDXFLUSHTHRESH_128 :
 	    fthresh);
 }
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 /*
  * Issues FW_EQ_OFLD_CMD (with the ALLOC and EQSTART flags) over the mailbox
  * for the egress queue 'eq'.  On success the queue is marked EQ_ALLOCATED,
  * its context id is taken from the firmware's reply, and the queue is entered
  * into sc->sge.eqmap.
  *
  * Returns 0 on success or the negated result of t4_wr_mbox.
  */
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	struct fw_eq_ofld_cmd c;
 	int rc, idx, fbmin;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	/* The minimum fetch burst setting differs after T5. */
 	fbmin = chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_64B :
 	    X_FETCHBURSTMIN_64B_T6;
 
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_OFLD_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC |
 	    V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid = htobe32(
 	    V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 	    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 	    F_FW_EQ_OFLD_CMD_FETCHRO |
 	    V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize = htobe32(
 	    V_FW_EQ_OFLD_CMD_FBMIN(fbmin) |
 	    V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 	    V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 	    V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	/* The reply carries the context id; eqmap is indexed from eq_start. */
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	idx = eq->cntxt_id - sc->sge.eq_start;
 	if (idx >= sc->sge.neq) {
 		panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		    idx, sc->sge.neq - 1);
 	}
 	sc->sge.eqmap[idx] = eq;
 
 	return (0);
 }
 #endif
 
 /*
  * Allocates an egress queue: initializes its lock, allocates the descriptor
  * ring, has the firmware create the queue according to its type (EQ_CTRL,
  * EQ_ETH, or EQ_OFLD when compiled in), and finally works out the user
  * doorbell address for the queue if any user doorbell mode is enabled.
  *
  * 'vi' is passed through to the Ethernet and offload allocators; the control
  * queue allocator does not take it.
  *
  * Returns 0 on success, or the error from alloc_ring or from the type
  * specific allocator.  On failure the lock stays initialized (and the ring
  * stays allocated if the failure came from the firmware).
  * NOTE(review): presumably the caller releases these with free_eq -- confirm.
  */
 static int
 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	/* The ring holds sidx descriptors plus spg_len bytes' worth more. */
 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
 	eq->pidx = eq->cidx = eq->dbidx = 0;
 	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
 	eq->equeqidx = 0;
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__,
 		    eq->flags & EQ_TYPEMASK);
 	}
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to allocate egress queue(%d): %d\n",
 		    eq->flags & EQ_TYPEMASK, rc);
 		/*
 		 * Bail out before touching any doorbell state: eq->cntxt_id is
 		 * only set once the firmware has created the queue, so a
 		 * doorbell address derived from it now would be meaningless.
 		 */
 		return (rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) {
 			/*
 			 * No segment of its own within the page: keep the
 			 * queue id in udb_qid and turn off DOORBELL_WCWR.
 			 */
 			clrbit(&eq->doorbells, DOORBELL_WCWR);
 		} else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	return (0);
 }
 
 /*
  * Releases an egress queue.  If the firmware had allocated it (EQ_ALLOCATED)
  * the matching t4_*_eq_free call is made first; a failure there is logged and
  * returned with the software state left intact.  Otherwise the descriptor
  * ring is freed, the lock is destroyed if it was initialized, and the whole
  * structure is zeroed.
  *
  * Returns 0 on success or the negated firmware call result.
  */
 static int
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc;
 
 	if ((eq->flags & EQ_ALLOCATED) != 0) {
 		switch (eq->flags & EQ_TYPEMASK) {
 		case EQ_CTRL:
 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 		case EQ_ETH:
 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 #endif
 
 		default:
 			panic("%s: invalid eq type %d.", __func__,
 			    eq->flags & EQ_TYPEMASK);
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to free egress queue (%d): %d\n",
 			    eq->flags & EQ_TYPEMASK, rc);
 			return (rc);
 		}
 
 		/* The firmware no longer knows about this queue. */
 		eq->flags &= ~EQ_ALLOCATED;
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 	if (mtx_initialized(&eq->eq_lock))
 		mtx_destroy(&eq->eq_lock);
 	bzero(eq, sizeof(*eq));
 
 	return (0);
 }
 
 /*
  * Allocates a work request queue: the underlying egress queue (alloc_eq),
  * the software state for queued/incomplete work requests, and the sysctl
  * entries describing the queue directly under 'oid'.
  *
  * 'vi' may be NULL, in which case the adapter's sysctl context is used
  * instead of the VI's.
  *
  * Returns 0 on success or the error from alloc_eq.
  */
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = alloc_eq(sc, vi, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	wrq->adapter = sc;
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &wrq->eq.ba, "bus address of descriptor ring");
 	/* NULL pointer + value: the size is computed once, right here. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &wrq->eq.cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &wrq->eq.pidx, 0,
 	    sysctl_uint16, "I", "producer index");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx, "status page index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
 
 	/* rc is 0 here: the alloc_eq failure returned early. */
 	return (rc);
 }
 
 /*
  * Releases a work request queue's egress queue and, only if that succeeds,
  * zeroes the whole wrq structure.  Returns 0 or the error from free_eq.
  */
 static int
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	int rc;
 
 	rc = free_eq(sc, &wrq->eq);
 	if (rc == 0)
 		bzero(wrq, sizeof(*wrq));
 
 	return (rc);
 }
 
 /*
  * Allocates Ethernet tx queue 'idx' of a VI: the mp_ring that feeds the
  * queue, the egress queue itself, the per-queue software state (sglist,
  * software descriptors, precomputed cpl_ctrl0), and the sysctl nodes for the
  * queue under 'oid'.
  *
  * Returns 0 on success, or the error from mp_ring_alloc / alloc_eq.  The
  * mp_ring is released again if alloc_eq fails.
  */
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
 	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
 	rc = alloc_eq(sc, vi, eq);
 	if (rc != 0) {
 		mp_ring_free(txq->r);
 		txq->r = NULL;
 		return (rc);
 	}
 
 	/* Can't fail after this point. */
 
 	/*
 	 * Queue 0 establishes eq_base (abs_id - cntxt_id); every other queue
 	 * is only checked for consistency with it.
 	 */
 	if (idx == 0)
 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 	else
 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 		    ("eq_base mismatch"));
 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero eq_base"));
 
 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 	txq->ifp = vi->ifp;
 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 	/* A VF's cpl_ctrl0 carries only the opcode and the interface. */
 	if (sc->flags & IS_VF)
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan));
 	else
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 		    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	txq->tc_idx = -1;
 	/* One software descriptor per hardware descriptor. */
 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	/* Per-queue node, named after the queue index. */
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	/* Descriptor ring geometry and indices. */
 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &eq->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &eq->cidx, 0,
 	    sysctl_uint16, "I", "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &eq->pidx, 0,
 	    sysctl_uint16, "I", "producer index");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    eq->sidx, "status page index");
 
 	/* The only writable knob; the handler gets the VI and queue index. */
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, vi, idx, sysctl_tc,
 	    "I", "traffic class (-1 means none)");
 
 	/* Per-queue tx statistics. */
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
 	    CTLFLAG_RD, &txq->vlan_insertion,
 	    "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
 	    CTLFLAG_RD, &txq->txpkts0_wrs,
 	    "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
 	    CTLFLAG_RD, &txq->txpkts1_wrs,
 	    "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
 	    CTLFLAG_RD, &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
 	    CTLFLAG_RD, &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
 
 #ifdef KERN_TLS
 	/* NIC TLS statistics are published only when KERN_TLS_OK is set. */
 	if (sc->flags & KERN_TLS_OK) {
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_records", CTLFLAG_RD, &txq->kern_tls_records,
 		    "# of NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_short", CTLFLAG_RD, &txq->kern_tls_short,
 		    "# of short NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_partial", CTLFLAG_RD, &txq->kern_tls_partial,
 		    "# of partial NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_full", CTLFLAG_RD, &txq->kern_tls_full,
 		    "# of full NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_octets", CTLFLAG_RD, &txq->kern_tls_octets,
 		    "# of payload octets in transmitted NIC TLS records");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_waste", CTLFLAG_RD, &txq->kern_tls_waste,
 		    "# of octets DMAd but not transmitted in NIC TLS records");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_options", CTLFLAG_RD, &txq->kern_tls_options,
 		    "# of NIC TLS options-only packets transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_header", CTLFLAG_RD, &txq->kern_tls_header,
 		    "# of NIC TLS header-only packets transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_fin", CTLFLAG_RD, &txq->kern_tls_fin,
 		    "# of NIC TLS FIN-only packets transmitted");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_fin_short", CTLFLAG_RD, &txq->kern_tls_fin_short,
 		    "# of NIC TLS padded FIN packets on short TLS records");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_cbc", CTLFLAG_RD, &txq->kern_tls_cbc,
 		    "# of NIC TLS sessions using AES-CBC");
 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
 		    "kern_tls_gcm", CTLFLAG_RD, &txq->kern_tls_gcm,
 		    "# of NIC TLS sessions using AES-GCM");
 	}
 #endif
 
 	/* Counters kept by the mp_ring allocated at the top. */
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
 	    CTLFLAG_RD, &txq->r->enqueues,
 	    "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
 	    CTLFLAG_RD, &txq->r->drops,
 	    "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
 	    CTLFLAG_RD, &txq->r->starts,
 	    "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
 	    CTLFLAG_RD, &txq->r->stalls,
 	    "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
 	    CTLFLAG_RD, &txq->r->restarts,
 	    "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
 	    CTLFLAG_RD, &txq->r->abdications,
 	    "# of consumer abdications in the mp_ring for this queue");
 
 	return (0);
 }
 
 /*
  * Releases an Ethernet tx queue.  The egress queue goes first; if that fails
  * nothing else is touched and the error is returned.  Otherwise the sglist,
  * the software descriptors, and the mp_ring are freed and the txq structure
  * is zeroed.
  */
 static int
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 	int rc;
 
 	rc = free_eq(sc, &txq->eq);
 	if (rc != 0)
 		return (rc);
 
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 	bzero(txq, sizeof(*txq));
 
 	return (0);
 }
 
 /*
  * DMA load callback for mappings that consist of exactly one segment.  'arg'
  * points at a bus_addr_t that receives the segment's address, or 0 if the
  * load reported an error.
  */
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *ba = arg;
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	if (error)
 		*ba = 0;
 	else
 		*ba = segs[0].ds_addr;
 }
 
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t n, v;
 
 	n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx);
 	MPASS(n > 0);
 
 	wmb();
 	v = fl->dbval | V_PIDX(n);
 	if (fl->udb)
 		*fl->udb = htole32(v);
 	else
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
 	IDXINCR(fl->dbidx, n, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * The freelist lock must be held by the caller.  fl->pidx advances one buffer
  * at a time, while fl->sidx, fl->dbidx and the hw cidx are compared against
  * pidx >> 3 (i.e. in units of 8 buffers).
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct rx_buf_info *rxb;
 	struct cluster_metadata *clm;
 	uint16_t max_pidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	/* Start at the current producer position in both arrays. */
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 
 	while (n > 0) {
 
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 				goto recycled;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sd);
 			MPASS(clm != NULL);
 
 			/* Ours was the last reference: reuse the cluster. */
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 		/* Try the preferred zone first, then fall back to safe_zidx. */
 		rxb = &sc->sge.rx_buf_info[fl->zidx];
 		cl = uma_zalloc(rxb->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			if (fl->zidx != fl->safe_zidx) {
 				rxb = &sc->sge.rx_buf_info[fl->safe_zidx];
 				cl = uma_zalloc(rxb->zone, M_NOWAIT);
 			}
 			if (cl == NULL)
 				break;
 		}
 		fl->cl_allocated++;
 		n--;
 
 		pa = pmap_kextract((vm_offset_t)cl);
 		sd->cl = cl;
 		sd->zidx = fl->zidx;
 
 		/* The descriptor is the buffer's address OR'ed with a hwidx. */
 		if (fl->flags & FL_BUF_PACKING) {
 			*d = htobe64(pa | rxb->hwidx2);
 			sd->moff = rxb->size2;
 		} else {
 			*d = htobe64(pa | rxb->hwidx1);
 			sd->moff = 0;
 		}
 recycled:
 		sd->nmbuf = 0;
 		d++;
 		sd++;
 		/* True once every 8 buffers. */
 		if (__predict_false((++fl->pidx & 7) == 0)) {
 			uint16_t pidx = fl->pidx >> 3;
 
 			/* Wrap around to the start of the ring. */
 			if (__predict_false(pidx == fl->sidx)) {
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			if (n < 8 || pidx == max_pidx)
 				break;
 
 			/* Ring the doorbell once dbidx trails by 4 or more. */
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	/* Tell the hardware about anything not yet covered by a doorbell. */
 	if ((fl->pidx >> 3) != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 static int
 alloc_fl_sdesc(struct sge_fl *fl)
 {
 
 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	return (0);
 }
 
 /*
  * Walks all software descriptors of the freelist, drops the freelist's hold
  * on every cluster that is still attached, and then frees the descriptor
  * array itself.
  *
  * A cluster with no mbufs (nmbuf == 0) is returned to its zone directly.
  * Otherwise, for a buffer-packing freelist, the cluster's reference count is
  * decremented and the cluster is freed only if that was the last reference.
  */
 static void
 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 {
 	struct cluster_metadata *clm;
 	struct fl_sdesc *sd;
 	int i;
 
 	for (i = 0; i < fl->sidx * 8; i++) {
 		sd = &fl->sdesc[i];
 		if (sd->cl == NULL)
 			continue;
 
 		if (sd->nmbuf == 0) {
 			uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl);
 		} else if (fl->flags & FL_BUF_PACKING) {
 			clm = cl_metadata(sd);
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone,
 				    sd->cl);
 				counter_u64_add(extfree_rels, 1);
 			}
 		}
 		sd->cl = NULL;
 	}
 
 	free(fl->sdesc, M_CXGBE);
 	fl->sdesc = NULL;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 	KASSERT(gl->sg_nseg > 0 &&
 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  *
  * 'nsegs' must be at least 1.  A non-zero 'tso' adds room for the LSO core.
  * The result is the total size rounded up to units of 16 bytes.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, u_int tso)
 {
 	const u_int extra = nsegs - 1;	/* first segment is in ulptx_sgl */
 	u_int len;
 
 	MPASS(nsegs > 0);
 
 	len = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl);
 	len += 8 * ((3 * extra) / 2 + (extra & 1));
 	if (tso)
 		len += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(len, 16));
 }
 
 /*
  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
  * request header.
  *
  * 'nsegs' must be at least 1.  A non-zero 'tso' adds room for the LSO core.
  * The result is the total size rounded up to units of 16 bytes.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, u_int tso)
 {
 	const u_int extra = nsegs - 1;	/* first segment is in ulptx_sgl */
 	u_int len;
 
 	MPASS(nsegs > 0);
 
 	len = sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	len += 8 * ((3 * extra) / 2 + (extra & 1));
 	if (tso)
 		len += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(len, 16));
 }
 
 /*
  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
  * request header.
  *
  * 'nsegs' must be at least 1.  The result is the total size rounded up to
  * units of 16 bytes.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	const u_int extra = nsegs - 1;	/* first segment is in ulptx_sgl */
 	u_int len;
 
 	MPASS(nsegs > 0);
 
 	len = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	len += 8 * ((3 * extra) / 2 + (extra & 1));
 
 	return (howmany(len, 16));
 }
 
 /*
  * Length, in 16 byte units (rounded up), of one packet inside a txpkts type 1
  * WR: a tx_pkt CPL followed by a ulptx_sgl.  The firmware work request header
  * is not included.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	u_int len;
 
 	len = sizeof(struct cpl_tx_pkt_core);
 	len += sizeof(struct ulptx_sgl);
 
 	return (howmany(len, 16));
 }
 
 /*
  * Number of bytes left over in ndesc hardware descriptors once the firmware
  * work request header and the tx_pkt CPL have been accounted for.
  */
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	u_int room;
 
 	room = ndesc * EQ_ESIZE;
 	room -= sizeof(struct fw_eth_tx_pkt_wr);
 	room -= sizeof(struct cpl_tx_pkt_core);
 
 	return (room);
 }
 
 /*
  * Translate the checksum offload request in the packet header of 'm' into the
  * checksum related bits of a tx_pkt CPL's ctrl1 field (host byte order).  The
  * chip generation of 'sc' selects how the ethernet header length is encoded.
  */
 static inline uint64_t
 csum_to_ctrl(struct adapter *sc, struct mbuf *m)
 {
 	uint64_t bits;
 	int type;
 
 	M_ASSERTPKTHDR(m);
 
 	/* No offload requested at all: turn off both IP and L4 checksums. */
 	if (needs_hwcsum(m) == 0)
 		return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
 
 	bits = 0;
 	if (needs_l3_csum(m) == 0)
 		bits |= F_TXPKT_IPCSUM_DIS;
 
 	/* Pick the checksum type from the L4 request, if there is one. */
 	switch (m->m_pkthdr.csum_flags &
 	    (CSUM_IP_TCP | CSUM_IP_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP)) {
 	case CSUM_IP6_UDP:
 		type = TX_CSUM_UDPIP6;
 		break;
 	case CSUM_IP6_TCP:
 		type = TX_CSUM_TCPIP6;
 		break;
 	case CSUM_IP_UDP:
 		type = TX_CSUM_UDPIP;
 		break;
 	case CSUM_IP_TCP:
 		type = TX_CSUM_TCPIP;
 		break;
 	default:
 		/*
 		 * Some offload is needed (needs_hwcsum said so) but it is not
 		 * one of the L4 ones above, so it has to be the IP checksum.
 		 */
 		MPASS(bits == 0);
 		MPASS(m->m_pkthdr.csum_flags & CSUM_IP);
 		type = TX_CSUM_IP;
 		bits |= F_TXPKT_L4CSUM_DIS;
 		break;
 	}
 
 	MPASS(m->m_pkthdr.l2hlen > 0);
 	MPASS(m->m_pkthdr.l3hlen > 0);
 	bits |= V_TXPKT_CSUM_TYPE(type);
 	bits |= V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	/* The ethernet header length field differs after T5. */
 	if (chip_id(sc) > CHELSIO_T5)
 		bits |= V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
 	else
 		bits |= V_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
 
 	return (bits);
 }
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors and update
  * the software descriptor.  It is guaranteed that enough descriptors are
  * available.
  *
  * sc is used for the checksum control bits, txq is the tx queue (its lock must
  * be held), wr must point at the hardware descriptor at eq->pidx, m0 is the
  * packet (it must have a packet header), and available is the number of
  * descriptors the caller has available.
  *
  * The return value is the # of hardware descriptors used.  eq->pidx is not
  * modified in this function; presumably the caller advances it by the return
  * value -- NOTE(review): confirm against the caller.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	/* NOTE(review): nsegs is assigned here but not read again below. */
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl is the immediate data length: the CPL(s) that follow the WR. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
 
 	if (needs_tso(m0)) {
 		/* The LSO CPL goes right after the WR, ahead of the tx_pkt CPL. */
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 			ETHER_HDR_LEN) >> 2) |
 		    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 		    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		/* An L3 header the size of struct ip6_hdr is taken to be IPv6. */
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = csum_to_ctrl(sc, m0);
 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		/* Ask for wrap checks only if this WR runs past the ring's end. */
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 
 	txq->txpkt_wrs++;
 
 	/* Record the mbuf and descriptor count in the software descriptor. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Copy a raw WR, supplied as the data of the mbuf chain m0, into the hardware
  * descriptors starting at 'wr' and record m0 in the software descriptor at
  * eq->pidx.  It is guaranteed that enough descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd = &txq->sdesc[eq->pidx];
 	struct mbuf *cur;
 	caddr_t cursor = wr;
 	int len16, ndesc;
 
 	len16 = mbuf_len16(m0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* copy_to_txd moves the cursor along (and around the ring) for us. */
 	cur = m0;
 	while (cur != NULL) {
 		copy_to_txd(eq, mtod(cur, caddr_t), &cursor, cur->m_len);
 		cur = cur->m_next;
 	}
 
 	txq->raw_wrs++;
 
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR for this packet to the hardware descriptors and update the
  * software descriptor.  It is guaranteed that enough descriptors are
  * available.
  *
  * The payload is either described by an SGL or, for a small non-TSO packet
  * without MC_NOMAP when at least 2 descriptors are available, copied into the
  * WR as immediate data.
  *
  * sc is used for the checksum control bits, txq is the tx queue (its lock must
  * be held), wr must point at the hardware descriptor at eq->pidx, m0 is the
  * packet (it must have a packet header), and available is the number of
  * descriptors the caller has available.
  *
  * The return value is the # of hardware descriptors used.  eq->pidx is not
  * modified in this function; presumably the caller advances it by the return
  * value -- NOTE(review): confirm against the caller.
  */
 static u_int
 write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl is the immediate data length: CPL(s) plus any inline payload. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
 	    available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	if (needs_tso(m0)) {
 		/* The LSO CPL goes right after the WR, ahead of the tx_pkt CPL. */
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 			ETHER_HDR_LEN) >> 2) |
 		    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 		    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		/* An L3 header the size of struct ip6_hdr is taken to be IPv6. */
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = csum_to_ctrl(sc, m0);
 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 
 		/* Ask for wrap checks only if this WR runs past the ring's end. */
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		/* Immediate data: copy every mbuf's bytes in after the CPL. */
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	/* Record the mbuf and descriptor count in the software descriptor. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Try to start a txpkts WR with the two packets m and n.  Returns 0 and fills
  * in txp (and each mbuf's len16) on success, or 1 if the pair cannot be
  * coalesced.  Note that txp may be partially written even when 1 is returned.
  */
 static int
 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
 	u_int len_m, len_n, ndesc, segs_m, segs_n;
 
 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
 		return (1);
 
 	segs_m = mbuf_nsegs(m);
 	segs_n = mbuf_nsegs(n);
 	if (segs_m + segs_n != 2) {
 		txp->wr_type = 0;
 		len_m = txpkts0_len16(segs_m);
 		len_n = txpkts0_len16(segs_n);
 	} else {
 		/* Type 1 is used when the two packets have 2 segments in all. */
 		txp->wr_type = 1;
 		len_m = txpkts1_len16();
 		len_n = len_m;
 	}
 
 	/* The WR must fit in the descriptors we have and in a single WR. */
 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + len_m +
 	    len_n;
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	if (ndesc > available || ndesc > SGE_MAX_WR_NDESC)
 		return (1);
 
 	/* The combined payload length is limited to 65535. */
 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
 	if (txp->plen > 65535)
 		return (1);
 
 	txp->npkt = 2;
 	set_mbuf_len16(m, len_m);
 	set_mbuf_len16(n, len_n);
 
 	return (0);
 }
 
 /*
  * Try to add packet m to the txpkts WR being assembled in txp.  Returns 0 and
  * updates txp (and the mbuf's len16) on success.  Returns 1, leaving txp
  * untouched, if the packet cannot be added.
  */
 static int
 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
 {
 	u_int ndesc, pkt_len16, segs, total;
 
 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
 	if (cannot_use_txpkts(m))
 		return (1);
 
 	/* A type 1 WR only takes single segment packets. */
 	segs = mbuf_nsegs(m);
 	if (segs != 1 && txp->wr_type == 1)
 		return (1);
 
 	/* The combined payload length is limited to 65535. */
 	total = txp->plen + m->m_pkthdr.len;
 	if (total > 65535)
 		return (1);
 
 	pkt_len16 = txp->wr_type == 0 ? txpkts0_len16(segs) : txpkts1_len16();
 	ndesc = howmany(txp->len16 + pkt_len16, EQ_ESIZE / 16);
 	if (ndesc > available || ndesc > SGE_MAX_WR_NDESC)
 		return (1);
 
 	/* It fits; commit the new totals. */
 	txp->len16 += pkt_len16;
 	txp->plen = total;
 	txp->npkt++;
 	set_mbuf_len16(m, pkt_len16);
 
 	return (0);
 }
 
 /*
  * Write a txpkts WR for the packets in txp to the hardware descriptors and
  * update the software descriptor.  It is guaranteed that enough descriptors
  * are available.
  *
  * m0 is the first packet; the rest are reached through m_nextpkt.  txp holds
  * the WR type (0 or 1), packet count, total payload length, and total len16.
  * wr must point at the hardware descriptor at eq->pidx and the txq lock must
  * be held.
  *
  * The return value is the # of hardware descriptors used.  eq->pidx is not
  * modified in this function; presumably the caller advances it by the return
  * value -- NOTE(review): confirm against the caller.
  */
 static u_int
 write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp,
     u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int ndesc, checkwrap;
 	struct mbuf *m;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->plen < 65536);
 	MPASS(m0 != NULL);
 	MPASS(m0->m_nextpkt != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 	MPASS(available > 0 && available < eq->sidx);
 
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (m = m0; m != NULL; m = m->m_nextpkt) {
 		if (txp->wr_type == 0) {
 			/* Type 0: each packet gets ULP headers ahead of its CPL. */
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(mbuf_len16(m));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			cpl = (void *)(ulpsc + 1);
 			/* The CPL may have to start back at the top of the ring. */
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 		} else {
 			/* Type 1: the CPL follows directly. */
 			cpl = flitp;
 		}
 
 		/* Checksum offload */
 		ctrl1 = csum_to_ctrl(sc, m);
 		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		/* The SGL follows the CPL, again wrapping if we hit the end. */
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		/* write_gl_to_txd leaves flitp just past this packet's SGL. */
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 	}
 
 	/* Statistics are kept separately for the two WR types. */
 	if (txp->wr_type == 0) {
 		txq->txpkts0_pkts += txp->npkt;
 		txq->txpkts0_wrs++;
 	} else {
 		txq->txpkts1_pkts += txp->npkt;
 		txq->txpkts1_wrs++;
 	}
 
 	/* Record the first mbuf and descriptor count in the software descriptor. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Build the gather list for mbuf m (in txq->gl) and write it as a ULPTX SGL
  * into the tx descriptor ring starting at *to, which must be 16 byte aligned
  * and inside the ring.  On return *to points just past the SGL, or at the
  * start of the ring if the SGL ended exactly at the end of the ring.
  *
  * checkwrap is non-zero if the caller knows the SGL might run past the end of
  * the ring; in that case the slower flit by flit path is used when needed.
  *
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/*
 	 * 2 flits for the header and first segment, then 3 flits for every
 	 * further pair of segments (2 flits for a leftover single segment).
 	 */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		/* Odd number of extra segments: zero the unused second length. */
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	/* An odd flit count ends 8 bytes short of a 16 byte boundary; pad it. */
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	/* Never hand back a pointer to the end of the ring; wrap it instead. */
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 /*
  * Copy len bytes from 'from' into the tx descriptor ring at *to, continuing at
  * the start of the ring if the copy runs past its end.  *to is advanced to
  * just past the copied data.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	const uintptr_t ring_end = (uintptr_t)&eq->desc[eq->sidx];
 	int head, tail;
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	/* Common case: everything fits before the end of the ring. */
 	if (__predict_true((uintptr_t)(*to) + len <= ring_end)) {
 		bcopy(from, *to, len);
 		*to += len;
 		return;
 	}
 
 	/* Split the copy: 'head' bytes up to the end, 'tail' at the start. */
 	head = ring_end - (uintptr_t)(*to);
 	tail = len - head;
 	bcopy(from, *to, head);
 	bcopy(from + head, (void *)eq->desc, tail);
 	*to = (caddr_t)eq->desc + tail;
 }
 
 /*
  * Ring the doorbell of egress queue eq for n descriptors (n > 0) and advance
  * eq->dbidx by n.  The doorbell mechanism used is the lowest numbered one set
  * in eq->doorbells, except that DOORBELL_WCWR is not considered when n > 1.
  */
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	/* Work on a copy so that eq->doorbells itself is left alone. */
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	/* Presumably orders the descriptor writes ahead of the doorbell. */
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		/* Copy the whole descriptor at dbidx, 8 bytes at a time. */
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 /*
  * Number of descriptors between the driver's cidx and the cidx reported by
  * the hardware, i.e. how many could be reclaimed right now.
  */
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	/* Read once; IDXDIFF may look at its arguments more than once. */
 	const uint16_t head = read_hw_cidx(eq);
 
 	return (IDXDIFF(head, eq->cidx, eq->sidx));
 }
 
 /*
  * Number of descriptors available for new work, based on the cidx reported by
  * the hardware and the driver's pidx.  One descriptor is always held back.
  */
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	const uint16_t head = read_hw_cidx(eq);
 	const uint16_t tail = eq->pidx;
 
 	if (tail != head)
 		return (IDXDIFF(head, tail, eq->sidx) - 1);
 
 	/* pidx has caught up with the hardware cidx: the ring is empty. */
 	return (eq->sidx - 1);
 }
 
 /*
  * Return, in host byte order, the cidx stored in the status page that sits
  * right after the last descriptor of the ring.
  */
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *qstat;
 	uint16_t snapshot;
 
 	qstat = (void *)&eq->desc[eq->sidx];
 	/* Read the field exactly once and convert the copy. */
 	snapshot = qstat->cidx;
 
 	return (be16toh(snapshot));
 }
 
 /*
  * Reclaim roughly 'n' descriptors: whole work requests are reclaimed one at a
  * time, and their mbufs freed, until at least n descriptors have been
  * reclaimed or nothing reclaimable is left.  Returns the number reclaimed.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct sge_eq *eq = &txq->eq;
 	u_int avail, done;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	done = 0;
 	for (avail = reclaimable_tx_desc(eq); avail != 0 && done < n; ) {
 		struct tx_sdesc *txsd = &txq->sdesc[eq->cidx];
 		struct mbuf *m;
 		int ndesc = txsd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(avail >= ndesc,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, avail, ndesc));
 		KASSERT(ndesc != 0,
 		    ("%s: descriptor with no credits: cidx %d",
 		    __func__, eq->cidx));
 
 		/* Unlink and free every packet that was sent with this WR. */
 		m = txsd->m;
 		while (m != NULL) {
 			struct mbuf *next = m->m_nextpkt;
 
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			m = next;
 		}
 
 		done += ndesc;
 		avail -= ndesc;
 		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (done);
 }
 
 /*
  * Reclaim tx descriptors of the txq passed in 'arg', 32 at a time, until a
  * pass reclaims nothing or the txq lock cannot be taken without blocking.
  * The incoming value of 'n' is not used; it is reused as scratch space.
  */
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	for (;;) {
 		/* Someone else holds the lock; leave the work to them. */
 		if (TXQ_TRYLOCK(txq) == 0)
 			return;
 
 		n = reclaim_tx_descs(txq, 32);
 		/* Ring fully drained: bring equeqidx up to pidx. */
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 
 		if (n <= 0)
 			return;
 	}
 }
 
 /*
  * Return flit number idx (big endian) of the packed representation of the
  * nsegs segments in segs.  Every group of 3 flits describes 2 segments: one
  * flit with both lengths (the first in the upper 32 bits, the second left as
  * 0 if there is no second segment) followed by one flit per address.
  */
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	const int pair = (idx / 3) * 2;	/* first segment of the group */
 	const int slot = idx % 3;	/* which flit within the group */
 	uint64_t lens;
 
 	if (slot == 1)
 		return (htobe64(segs[pair].ss_paddr));
 	if (slot == 2)
 		return (htobe64(segs[pair + 1].ss_paddr));
 	if (slot != 0)
 		return (0);
 
 	lens = (uint64_t)segs[pair].ss_len << 32;
 	if (pair + 1 < nsegs)
 		lens |= (uint64_t)(segs[pair + 1].ss_len);
 
 	return (htobe64(lens));
 }
 
 /*
  * Pick the index of the rx_buf_info entry to refill from.  Entries are
  * scanned in order and the first eligible one that can hold maxp bytes is
  * returned.  If none is big enough the last eligible entry seen is returned,
  * or -1 if there was none.  With 'packing' the hwidx2/size2 fields are used
  * instead of hwidx1/size1.
  */
 static int
 find_refill_source(struct adapter *sc, int maxp, bool packing)
 {
 	struct rx_buf_info *rxb;
 	int best = -1, i;
 
 	if (!packing) {
 		for (i = 0; i < SW_ZONE_SIZES; i++) {
 			rxb = &sc->sge.rx_buf_info[i];
 
 			if (rxb->hwidx1 == -1)
 				continue;
 			/* Stop at the first entry larger than the cap. */
 			if (rxb->size1 > largest_rx_cluster)
 				break;
 			if (rxb->size1 >= maxp)
 				return (i);
 			best = i;
 		}
 
 		return (best);
 	}
 
 	for (i = 0; i < SW_ZONE_SIZES; i++) {
 		rxb = &sc->sge.rx_buf_info[i];
 
 		if (rxb->hwidx2 == -1)
 			continue;
 		/* Skip sub-page entries that are also below the cap. */
 		if (rxb->size1 < PAGE_SIZE &&
 		    rxb->size1 < largest_rx_cluster)
 			continue;
 		/* Stop at the first entry larger than the cap. */
 		if (rxb->size1 > largest_rx_cluster)
 			break;
 		MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE);
 		if (rxb->size2 >= maxp)
 			return (i);
 		best = i;
 	}
 
 	return (best);
 }
 
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	if ((fl->flags & FL_DOOMED) == 0) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 /*
  * Egress update for an eq that belongs to a work request queue: clear
  * eq->equiq and schedule the wrq's tx task on the taskqueue of its tx channel.
  */
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq;
 
 	/* The eq is reinterpreted as its containing sge_wrq. */
 	wrq = (struct sge_wrq *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 /*
  * Egress update for an eq that belongs to an ethernet tx queue: clear
  * eq->equiq, check the drainage of the txq's mp_ring, and schedule the tx
  * reclaim task on the taskqueue of the eq's tx channel.
  */
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq;
 
 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
 
 	/* The eq is reinterpreted as its containing sge_txq. */
 	txq = (struct sge_txq *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	mp_ring_check_drainage(txq->r, 0);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 /*
  * Handler for a cpl_sge_egr_update message: look up the egress queue named in
  * the message and dispatch to the update handler for that queue's type.
  * Always returns 0.
  */
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	/* Indexed by the type bits of eq->flags; index 0 has no handler. */
 	static void (*handlers[])(struct adapter *, struct sge_eq *) = {
 		NULL,
 		&handle_wrq_egr_update,
 		&handle_eth_egr_update,
 		&handle_wrq_egr_update
 	};
 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	unsigned int qid;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	/* The message follows the RSS header and carries the queue id. */
 	qid = G_EGR_QID(ntohl(cpl->opcode_qid));
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	handlers[eq->flags & EQ_TYPEMASK](sc, eq);
 
 	return (0);
 }
 
 /*
  * handle_fw_msg reads every message through struct cpl_fw6_msg.  That is fine
  * for a cpl_fw4_msg as well because 'data' is at the same offset in both.
  */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) ==
     offsetof(struct cpl_fw6_msg, data));
 
 /*
  * Handler for firmware messages.  A message of type FW_TYPE_RSSCPL or
  * FW6_TYPE_RSSCPL wraps another RSS header + CPL, which is passed on to the
  * CPL handler for its opcode.  Anything else goes to the firmware message
  * handler registered for its type.
  */
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 	const struct rss_header *inner;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL)
 		return (t4_fw_msg_handler[cpl->type](iq->adapter,
 		    &cpl->data[0]));
 
 	/* The payload starts with the encapsulated RSS header. */
 	inner = (const struct rss_header *)&cpl->data[0];
 	return (t4_cpl_handler[inner->opcode](iq, inner, m));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	u8 opcode = *(const u8 *)rpl;
 	const struct fw_error_cmd *e = (const void *)rpl;
 	unsigned int i;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
 	    "non-fatal");
 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++)
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n",
 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 sysctl_uint16(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *id = arg1;
 	int i = *id;
 
 	return sysctl_handle_int(oidp, &i, 0, req);
 }
 
 static inline bool
 bufidx_used(struct adapter *sc, int idx)
 {
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
 	int i;
 
 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
 		if (rxb->size1 > largest_rx_cluster)
 			continue;
 		if (rxb->hwidx1 == idx || rxb->hwidx2 == idx)
 			return (true);
 	}
 
 	return (false);
 }
 
 /*
  * Sysctl handler that reports the SGE_FLBUF_SIZES free list buffer sizes of
  * the adapter (arg1) as one space separated string.  A size whose index is in
  * use according to bufidx_used() is followed by a '*'.
  *
  * Returns whatever sysctl_handle_string returns.
  */
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sge_params *sp = &sc->params.sge;
 	int i, rc;
 	struct sbuf sb;
 
 	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
 	for (i = 0; i < SGE_FLBUF_SIZES; i++) {
 		/*
 		 * The marker is appended as a (possibly empty) string.  Printing
 		 * a '\0' with %c for unused sizes would embed a NUL in the sbuf
 		 * and cut the reported string short at the first such size.
 		 */
 		sbuf_printf(&sb, "%u%s ", sp->sge_fl_buffer_size[i],
 		    bufidx_used(sc, i) ? "*" : "");
 	}
 	/* Drop the trailing space left by the last entry. */
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 #ifdef RATELIMIT
 /*
  * len16 for a fw_eth_tx_eo_wr work request.  Counts the firmware work request
  * header, the tx_pkt CPL, and immhdrs bytes of immediate headers (together
  * rounded up to 16 bytes).  When nsegs is non-zero the SGL is counted as well,
  * and so is the LSO CPL if tso is set.
  */
 static inline u_int
 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
 {
 	u_int len;
 
 	MPASS(immhdrs > 0);
 
 	len = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
 	if (__predict_true(nsegs != 0)) {
 		/* The first segment is part of ulptx_sgl. */
 		const u_int extra = nsegs - 1;
 
 		len += sizeof(struct ulptx_sgl) +
 		    8 * ((3 * extra) / 2 + (extra & 1));
 		if (tso)
 			len += sizeof(struct cpl_tx_pkt_lso_core);
 	}
 
 	return (howmany(len, 16));
 }
 
 /* Size of the flowc WR sent for an etid: the WR header plus 6 parameters. */
 #define ETID_FLOWC_NPARAMS 6
 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
 
 /*
  * Send the flowc WR for the rate tag's etid on its eo_txq.  cst->lock must be
  * held and the tag must have EO_FLOWC_PENDING set and EO_FLOWC_RPL_PENDING
  * clear.  On success the tag moves from EO_FLOWC_PENDING to
  * EO_FLOWC_RPL_PENDING and ETID_FLOWC_LEN16 tx credits are consumed.
  *
  * Returns 0 on success, ENOMEM if no room for the WR could be obtained.
  * NOTE(review): the vi argument is not used in this function.
  */
 static int
 send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi,
     struct vi_info *vi)
 {
 	struct wrq_cookie cookie;
 	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
 	struct fw_flowc_wr *flowc;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
 	    EO_FLOWC_PENDING);
 
 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		return (ENOMEM);
 
 	bzero(flowc, ETID_FLOWC_LEN);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
 	/* htobe32, like every other 32 bit field written to this WR. */
 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(cst->iqid);
 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 	flowc->mnemval[5].val = htobe32(cst->schedcl);
 
 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
 
 	cst->flags &= ~EO_FLOWC_PENDING;
 	cst->flags |= EO_FLOWC_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
 	cst->tx_credits -= ETID_FLOWC_LEN16;
 
 	return (0);
 }
 
 /* The flush WR is a flowc WR header with no parameters. */
 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
 
 /*
  * Send a parameterless flowc WR, with a completion requested, for the rate
  * tag's etid on its eo_txq.  cst->lock must be held.  The tag is flagged
  * EO_FLUSH_RPL_PENDING, ETID_FLUSH_LEN16 tx credits are consumed, and the
  * count of outstanding completions is bumped.
  */
 void
 send_etid_flush_wr(struct cxgbe_rate_tag *cst)
 {
 	struct wrq_cookie cookie;
 	struct fw_flowc_wr *wr;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	wr = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie);
 	if (__predict_false(wr == NULL))
 		CXGBE_UNIMPLEMENTED(__func__);
 
 	bzero(wr, ETID_FLUSH_LEN16 * 16);
 	wr->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
 	wr->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 
 	commit_wrq_wr(cst->eo_txq, wr, &cookie);
 
 	/* Account for the WR that was just handed to the hardware. */
 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
 	cst->tx_credits -= ETID_FLUSH_LEN16;
 	cst->ncompl++;
 	cst->flags |= EO_FLUSH_RPL_PENDING;
 }
 
 /*
  * Fill in the FW_ETH_TX_EO_WR work request 'wr' for the packet m0 sent on
  * the rate tag cst.  The WR consists of the WR header, an LSO CPL when the
  * mbuf needs TSO, the tx packet CPL, the L2/L3/L4 headers copied in as
  * immediate data, and (when mbuf_eo_nsegs(m0) > 0) a DSGL describing the
  * data that follows the headers.  A non-zero 'compl' sets the completion
  * request bit in the WR.  'wr' must have room for mbuf_eo_len16(m0) 16-byte
  * units and cst->lock must be held.
  */
 static void
 write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr,
     struct mbuf *m0, int compl)
 {
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	uint32_t ctrl;	/* used in many unrelated places */
 	int len16, pktlen, nsegs, immhdrs;
 	caddr_t dst;
 	uintptr_t p;
 	struct ulptx_sgl *usgl;
 	struct sglist sg;
 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	M_ASSERTPKTHDR(m0);
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0,
 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
 
 	len16 = mbuf_eo_len16(m0);
 	nsegs = mbuf_eo_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* Immediate data length: CPL(s) plus the packet headers. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
 	ctrl += immhdrs;
 
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	wr->r3 = 0;
 	if (needs_udp_csum(m0)) {
 		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
 		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
 		wr->u.udpseg.rtplen = 0;
 		wr->u.udpseg.r4 = 0;
 		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
 		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
 		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
 		cpl = (void *)(wr + 1);
 	} else {
 		MPASS(needs_tcp_csum(m0));
 		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
 		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
 		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
 		wr->u.tcpseg.r4 = 0;
 		wr->u.tcpseg.r5 = 0;
 		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
 
 		if (needs_tso(m0)) {
 			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
 
 			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
 			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
 			    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 				ETHER_HDR_LEN) >> 2) |
 			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 				ctrl |= F_LSO_IPV6;
 			lso->lso_ctrl = htobe32(ctrl);
 			lso->ipid_ofst = htobe16(0);
 			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 			lso->seqno_offset = htobe32(0);
 			lso->len = htobe32(pktlen);
 
 			cpl = (void *)(lso + 1);
 		} else {
 			wr->u.tcpseg.mss = htobe16(0xffff);
 			cpl = (void *)(wr + 1);
 		}
 	}
 
 	/* Checksum offload must be requested for ethofld. */
 	MPASS(needs_l4_csum(m0));
 	ctrl1 = csum_to_ctrl(cst->adapter, m0);
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = cst->ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
 	p = (uintptr_t)(cpl + 1);
 	m_copydata(m0, 0, immhdrs, (void *)p);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 		int i, pad;
 
 		/* zero-pad upto next 16Byte boundary, if not 16Byte aligned */
 		p += immhdrs;
 		pad = 16 - (immhdrs & 0xf);
 		bzero((void *)p, pad);
 
 		usgl = (void *)(p + pad);
 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 		    V_ULPTX_NSGE(nsegs));
 
 		/*
 		 * Walk the chain and skip the first immhdrs bytes, which were
 		 * already copied into the WR as immediate data above.
 		 */
 		sglist_init(&sg, nitems(segs), segs);
 		for (; m0 != NULL; m0 = m0->m_next) {
 			if (__predict_false(m0->m_len == 0))
 				continue;
 			if (immhdrs >= m0->m_len) {
 				immhdrs -= m0->m_len;
 				continue;
 			}
-
-			sglist_append(&sg, mtod(m0, char *) + immhdrs,
-			    m0->m_len - immhdrs);
+			/*
+			 * Any header bytes still left in this mbuf must be
+			 * skipped for unmapped (M_NOMAP) mbufs exactly as
+			 * they are for regular ones.
+			 */
+			if (m0->m_flags & M_NOMAP)
+				sglist_append_mbuf_epg(&sg, m0,
+				    mtod(m0, vm_offset_t) + immhdrs,
+				    m0->m_len - immhdrs);
+			else
+				sglist_append(&sg, mtod(m0, char *) + immhdrs,
+				    m0->m_len - immhdrs);
 			immhdrs = 0;
 		}
 		MPASS(sg.sg_nseg == nsegs);
 
 		/*
 		 * Zero pad last 8B in case the WR doesn't end on a 16B
 		 * boundary.
 		 */
 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
 
 		/* First segment lives in the ulptx_sgl, the rest in pairs. */
 		usgl->len0 = htobe32(segs[0].ss_len);
 		usgl->addr0 = htobe64(segs[0].ss_paddr);
 		for (i = 0; i < nsegs - 1; i++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
 		}
 		/* Terminate a half-filled final pair with a zero length. */
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 	}
 
 }
 
 /*
  * Drain cst->pending_tx in order: for each queued mbuf obtain a work
  * request on cst->eo_txq, fill it in with write_ethofld_wr(), commit it,
  * and move the mbuf to cst->pending_fwack.  Returns early, leaving the
  * remaining mbufs queued, when the tag does not have enough tx credits for
  * the next mbuf or when start_wrq_wr() fails.  cst->lock must be held.
  */
 static void
 ethofld_tx(struct cxgbe_rate_tag *cst)
 {
 	struct mbuf *m;
 	struct wrq_cookie cookie;
 	int next_credits, compl;
 	struct fw_eth_tx_eo_wr *wr;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
 		M_ASSERTPKTHDR(m);
 
 		/* How many len16 credits do we need to send this mbuf. */
 		next_credits = mbuf_eo_len16(m);
 		MPASS(next_credits > 0);
 		if (next_credits > cst->tx_credits) {
 			/*
 			 * Tx will make progress eventually because there is at
 			 * least one outstanding fw4_ack that will return
 			 * credits and kick the tx.
 			 */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie);
 		if (__predict_false(wr == NULL)) {
 			/* XXX: wishful thinking, not a real assertion. */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		cst->tx_credits -= next_credits;
 		cst->tx_nocompl += next_credits;
 		/*
 		 * Ask for a completion when none is outstanding, or when the
 		 * credits used since the last request reach half the total.
 		 */
 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
 		ETHER_BPF_MTAP(cst->com.com.ifp, m);
 		write_ethofld_wr(cst, wr, m, compl);
 		commit_wrq_wr(cst->eo_txq, wr, &cookie);
 		if (compl) {
 			cst->ncompl++;
 			cst->tx_nocompl	= 0;
 		}
 		/* The mbuf was only peeked at above; remove it now. */
 		(void) mbufq_dequeue(&cst->pending_tx);
 
 		/*
 		 * Drop the mbuf's reference on the tag now rather
 		 * than waiting until m_freem().  This ensures that
 		 * cxgbe_rate_tag_free gets called when the inp drops
 		 * its reference on the tag and there are no more
 		 * mbufs in the pending_tx queue and can flush any
 		 * pending requests.  Otherwise if the last mbuf
 		 * doesn't request a completion the etid will never be
 		 * released.
 		 */
 		m->m_pkthdr.snd_tag = NULL;
 		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		m_snd_tag_rele(&cst->com.com);
 
 		/* Held here until ethofld_fw4_ack() frees it. */
 		mbufq_enqueue(&cst->pending_fwack, m);
 	}
 }
 
 /*
  * Queue the packet m0 on the rate tag attached to it and try to transmit.
  * On the first packet for a tag (EO_FLOWC_PENDING) a tx queue and an
  * ingress queue are picked from the packet's flow hash (or a random value)
  * and the flowc work request is sent.  Returns 0 when the mbuf was queued;
  * otherwise the mbuf is freed and an errno is returned (ENOBUFS when the
  * tag's backlog would exceed eo_max_backlog).
  */
 int
 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
 {
 	struct cxgbe_rate_tag *cst;
 	int rc = 0;
 
 	MPASS(m0->m_nextpkt == NULL);
 	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
 	MPASS(m0->m_pkthdr.snd_tag != NULL);
 	cst = mst_to_crt(m0->m_pkthdr.snd_tag);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 
 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
 		struct vi_info *vi = ifp->if_softc;
 		struct port_info *pi = vi->pi;
 		struct adapter *sc = pi->adapter;
 		const uint32_t rss_mask = vi->rss_size - 1;
 		uint32_t hash;
 
 		/* We assume RSS hashing */
 		hash = M_HASHTYPE_ISHASH(m0) ? m0->m_pkthdr.flowid :
 		    arc4random();
 		cst->iqid = vi->rss[hash & rss_mask];
 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq] +
 		    hash % vi->nofldtxq;
 		rc = send_etid_flowc_wr(cst, pi, vi);
 	}
 
 	/* Refuse to queue more than eo_max_backlog bytes on one tag. */
 	if (rc == 0 &&
 	    __predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog))
 		rc = ENOBUFS;
 
 	if (rc != 0) {
 		mtx_unlock(&cst->lock);
 		m_freem(m0);
 		return (rc);
 	}
 
 	mbufq_enqueue(&cst->pending_tx, m0);
 	cst->plen += m0->m_pkthdr.len;
 
 	/*
 	 * The tag could otherwise be freed from within ethofld_tx() when
 	 * the final mbuf goes out after the inp has been freed, so keep an
 	 * extra reference on it until the lock has been dropped.
 	 */
 	m_snd_tag_ref(&cst->com.com);
 	ethofld_tx(cst);
 	mtx_unlock(&cst->lock);
 	m_snd_tag_rele(&cst->com.com);
 	return (0);
 }
 
 /*
  * Handle a cpl_fw4_ack message for an ethofld rate tag.  The tag is looked
  * up by the etid carried in the message, the mbufs covered by the returned
  * credits are removed from cst->pending_fwack and freed, the credits are
  * added back to cst->tx_credits, and transmission is restarted if the head
  * of cst->pending_tx now fits.  If the credits belong to the final flush
  * the tag itself is freed.  m0 is not used.  Always returns 0.
  */
 static int
 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	struct mbuf *m;
 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct cxgbe_rate_tag *cst;
 	uint8_t credits = cpl->credits;
 
 	cst = lookup_etid(sc, etid);
 	mtx_lock(&cst->lock);
 	/* The first ack also returns the credits used by the flowc WR. */
 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
 		MPASS(credits >= ETID_FLOWC_LEN16);
 		credits -= ETID_FLOWC_LEN16;
 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
 	}
 
 	KASSERT(cst->ncompl > 0,
 	    ("%s: etid %u (%p) wasn't expecting completion.",
 	    __func__, etid, cst));
 	cst->ncompl--;
 
 	/* Free acknowledged mbufs until the credits are accounted for. */
 	while (credits > 0) {
 		m = mbufq_dequeue(&cst->pending_fwack);
 		if (__predict_false(m == NULL)) {
 			/*
 			 * The remaining credits are for the final flush that
 			 * was issued when the tag was freed by the kernel.
 			 */
 			MPASS((cst->flags &
 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
 			    EO_FLUSH_RPL_PENDING);
 			MPASS(credits == ETID_FLUSH_LEN16);
 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
 			MPASS(cst->ncompl == 0);
 
 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
 			cst->tx_credits += cpl->credits;
 			cxgbe_rate_tag_free_locked(cst);
 			return (0);	/* cst is gone. */
 		}
 		/* m cannot be NULL here; the branch above returned. */
 		KASSERT(m != NULL,
 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
 		    credits));
 		KASSERT(credits >= mbuf_eo_len16(m),
 		    ("%s: too few credits (%u, %u, %u)", __func__,
 		    cpl->credits, credits, mbuf_eo_len16(m)));
 		credits -= mbuf_eo_len16(m);
 		cst->plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 
 	/* Return the full credit count from the message, not the remainder. */
 	cst->tx_credits += cpl->credits;
 	MPASS(cst->tx_credits <= cst->tx_total);
 
 	if (cst->flags & EO_SND_TAG_REF) {
 		/*
 		 * As with ethofld_transmit(), hold an extra reference
 		 * so that the tag is stable across ethofld_tx().
 		 */
 		m_snd_tag_ref(&cst->com.com);
 		m = mbufq_first(&cst->pending_tx);
 		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
 			ethofld_tx(cst);
 		mtx_unlock(&cst->lock);
 		m_snd_tag_rele(&cst->com.com);
 	} else {
 		/*
 		 * There shouldn't be any pending packets if the tag
 		 * was freed by the kernel since any pending packet
 		 * should hold a reference to the tag.
 		 */
 		MPASS(mbufq_first(&cst->pending_tx) == NULL);
 		mtx_unlock(&cst->lock);
 	}
 
 	return (0);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 360573)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 360574)
@@ -1,2306 +1,2308 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
 
 /*
  * Build and send the FW_FLOWC_WR for the connection described by toep.
  * The WR always carries PFNVFN, CH, PORT, IQID, SNDBUF and MSS.  When tp is
  * not NULL it also carries SNDNXT and RCVNXT taken from tp and the MSS is
  * toep->params.emss; otherwise a fixed MSS of 512 is sent.  ULP_MODE,
  * TXDATAPLEN_MAX and SCHEDCLASS are appended when applicable.  One tx
  * descriptor and the WR's credits are consumed from toep, and
  * TPF_FLOWC_WR_SENT is set.  Panics if the WR cannot be allocated.
  */
 void
 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	/*
 	 * Count the parameters first; the total must match the number of
 	 * FLOWC_PARAM() invocations below (checked by the KASSERT there).
 	 */
 	if (tp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
 	if (toep->params.tc_idx != -1) {
 		MPASS(toep->params.tc_idx >= 0 &&
 		    toep->params.tc_idx < sc->chip_params->nsched_cls);
 		nparams++;
 	}
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 /* Store one mnemonic/value pair in the next free slot and advance. */
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
 	if (tp) {
 		FLOWC_PARAM(MSS, toep->params.emss);
 		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
 	} else
 		FLOWC_PARAM(MSS, 512);
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
 	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
 
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 	if (toep->params.tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	/* Account for the WR in the current tx descriptor. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* The descriptor index wraps around at txsd_total. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Change the tx rate limit of an offloaded connection.
  *
  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  * A rate that works out to 0 Kbps unbinds the connection from its
  * scheduling class.  Otherwise a class is reserved for the rate and, if it
  * differs from the one currently in use, a FW_FLOWC_WR carrying the new
  * SCHEDCLASS is sent.  The previously held class (if any) is released on
  * success.
  *
  * Returns 0 on success, the error from t4_reserve_cl_rl_kbps(), or ENOMEM
  * when there are not enough tx credits/descriptors or the WR cannot be
  * allocated.
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 	}
 
 	if (toep->params.tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		/* On failure give back the class that was just reserved. */
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
 			if (tc_idx >= 0)
 				t4_release_cl_rl(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		/* 0xff is sent as the class value when unbinding. */
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		/* Account for the WR in the current tx descriptor. */
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	/* Drop the reference on the class that was in use until now. */
 	if (toep->params.tc_idx >= 0)
 		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
 	toep->params.tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 /*
  * Abort the connection by sending a CPL_ABORT_REQ that asks the hardware
  * to send a RST.  Does nothing if an abort is already in progress.  The
  * snd_nxt argument is used only when the inp has been dropped; otherwise
  * the value is taken from the tcpcb.  The inp must be write-locked.
  * Panics if the work request cannot be allocated.
  */
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 	struct cpl_abort_req *req;
 	struct wrqe *wr;
 	int tid = toep->tid;
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	/* Only the first caller gets to send the abort. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	if ((wr = alloc_wrqe(sizeof(*req), toep->ofld_txq)) == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	req->rsvd0 = htobe32((inp->inp_flags & INP_DROPPED) ? snd_nxt :
 	    tp->snd_nxt);
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: It is not clear how to tell reliably that the inp is still
 	 * attached to its socket, nor whether the send buffer should be
 	 * flushed here at all; hence the extra NULL check.
 	 */
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0 &&
 	    inp->inp_socket != NULL)
 		sbflush(&inp->inp_socket->so_snd);
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Convert the TCP options reported by the hardware for a freshly
  * established connection into FreeBSD's native representation, updating
  * both the tcpcb and the connection parameters kept in the toepcb.
  */
 static void
 assign_rxopt(struct tcpcb *tp, uint16_t opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct adapter *sc = td_adapter(toep->td);
 
 	INP_LOCK_ASSERT(inp);
 
 	/* Segment size: the chosen MTU less the IP and TCP headers. */
 	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
 	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
 	tp->t_maxseg -= ((inp->inp_inc.inc_flags & INC_ISIPV6) ?
 	    sizeof(struct ip6_hdr) : sizeof(struct ip)) +
 	    sizeof(struct tcphdr);
 	toep->params.emss = tp->t_maxseg;
 
 	/* Timestamps take room away from the effective MSS. */
 	if (!G_TCPOPT_TSTAMP(opt))
 		toep->params.tstamp = 0;
 	else {
 		toep->params.tstamp = 1;
 		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 
 	/* SACK */
 	if (!G_TCPOPT_SACK(opt)) {
 		toep->params.sack = 0;
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 	} else {
 		toep->params.sack = 1;
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	}
 
 	/* Window scaling is used only if both sides asked for it. */
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) !=
 	    (TF_RCVD_SCALE | TF_REQ_SCALE))
 		toep->params.wscale = 0;
 	else {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	}
 
 	CTR6(KTR_CXGBE,
 	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
 	    toep->tid, toep->params.mtu_idx, toep->params.emss,
 	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
 }
 
 /*
  * Finish setting up a connection that has just been established and move
  * it to TCPS_ESTABLISHED.
  *
  * iss and irs are the initial sequence numbers from the SYN exchange and
  * opt holds the TCP options in big-endian form.
  */
 void
 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
 	    tp->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, so, inp, tp, toep);
 
 	tcp_state_change(tp, TCPS_ESTABLISHED);
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	/* Receive side */
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/* Send side: everything starts one past the ISS. */
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = tp->snd_nxt = tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, be16toh(opt));
 	send_flowc_wr(toep, tp);
 
 	soisconnected(so);
 }
 
 /*
  * Return 'credits' rx credits to the hardware with a CPL_RX_DATA_ACK sent
  * on the connection's control queue.  Returns the number of credits
  * actually sent: 'credits' on success, 0 if no work request could be
  * allocated.
  */
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	const uint32_t dack_flags = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	if ((wr = alloc_wrqe(sizeof(*ack), toep->ctrlq)) == NULL)
 		return (0);
 
 	ack = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 	ack->credit_dack = htobe32(dack_flags | V_RX_CREDITS(credits));
 	t4_wrq_tx(sc, wr);
 
 	return (credits);
 }
 
 /*
  * Send a CPL_RX_DATA_ACK that carries only F_RX_MODULATE_RX (no credits)
  * on the connection's control queue.  Silently does nothing if a work
  * request cannot be allocated.
  */
 void
 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
 {
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	if ((wr = alloc_wrqe(sizeof(*ack), toep->ctrlq)) == NULL)
 		return;
 
 	ack = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 	ack->credit_dack = htobe32(F_RX_MODULATE_RX);
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Called after data has been removed from the receive buffer: work out
  * how much the receive window can be opened and, if it is worth it, return
  * that many rx credits to the hardware.  If no credits are returned and
  * TPF_FORCE_CREDITS is set an rx modulate request is sent instead.  Both
  * the inp write lock and the receive sockbuf lock must be held.
  */
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct sockbuf *sb = &inp->inp_socket->so_rcv;
 	int rx_credits, worthwhile;
 
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Buffer space not yet advertised to the peer. */
 	if (sbspace(sb) > tp->rcv_wnd)
 		rx_credits = sbspace(sb) - tp->rcv_wnd;
 	else
 		rx_credits = 0;
 
 	/* For TLS, first pay off the recorded overage (tls.rcv_over). */
 	if (ulp_mode(toep) == ULP_MODE_TLS) {
 		if (toep->tls.rcv_over < rx_credits) {
 			rx_credits -= toep->tls.rcv_over;
 			toep->tls.rcv_over = 0;
 		} else {
 			toep->tls.rcv_over -= rx_credits;
 			rx_credits = 0;
 		}
 	}
 
 	/* Is the update big enough, or the window small enough, to send? */
 	worthwhile = 0;
 	if (rx_credits > 0) {
 		if (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024)
 			worthwhile = 1;
 		else if (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024)
 			worthwhile = 1;
 		else if (sbused(sb) + tp->rcv_wnd < sb->sb_lowat)
 			worthwhile = 1;
 	}
 
 	if (worthwhile) {
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	} else if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
 
 /*
  * Wrapper around t4_rcvd_locked() that takes and releases the receive
  * sockbuf lock of tp's socket around the call.
  */
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct sockbuf *rcvbuf = &tp->t_inpcb->inp_socket->so_rcv;
 
 	SOCKBUF_LOCK(rcvbuf);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(rcvbuf);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_close_con_req *req;
 	unsigned int tid = toep->tid;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
 	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
         req->wr.wr_lo = cpu_to_be64(0);
         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	toep->flags |= TPF_FIN_SENT;
 	toep->flags &= ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 
 /*
  * Largest immediate-data payload, in bytes, that fits in a tx data work
  * request given tx_credits 16-byte credits.  At most one EQ_ESIZE
  * descriptor is used for an immediate data WR.  Returns 0 when there are
  * fewer than MIN_OFLD_TX_CREDITS credits.
  */
 static inline int
 max_imm_payload(int tx_credits)
 {
 	const int n = 1;	/* Use no more than one desc for imm. data WR */
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	/* Whichever is smaller, the descriptor(s) or the credits, less the WR header. */
 	return ((tx_credits >= (n * EQ_ESIZE) / 16 ? (n * EQ_ESIZE) :
 	    tx_credits * 16) - sizeof(struct fw_ofld_tx_data_wr));
 }
 
 /*
  * Largest number of SGL entries that fit in a tx data work request given
  * tx_credits 16-byte credits.  The ulptx_sgl itself holds one entry; every
  * further 24 bytes hold a pair, and a trailing 16 bytes hold one more.
  * Returns 0 when there are fewer than MIN_OFLD_TX_CREDITS credits.
  */
 static inline int
 max_dsgl_nsegs(int tx_credits)
 {
 	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
 	int pair_bytes;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	pair_bytes = sge_pair_credits * 16;
 
 	/* 1 for the ulptx_sgl, 2 per full pair, 1 for a half-used pair. */
 	return (1 + 2 * (pair_bytes / 24) + (pair_bytes % 24 == 16 ? 1 : 0));
 }
 
 /*
  * Fill in the fw_ofld_tx_data_wr header at dst for a WR that carries plen
  * bytes of payload, immdlen of them as immediate data, and occupies
  * 'credits' 16-byte units.  When the connection has tx_align enabled,
  * payloads shorter than two effective MSS disable LSO; longer ones request
  * payload alignment (and alignment shove unless nagle is 0).
  */
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
     unsigned int plen, uint8_t credits, int shove, int ulp_submode)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;
 	uint32_t flags;
 
 	/* Collect all flag bits in host order and swap them once. */
 	flags = V_TX_ULP_MODE(ulp_mode(toep)) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove);
 	if (toep->params.tx_align > 0) {
 		if (plen < 2 * toep->params.emss)
 			flags |= F_FW_OFLD_TX_DATA_WR_LSODISABLE;
 		else {
 			flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLD;
 			if (toep->params.nagle != 0)
 				flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE;
 		}
 	}
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	txwr->lsodisable_to_flags = htobe32(flags);
 	txwr->plen = htobe32(plen);
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  *
  * dst is where the ulptx_sgl is written.  The mbufs from 'start' up to but
  * not including 'stop' are described.  nsegs is the total number of
  * segments, and n the largest number of segments any single mbuf needs; n
  * sizes the on-stack scratch list.  Panics if an mbuf cannot be appended
  * to the scratch list.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/*
 	 * i indexes the sge pair slots; it starts at -1 because the very
 	 * first segment goes in len0/addr0 of the ulptx_sgl itself.
 	 */
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		if (m->m_flags & M_NOMAP)
-			rc = sglist_append_mb_ext_pgs(&sg, m);
+			rc = sglist_append_mbuf_epg(&sg, m,
+			    mtod(m, vm_offset_t), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		/* Copy this mbuf's segments out, then reuse the scratch list. */
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	/* Terminate a half-filled final pair with a zero length. */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	/* nsegs is only counted down under INVARIANTS (see above). */
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool nomap_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
 	    ulp_mode(toep) == ULP_MODE_TLS ||
 	    ulp_mode(toep) == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		nomap_mbuf_seen = false;
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			if (m->m_flags & M_NOMAP) {
 #ifdef KERN_TLS
 				if (m->m_ext_pgs.tls != NULL) {
 					toep->flags |= TPF_KTLS;
 					if (plen == 0) {
 						SOCKBUF_UNLOCK(sb);
 						t4_push_ktls(sc, toep, 0);
 						return;
 					}
 					break;
 				}
 #endif
-				n = sglist_count_mb_ext_pgs(m);
+				n = sglist_count_mbuf_epg(m,
+				    mtod(m, vm_offset_t), m->m_len);
 			} else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(so,
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (m->m_flags & M_NOMAP)
 				nomap_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL,
 			    ("%s: nothing to send, but m != NULL", __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		if (plen <= max_imm && !nomap_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Dequeue and free whole PDUs from the head of 'q' until 'plen' bytes of
  * PDU length have been released.  'plen' is expected to cover complete
  * PDUs only; this is asserted, not handled.
  */
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *pdu;
 	int pdu_len;
 
 	for (; plen > 0; plen -= pdu_len) {
 		pdu = mbufq_dequeue(q);
 
 		/* An empty queue here means too many credits came back. */
 		MPASS(pdu != NULL);
 		M_ASSERTPKTHDR(pdu);
 
 		/* Credits must never cover only part of a PDU. */
 		pdu_len = pdu->m_pkthdr.len;
 		MPASS(plen >= pdu_len);
 
 		m_freem(pdu);
 	}
 }
 
 /*
  * Transmit the iSCSI PDUs queued on toep->ulp_pduq.
  *
  * Each PDU (an mbuf chain with a packet header) goes out in a single work
  * request, either as immediate data or as a DSGL, and is then moved to
  * toep->ulp_pdu_reclaimq.  'drop' is the number of bytes of previously
  * sent PDUs to release from the reclaim queue before transmitting.
  *
  * The caller must hold the inp write lock.  Transmission stops with
  * TPF_TX_SUSPENDED set when a PDU needs more SGL segments than the
  * available credits allow, when a work request can't be allocated, or
  * when the remaining credits fall below MIN_OFLD_TX_CREDITS.  A FIN is
  * sent at the end if one was requested and the PDU queue is empty.
  */
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 	/* Extra sequence space used by a PDU, indexed by its ULP submode. */
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	/* Release PDUs that have been sent and are now accounted for. */
 	if (drop)
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 
 	/* The PDU stays at the head of pduq until its WR has been built. */
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		M_ASSERTPKTHDR(sndptr);
 
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		/* Total up the PDU's length and SGL segment count. */
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/*
 			 * This mbuf would send us _over_ the nsegs limit.
 			 * Suspend tx because the PDU can't be sent out.
 			 */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
 		 * is NULL.  A PDU's length is always a multiple of 4.
 		 */
 		MPASS(m == NULL);
 		MPASS((plen & 3) == 0);
 		MPASS(sndptr->m_pkthdr.len == plen);
 
 		shove = !(tp->t_flags & TF_MORETOCOME);
 		ulp_submode = mbuf_ulp_submode(sndptr);
 		MPASS(ulp_submode < nitems(ulp_extra_len));
 
 		/*
 		 * plen doesn't include header and data digests, which are
 		 * generated and inserted in the right places by the TOE, but
 		 * they do occupy TCP sequence space and need to be accounted
 		 * for.
 		 */
 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
 		if (plen <= max_imm) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
 			    shove, ulp_submode);
 			/* The payload is copied right after the WR header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
 			    shove, ulp_submode);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/* Zero the 8 bytes that round the WR up to 16. */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/* The PDU is now in a WR; park it on the reclaim queue. */
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		/* Request a completion when credits run low. */
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		/* Sequence space advances by the digest-adjusted length. */
 		tp->snd_nxt += adjusted_plen;
 		tp->snd_max += adjusted_plen;
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Record the WR in the next tx descriptor (a ring). */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Hand 'drop' and the connection to the transmit routine that matches the
  * connection's mode: iSCSI PDUs, TLS offload records, kernel TLS (when
  * compiled in), or plain stream data.  The checks are made in that order.
  */
 static inline void
 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 {
 
 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 		t4_push_pdus(sc, toep, drop);
 		return;
 	}
 
 	if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM) {
 		t4_push_tls_records(sc, toep, drop);
 		return;
 	}
 
 #ifdef KERN_TLS
 	if (toep->flags & TPF_KTLS) {
 		t4_push_ktls(sc, toep, drop);
 		return;
 	}
 #endif
 
 	t4_push_frames(sc, toep, drop);
 }
 
 /*
  * Push out whatever is ready to be transmitted on the offloaded connection
  * behind 'tp'.  Nothing is dropped from the send side here.  The inp must
  * be write locked and not dropped.  Always returns 0.
  */
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	t4_push_data(tod->tod_softc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Record that a FIN has been requested on the connection behind 'tp'.  If
  * the connection is already established (or beyond), run the transmit path
  * right away; it sends the FIN once there is no more data to send.  The
  * inp must be write locked and not dropped.  Always returns 0.
  */
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 
 	/* Too early to transmit; the flag stays set for later. */
 	if (tp->t_state < TCPS_ESTABLISHED)
 		return (0);
 
 	t4_push_data(sc, toep, 0);
 	return (0);
 }
 
 /*
  * Reset the offloaded connection behind 'tp' by calling send_reset().  The
  * inp must be write locked and not dropped, and the flowc work request is
  * expected to have been sent already.  Always returns 0.
  */
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(tod->tod_softc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  *
  * Handler for CPL_PEER_CLOSE.  Accounts for the FIN in rcv_nxt, marks the
  * socket as unable to receive any more, lets DDP deal with active buffers
  * on ULP_MODE_TCPDDP connections, and then advances the TCP state machine.
  * Nothing is done if the tid still refers to a synq entry or the
  * connection is already being aborted.  Always returns 0.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_peer_close and if
 		 * this is still a synqe instead of a toepcb then the connection
 		 * must be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    toep->ddp.flags, inp);
 
 	/* An abort is in progress; leave the connection alone. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	tp->rcv_nxt++;	/* FIN */
 
 	so = inp->inp_socket;
 	socantrcvmore(so);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 
 	/* Our rcv_nxt should agree with the CPL's (not checked for RDMA). */
 	if (ulp_mode(toep) != ULP_MODE_RDMA) {
 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    		be32toh(cpl->rcv_nxt)));
 	}
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tcp_state_change(tp, TCPS_CLOSING);
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * The inp is expected to be unlocked once tcp_twstart()
 		 * returns.  It is locked again for final_cpl_received(),
 		 * after which this path returns without unlocking it here.
 		 */
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  *
  * Handler for CPL_CLOSE_CON_RPL.  Updates snd_una from the CPL (leaving out
  * the FIN) and advances the TCP state machine.  In the CLOSING and LAST_ACK
  * states this is the last CPL expected for the tid and
  * final_cpl_received() is called.  Nothing is done if the connection is
  * already being aborted.  Always returns 0.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	/* An abort is in progress; leave the connection alone. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		tcp_twstart(tp);
 		/*
 		 * Shared exit for CLOSING and LAST_ACK.  The inp must be
 		 * unlocked on arrival here; it is locked again for
 		 * final_cpl_received() and not unlocked by this function.
 		 */
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		/* Unlock here only if tcp_close() handed back a tcpcb. */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Build a CPL_ABORT_RPL for 'tid' carrying 'rst_status' as its command and
  * transmit it on 'ofld_txq'.  Panics if the work request can't be
  * allocated.
  */
 void
 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
     int rst_status)
 {
 	struct cpl_abort_rpl *rpl;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*rpl), ofld_txq);
 	if (wr == NULL)
 		panic("%s: allocation failure.", __func__);	/* XXX */
 
 	rpl = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(rpl, CPL_ABORT_RPL, tid);
 	rpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, wr);
 }
 
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  *
  * Handler for CPL_ABORT_REQ_RSS.  Synq entries are handed to
  * do_abort_req_synqe() and negative advice is ignored.  Otherwise, unless
  * an abort is already in progress, the error is recorded on the socket,
  * the tcpcb is closed, and final_cpl_received() is called.  In both of
  * those cases a CPL_ABORT_RPL is sent back.  Always returns 0 (or whatever
  * the synqe handler returns).
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	/* Saved up front; it is used for the reply at the very end. */
 	struct sge_wrq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		tp = tcp_close(tp);
 		/* final_cpl_received() below is called with the inp locked. */
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	final_cpl_received(toep);
 done:
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Reply to the CPL_ABORT_REQ (send_reset)
  *
  * Handler for CPL_ABORT_RPL_RSS.  Replies for synq entries are passed on
  * to do_abort_rpl_synqe().  For a full connection the inp is write locked
  * and final_cpl_received() is called with the lock held; the lock is not
  * released in this function.
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(iq->adapter, tid);
 	struct inpcb *inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	inp = toep->inp;
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	/* A reply only makes sense if we asked for the abort. */
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_DATA: payload received on an offloaded connection.
  *
  * The CPL header is stripped from 'm' and the rest is appended to the
  * socket's receive buffer.  Along the way rcv_nxt and rcv_wnd are updated,
  * the receive buffer may be grown (autosizing), DDP state is updated on
  * ULP_MODE_TCPDDP connections, and rx credits may be sent via
  * send_rx_credits().  Data for a synq entry, for a dropped/timewait inp, or
  * for a socket that can't receive any more is freed instead; in the last
  * case the connection is also dropped with ECONNRESET.
  *
  * 'm' is consumed on every path.  Always returns 0.
  */
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct epoch_tracker et;
 	int len, rx_credits;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_rx_data and if this
 		 * is still a synqe instead of a toepcb then the connection must
 		 * be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * A gap between rcv_nxt and the CPL's sequence number is remembered
 	 * as ddp_placed and handed to insert_ddp_data() further down.
 	 */
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	/* Only RDMA connections may receive more than the window allows. */
 	if (tp->rcv_wnd < len) {
 		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	/* Lock order used here: inp, then DDP (if any), then the sockbuf. */
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * Everything was unlocked above so that the inp lock can be
 		 * retaken inside the vnet and net epoch for tcp_drop().
 		 */
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Stop autosizing if the buffer couldn't be grown. */
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		/* Non-zero when DDP_ON disagrees with the CPL's ddp_off. */
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else {
 				KASSERT(cpl->ddp_off == 1,
 				    ("%s: DDP switched on by itself.",
 				    __func__));
 
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
 	sbappendstream_locked(sb, m, 0);
 	/* Credits are whatever buffer space the current window doesn't cover. */
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	/* The sockbuf is expected to be unlocked after this wakeup. */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Handler for CPL_FW4_ACK: tx credits coming back for work requests that
  * were sent earlier.
  *
  * snd_una is updated if the CPL carries a valid sequence number.  The tx
  * descriptors are then walked from txsd_cidx, returning their credits to
  * the toep and adding up the payload bytes they covered.  If tx was
  * suspended and at least a quarter of the total credits are available, tx
  * is resumed and the payload total is passed along as the amount to drop.
  * Otherwise the payload is dropped here from the send buffer and/or the
  * PDU reclaim queue.  Always returns 0.
  */
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	/* The connection is being aborted; the credits don't matter. */
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	/* Retire tx descriptors until all returned credits are consumed. */
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		if (txsd->iv_buffer) {
 			free(txsd->iv_buffer, M_CXGBE);
 			txsd->iv_buffer = NULL;
 		}
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		/* The descriptors form a ring; wrap the consumer index. */
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	/* Nothing is outstanding any more; reset the completion tracking. */
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		/* Resume tx; plen is handed over as the amount to drop. */
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		t4_push_data(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data transmitted before the tid's ULP
 				 * mode changed to ISCSI is still in so_snd.
 				 * Incoming credits should account for so_snd
 				 * first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			/* Whatever is left over is accounted to sent PDUs. */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			if (tls_tx_key(toep) &&
 			    toep->tls.mode == TLS_MODE_TLSOM) {
 				struct tls_ofld_info *tls_ofld = &toep->tls;
 
 				/* Keep sb_off in step with the dropped data. */
 				MPASS(tls_ofld->sb_off >= plen);
 				tls_ofld->sb_off -= plen;
 			}
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Build a CPL_SET_TCB_FIELD for the connection's tid and transmit it on
  * 'wrq'.  'word', 'mask' and 'val' are placed in the request as given.  A
  * reply is suppressed when 'reply' is zero; replies are directed to the
  * connection's offload rx queue and carry 'cookie'.
  *
  * If 'wrq' is an offload queue the request is also charged against the
  * connection's tx credits and takes up one tx descriptor.  Panics if the
  * work request can't be allocated.
  */
 void
 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct cpl_set_tcb_field *req;
 	struct wrqe *wr;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	if (reply) {
 		MPASS(cookie != CPL_COOKIE_RESERVED);
 	}
 
 	wr = alloc_wrqe(sizeof(*req), wrq);
 	if (wr == NULL)
 		panic("%s: allocation failure.", __func__);	/* XXX */
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
 	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	if (!reply)
 		req->reply_ctrl |= htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 
 	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		/* This request carries no payload, only credits. */
 		txsd->tx_credits = howmany(sizeof(*req), 16);
 		txsd->plen = 0;
 		KASSERT(toep->tx_credits >= txsd->tx_credits &&
 		    toep->txsd_avail > 0,
 		    ("%s: not enough credits (%d)", __func__,
 		    toep->tx_credits));
 		toep->tx_credits -= txsd->tx_credits;
 
 		/* Advance the producer index around the descriptor ring. */
 		toep->txsd_pidx++;
 		if (__predict_false(toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 	}
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Install this file's handlers for the CPL messages it deals with.
  */
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	/* Opcodes with a single handler. */
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 
 	/* Shared opcodes, registered under the TOM cookie. */
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 	    CPL_COOKIE_TOM);
 }
 
 /*
  * Remove the handlers installed by t4_init_cpl_io_handlers() by
  * registering NULL for each of the same opcodes.
  */
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	/* Opcodes with a single handler. */
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 
 	/* Shared opcodes, registered under the TOM cookie. */
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 }
 
 /*
  * Use the 'backend1' field in AIO jobs to hold an error that should
  * be reported when the job is completed, the 'backend3' field to
  * store the amount of data sent by the AIO job so far, and the
  * 'backend4' field to hold a reference count on the job.
  *
  * Each unmapped mbuf holds a reference on the job as does the queue
  * so long as the job is queued.
  */
 #define	aio_error	backend1
 #define	aio_sent	backend3
 #define	aio_refs	backend4
 
 /*
  * The tid of the connection an AIO job belongs to: the f_data of the job's
  * file is handed to so_sototcpcb() and the tid is read from the toepcb
  * hanging off the resulting tcpcb.
  */
 #define	jobtotid(job)							\
 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
 
 static void
 aiotx_free_job(struct kaiocb *job)
 {
 	long status;
 	int error;
 
 	if (refcount_release(&job->aio_refs) == 0)
 		return;
 
 	error = (intptr_t)job->aio_error;
 	status = job->aio_sent;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, status, error);
 #endif
 	if (error != 0 && status != 0)
 		error = 0;
 	if (error == ECANCELED)
 		aio_cancel(job);
 	else if (error)
 		aio_complete(job, -1, error);
 	else {
 		job->msgsnd = 1;
 		aio_complete(job, status, 0);
 	}
 }
 
 /*
  * Free routine for the unmapped mbufs built for AIO tx.  Unwires every
  * page the mbuf refers to and then drops the mbuf's reference on the AIO
  * job stored in ext_arg1.
  */
 static void
 aiotx_free_pgs(struct mbuf *m)
 {
 	struct kaiocb *job;
 	int i;
 
 	MBUF_EXT_PGS_ASSERT(m);
 	job = m->m_ext.ext_arg1;
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(job));
 #endif
 
 	for (i = 0; i < m->m_ext_pgs.npgs; i++)
 		vm_page_unwire(PHYS_TO_VM_PAGE(m->m_epg_pa[i]), PQ_ACTIVE);
 
 	aiotx_free_job(job);
 }
 
 /*
  * Allocate a chain of unmapped mbufs describing the next 'len' bytes
  * of an AIO job.
  *
  * The bytes start at offset job->aio_sent in the job's user buffer.  Each
  * mbuf covers up to MBUF_PEXT_MAX_PGS pages of the buffer and takes a
  * reference on the job.
  *
  * Returns the head of the chain.  The chain describes fewer than 'len'
  * bytes (and the return value is NULL if nothing was set up at all) when
  * holding the user pages or allocating an mbuf fails part way through.
  */
 static struct mbuf *
 alloc_aiotx_mbuf(struct kaiocb *job, int len)
 {
 	struct vmspace *vm;
 	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
 	struct mbuf *m, *top, *last;
 	struct mbuf_ext_pgs *ext_pgs;
 	vm_map_t map;
 	vm_offset_t start;
 	int i, mlen, npages, pgoff;
 
 	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
 	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
 	    job, len));
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
 	pgoff = start & PAGE_MASK;
 
 	top = NULL;
 	last = NULL;
 	while (len > 0) {
 		/*
 		 * Take as much as fits in one mbuf's worth of pages.  Only
 		 * the first mbuf can start part way into a page.
 		 */
 		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
 		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
 		    ("%s: next start (%#jx + %#x) is not page aligned",
 		    __func__, (uintmax_t)start, mlen));
 
 		npages = vm_fault_quick_hold_pages(map, start, mlen,
 		    VM_PROT_WRITE, pgs, nitems(pgs));
 		if (npages < 0)
 			break;
 
 		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
 		if (m == NULL) {
 			vm_page_unhold_pages(pgs, npages);
 			break;
 		}
 
 		ext_pgs = &m->m_ext_pgs;
 		ext_pgs->first_pg_off = pgoff;
 		ext_pgs->npgs = npages;
 		if (npages == 1) {
 			KASSERT(mlen + pgoff <= PAGE_SIZE,
 			    ("%s: single page is too large (off %d len %d)",
 			    __func__, pgoff, mlen));
 			ext_pgs->last_pg_len = mlen;
 		} else {
 			/*
 			 * What is left after the (partial) first page and
 			 * the full pages in the middle.
 			 */
 			ext_pgs->last_pg_len = mlen - (PAGE_SIZE - pgoff) -
 			    (npages - 2) * PAGE_SIZE;
 		}
 		for (i = 0; i < npages; i++)
 			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
 
 		m->m_len = mlen;
 		m->m_ext.ext_size = npages * PAGE_SIZE;
 		/* aiotx_free_pgs() drops this reference when the mbuf is freed. */
 		m->m_ext.ext_arg1 = job;
 		refcount_acquire(&job->aio_refs);
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
 		    __func__, jobtotid(job), m, job, npages);
 #endif
 
 		/* Append to the chain being built. */
 		if (top == NULL)
 			top = m;
 		else
 			last->m_next = m;
 		last = m;
 
 		len -= mlen;
 		start += mlen;
 		pgoff = 0;
 	}
 
 	return (top);
 }
 
 /*
  * Push as much of one AIO write job into the socket's send buffer as
  * currently fits, then either complete the job, requeue it at the head
  * of toep->aiotx_jobq, or drop the queue's reference and let the
  * in-flight mbufs complete it.
  *
  * Entered with the send sockbuf lock held.  The lock is dropped while
  * the job is processed and is held again on every return path.
  */
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct sockbuf *sb;
 	struct file *fp;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	int error, len;
 	bool moretocome, sendmore;
 
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	fp = job->fd_file;
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(fp->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* Inline sosend_generic(). */
 
 	error = sblock(sb, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		/* Report and clear the pending socket error. */
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			sbunlock(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		/*
 		 * NOTE(review): 'error' may still hold the result of
 		 * tfb_tcp_output() from an earlier pass through
 		 * 'sendanother'; confirm it is always 0 here, since a
 		 * non-zero value would complete the job just requeued.
 		 */
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than
 	 * a single sndbuf at a time.
 	 */
 	len = sbspace(sb);
 	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
 		len = job->uaiocb.aio_nbytes - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (len > toep->params.sndbuf) {
 		len = toep->params.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	/* Other queued jobs also count as more data to come. */
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(len != 0);
 
 	m = alloc_aiotx_mbuf(job, len);
 	if (m == NULL) {
 		sbunlock(sb);
 		error = EFAULT;
 		goto out;
 	}
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		sbunlock(sb);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	/* The chain may be shorter than 'len'; count what was built. */
 	job->aio_sent += m_length(m, NULL);
 
 	/* The socket buffer owns the chain from here on. */
 	sbappendstream(sb, m, 0);
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tp->t_fb->tfb_tcp_output(tp);
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	sbunlock(sb);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If this is a blocking socket and the request has not been
 	 * fully completed, requeue it until the socket is ready
 	 * again.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		/* Return with the sockbuf lock held, as on entry. */
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop the queue's
 	 * reference to the job.  Any mbufs in flight should still
 	 * hold a reference, but this drops the reference that the
 	 * queue owns while it is waiting to queue mbufs to the
 	 * socket.
 	 */
 	aiotx_free_job(job);
 
 out:
 	if (error) {
 		job->aio_error = (void *)(intptr_t)error;
 		aiotx_free_job(job);
 	}
 	/*
 	 * alloc_aiotx_mbuf() can return a chain of several mbufs, so the
 	 * whole chain must be released; m_free() would release only the
 	 * first mbuf and leak the rest with their pages and job
 	 * references.
 	 */
 	if (m != NULL)
 		m_freem(m);
 	SOCKBUF_LOCK(sb);
 }
 
 /*
  * Task handler for toep->aiotx_task: process queued AIO write jobs
  * for as long as the socket remains writeable, then release the
  * toepcb and socket references taken by t4_aiotx_queue_toep().
  */
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct toepcb *toep;
 	struct socket *so;
 	struct kaiocb *job;
 
 	toep = context;
 	so = toep->aiotx_so;
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(&so->so_snd);
 	for (;;) {
 		if (TAILQ_EMPTY(&toep->aiotx_jobq) || !sowriteable(so))
 			break;
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 
 		/* Only process jobs whose cancel function could be cleared. */
 		if (aio_clear_cancel_function(job))
 			t4_aiotx_process_job(toep, so, job);
 	}
 	toep->aiotx_so = NULL;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	CURVNET_RESTORE();
 
 	free_toepcb(toep);
 	SOCK_LOCK(so);
 	sorele(so);
 }
 
 /*
  * Schedule the AIO transmit task for 'toep' unless toep->aiotx_so
  * shows that it is already scheduled.  Takes a socket reference and a
  * toepcb hold before enqueueing the task.
  */
 static void
 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
 #endif
 	if (toep->aiotx_so == NULL) {
 		soref(so);
 		toep->aiotx_so = so;
 		hold_toepcb(toep);
 		soaio_enqueue(&toep->aiotx_task);
 	}
 }
 
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	sb = &so->so_snd;
 
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(sb);
 
 	job->aio_error = (void *)(intptr_t)ECANCELED;
 	aiotx_free_job(job);
 }
 
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/* This only handles writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
 		return (EOPNOTSUPP);
 
 	if (!sc->tt.tx_zcopy)
 		return (EOPNOTSUPP);
 
 	if (tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(&so->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	refcount_init(&job->aio_refs, 1);
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(so, toep);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 }
 
 /*
  * Set up the AIO transmit state of a toepcb: an empty job queue and
  * the task that runs t4_aiotx_task() with the toepcb as its context.
  */
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->aiotx_jobq);
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 }
 #endif
Index: head/sys/kern/subr_bus_dma.c
===================================================================
--- head/sys/kern/subr_bus_dma.c	(revision 360573)
+++ head/sys/kern/subr_bus_dma.c	(revision 360574)
@@ -1,688 +1,686 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 EMC Corp.
  * All rights reserved.
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bus.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/ktr.h>
 #include <sys/mbuf.h>
 #include <sys/memdesc.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/pmap.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 
 #include <opencrypto/cryptodev.h>
 
 #include <machine/bus.h>
 
 /*
  * Load up data starting at offset within a region specified by a
  * list of virtual address ranges until either length or the region
  * are exhausted.
  */
 static int
 _bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map,
     bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs,
     int flags, size_t offset, size_t length)
 {
 	char *vaddr;
 	size_t chunk;
 	int error;
 
 	error = 0;
 	while (sglist_cnt > 0 && length != 0) {
 		KASSERT((offset < list->ds_len),
 		    ("Invalid mid-segment offset"));
 
 		/* The starting offset applies to the first entry only. */
 		vaddr = (char *)(uintptr_t)list->ds_addr + offset;
 		chunk = list->ds_len - offset;
 		offset = 0;
 
 		/* Never load more than the remaining length. */
 		if (chunk > length)
 			chunk = length;
 		length -= chunk;
 		KASSERT((ds_len != 0), ("Segment length is zero"));
 
 		error = _bus_dmamap_load_buffer(dmat, map, vaddr, chunk, pmap,
 		    flags, NULL, nsegs);
 		if (error != 0)
 			return (error);
 
 		sglist_cnt--;
 		list++;
 	}
 	return (error);
 }
 
 /*
  * Load a list of physical addresses.
  */
 static int
 _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
     bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags)
 {
 	int error, idx;
 
 	/* Load each entry in order, stopping at the first failure. */
 	error = 0;
 	for (idx = 0; idx < sglist_cnt && error == 0; idx++) {
 		error = _bus_dmamap_load_phys(dmat, map,
 		    (vm_paddr_t)list[idx].ds_addr, list[idx].ds_len, flags,
 		    NULL, nsegs);
 	}
 	return (error);
 }
 
 /*
  * Load an unmapped mbuf
  */
 static int
-_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+_bus_dmamap_load_mbuf_epg(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
 {
-	struct mbuf_ext_pgs *ext_pgs;
 	int error, i, off, len, pglen, pgoff, seglen, segoff;
 
 	/*
 	 * The mbuf's data is treated as three consecutive regions: the
 	 * header bytes in m_epg_hdr, the pages listed in m_epg_pa, and
 	 * the trailer bytes in m_epg_trail.  'off' is the number of
 	 * bytes to skip from the start of that layout and 'len' is the
 	 * number of bytes still to load.
 	 */
 	MBUF_EXT_PGS_ASSERT(m);
-	ext_pgs = &m->m_ext_pgs;
 
 	len = m->m_len;
 	error = 0;
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	/*
 	 * Header region: either skip it entirely, or load the part of
 	 * it at and after 'off' by kernel virtual address.
 	 */
-	if (ext_pgs->hdr_len != 0) {
-		if (off >= ext_pgs->hdr_len) {
-			off -= ext_pgs->hdr_len;
+	if (m->m_ext_pgs.hdr_len != 0) {
+		if (off >= m->m_ext_pgs.hdr_len) {
+			off -= m->m_ext_pgs.hdr_len;
 		} else {
-			seglen = ext_pgs->hdr_len - off;
+			seglen = m->m_ext_pgs.hdr_len - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			error = _bus_dmamap_load_buffer(dmat, map,
 			    &m->m_epg_hdr[segoff], seglen, kernel_pmap,
 			    flags, segs, nsegs);
 		}
 	}
 	/*
 	 * Page region: loaded by physical address.  Only the first page
 	 * starts at a non-zero offset, so pgoff is reset after each one.
 	 */
-	pgoff = ext_pgs->first_pg_off;
-	for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
-		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+	pgoff = m->m_ext_pgs.first_pg_off;
+	for (i = 0; i < m->m_ext_pgs.npgs && error == 0 && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(&m->m_ext_pgs, i, pgoff);
 		/* This page lies wholly before the data to load. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		error = _bus_dmamap_load_phys(dmat, map,
 		    m->m_epg_pa[i] + segoff, seglen, flags, segs, nsegs);
 		pgoff = 0;
 	};
 	/* Trailer region: whatever remains is loaded from m_epg_trail. */
 	if (len != 0 && error == 0) {
-		KASSERT((off + len) <= ext_pgs->trail_len,
+		KASSERT((off + len) <= m->m_ext_pgs.trail_len,
 		    ("off + len > trail (%d + %d > %d)", off, len,
-		    ext_pgs->trail_len));
+		    m->m_ext_pgs.trail_len));
 		error = _bus_dmamap_load_buffer(dmat, map,
 		    &m->m_epg_trail[off], len, kernel_pmap, flags, segs,
 		    nsegs);
 	}
 	return (error);
 }
 
 /*
  * Load an mbuf chain.
  */
 static int
 _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags)
 {
 	struct mbuf *m;
 	int error;
 
 	error = 0;
 	/* Walk the chain, stopping at the first mbuf that fails to load. */
 	for (m = m0; m != NULL && error == 0; m = m->m_next) {
 		/* Empty mbufs are skipped. */
 		if (m->m_len > 0) {
 			/*
 			 * M_NOMAP mbufs go through the unmapped-mbuf
 			 * loader; all others are loaded from m_data by
 			 * kernel virtual address.
 			 */
 			if ((m->m_flags & M_NOMAP) != 0)
-				error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
+				error = _bus_dmamap_load_mbuf_epg(dmat,
 				    map, m, segs, nsegs, flags);
 			else
 				error = _bus_dmamap_load_buffer(dmat, map,
 				    m->m_data, m->m_len, kernel_pmap,
 				    flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
 		}
 	}
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, *nsegs);
 	return (error);
 }
 
 /*
  * Load from block io.
  */
 static int
 _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
     int *nsegs, int flags)
 {
 	int error;
 
 	if ((bio->bio_flags & BIO_VLIST) != 0) {
 		/* bio_data holds a list of virtual address ranges. */
 		error = _bus_dmamap_load_vlist(dmat, map,
 		    (bus_dma_segment_t *)bio->bio_data, bio->bio_ma_n,
 		    kernel_pmap, nsegs, flags, bio->bio_ma_offset,
 		    bio->bio_bcount);
 	} else if ((bio->bio_flags & BIO_UNMAPPED) != 0) {
 		/* Unmapped bio: load from the page array. */
 		error = _bus_dmamap_load_ma(dmat, map, bio->bio_ma,
 		    bio->bio_bcount, bio->bio_ma_offset, flags, NULL, nsegs);
 	} else {
 		/* Plain buffer addressed through bio_data. */
 		error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
 		    bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
 	}
 	return (error);
 }
 
 /*
  * Load 'tlen' bytes described by the page array 'ma', starting
  * 'ma_offs' bytes into the first page, one page-sized piece at a time
  * via _bus_dmamap_load_phys().
  *
  * Returns 0 on success or the first error reported by
  * _bus_dmamap_load_phys().
  */
 int
 bus_dmamap_load_ma_triv(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
     bus_dma_segment_t *segs, int *segp)
 {
 	vm_paddr_t paddr;
 	bus_size_t len;
 	int error, i;
 
 	error = 0;
 	for (i = 0; tlen > 0; i++, tlen -= len) {
 		/*
 		 * Use the type-generic MIN() rather than min(): min()
 		 * takes u_int arguments, so where bus_size_t is wider it
 		 * would truncate tlen and could yield a zero len that
 		 * never advances the loop.
 		 */
 		len = MIN((bus_size_t)(PAGE_SIZE - ma_offs), tlen);
 		paddr = VM_PAGE_TO_PHYS(ma[i]) + ma_offs;
 		error = _bus_dmamap_load_phys(dmat, map, paddr, len,
 		    flags, segs, segp);
 		if (error != 0)
 			break;
 		/* Only the first page starts at a non-zero offset. */
 		ma_offs = 0;
 	}
 	return (error);
 }
 
 /*
  * Load a cam control block.
  */
 static int
 _bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
 		    int *nsegs, int flags)
 {
 	void *buf;
 	uint32_t xfer_len;
 	uint16_t nents;
 	int error;
 
 	/* Pull the data pointer, length and S/G count out of the CCB. */
 	switch (ccb->ccb_h.func_code) {
 	case XPT_SCSI_IO:
 		buf = ccb->csio.data_ptr;
 		xfer_len = ccb->csio.dxfer_len;
 		nents = ccb->csio.sglist_cnt;
 		break;
 	case XPT_CONT_TARGET_IO:
 		buf = ccb->ctio.data_ptr;
 		xfer_len = ccb->ctio.dxfer_len;
 		nents = ccb->ctio.sglist_cnt;
 		break;
 	case XPT_ATA_IO:
 		buf = ccb->ataio.data_ptr;
 		xfer_len = ccb->ataio.dxfer_len;
 		nents = 0;
 		break;
 	case XPT_NVME_IO:
 	case XPT_NVME_ADMIN:
 		buf = ccb->nvmeio.data_ptr;
 		xfer_len = ccb->nvmeio.dxfer_len;
 		nents = ccb->nvmeio.sglist_cnt;
 		break;
 	default:
 		panic("_bus_dmamap_load_ccb: Unsupported func code %d",
 		    ccb->ccb_h.func_code);
 	}
 
 	/* Dispatch on how the CCB describes its data. */
 	error = 0;
 	switch ((ccb->ccb_h.flags & CAM_DATA_MASK)) {
 	case CAM_DATA_VADDR:
 		error = _bus_dmamap_load_buffer(dmat, map, buf, xfer_len,
 		    kernel_pmap, flags, NULL, nsegs);
 		break;
 	case CAM_DATA_PADDR:
 		error = _bus_dmamap_load_phys(dmat, map,
 		    (vm_paddr_t)(uintptr_t)buf, xfer_len, flags, NULL,
 		    nsegs);
 		break;
 	case CAM_DATA_SG:
 		error = _bus_dmamap_load_vlist(dmat, map,
 		    (bus_dma_segment_t *)buf, nents, kernel_pmap,
 		    nsegs, flags, 0, xfer_len);
 		break;
 	case CAM_DATA_SG_PADDR:
 		error = _bus_dmamap_load_plist(dmat, map,
 		    (bus_dma_segment_t *)buf, nents, nsegs, flags);
 		break;
 	case CAM_DATA_BIO:
 		error = _bus_dmamap_load_bio(dmat, map, (struct bio *)buf,
 		    nsegs, flags);
 		break;
 	default:
 		panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented",
 		    ccb->ccb_h.flags);
 	}
 	return (error);
 }
 
 /*
  * Load a uio.
  */
 static int
 _bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
     int *nsegs, int flags)
 {
 	struct iovec *vec;
 	bus_size_t chunk, remaining;
 	pmap_t pmap;
 	int error, idx;
 
 	/* User-space buffers are translated through the owning process. */
 	if (uio->uio_segflg != UIO_USERSPACE)
 		pmap = kernel_pmap;
 	else {
 		KASSERT(uio->uio_td != NULL,
 			("bus_dmamap_load_uio: USERSPACE but no proc"));
 		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
 	}
 
 	vec = uio->uio_iov;
 	remaining = uio->uio_resid;
 	error = 0;
 
 	/*
 	 * Load each iovec in turn until the residual count is used up
 	 * or a load fails.
 	 */
 	for (idx = 0; idx < uio->uio_iovcnt; idx++) {
 		if (remaining == 0 || error != 0)
 			break;
 
 		chunk = vec[idx].iov_len;
 		if (chunk > remaining)
 			chunk = remaining;
 		if (chunk == 0)
 			continue;
 
 		error = _bus_dmamap_load_buffer(dmat, map,
 		    (caddr_t)vec[idx].iov_base, chunk, pmap, flags, NULL,
 		    nsegs);
 		remaining -= chunk;
 	}
 
 	return (error);
 }
 
 /*
  * Map the buffer buf into bus space using the dmamap map.
  */
 int
 bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
     bus_size_t buflen, bus_dmamap_callback_t *callback,
     void *callback_arg, int flags)
 {
 	struct memdesc mem;
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	/* Without BUS_DMA_NOWAIT, record the request for a deferred load. */
 	if ((flags & BUS_DMA_NOWAIT) == 0) {
 		mem = memdesc_vaddr(buf, buflen);
 		_bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
 	}
 
 	/* nsegs starts at -1 and is bumped once afterwards to get a count. */
 	nsegs = -1;
 	error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap,
 	    flags, NULL, &nsegs);
 	nsegs++;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 
 	/* On EINPROGRESS the callback is not invoked from here. */
 	if (error == EINPROGRESS)
 		return (EINPROGRESS);
 
 	/* Report zero segments along with any error. */
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	(*callback)(callback_arg, segs, error != 0 ? 0 : nsegs, error);
 
 	/*
 	 * Only ENOMEM is handed back to the caller so that it can pass
 	 * it up the stack.  This error only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (error == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load an mbuf chain that begins with a packet header and report the
  * result through 'callback'.  BUS_DMA_NOWAIT is always applied.
  */
 int
 bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
     bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
 {
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	M_ASSERTPKTHDR(m0);
 
 	flags |= BUS_DMA_NOWAIT;
 	nsegs = -1;
 	error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &nsegs, flags);
 	nsegs += 1;
 
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	if (error == 0) {
 		(*callback)(callback_arg, segs, nsegs, m0->m_pkthdr.len,
 		    error);
 	} else {
 		/* On failure report no segments and a zero length. */
 		(*callback)(callback_arg, segs, 0, 0, error);
 	}
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 	return (error);
 }
 
 /*
  * Load an mbuf chain into the caller-supplied segment array.  On
  * return *nsegs holds the segment count.  BUS_DMA_NOWAIT is always
  * applied.
  */
 int
 bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
     bus_dma_segment_t *segs, int *nsegs, int flags)
 {
 	int rv;
 
 	/* Start at -1 and bump once afterwards to get a count. */
 	*nsegs = -1;
 	rv = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs,
 	    flags | BUS_DMA_NOWAIT);
 	*nsegs += 1;
 
 	_bus_dmamap_complete(dmat, map, segs, *nsegs, rv);
 	return (rv);
 }
 
 /*
  * Load the buffers described by a uio and report the result through
  * 'callback'.  BUS_DMA_NOWAIT is always applied.
  */
 int
 bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
     bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
 {
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	flags |= BUS_DMA_NOWAIT;
 	nsegs = -1;
 	error = _bus_dmamap_load_uio(dmat, map, uio, &nsegs, flags);
 	nsegs += 1;
 
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	if (error == 0) {
 		(*callback)(callback_arg, segs, nsegs, uio->uio_resid,
 		    error);
 	} else {
 		/* On failure report no segments and a zero length. */
 		(*callback)(callback_arg, segs, 0, 0, error);
 	}
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 	return (error);
 }
 
 /*
  * Load the data described by a CAM CCB and report the result through
  * 'callback'.  A CCB with direction CAM_DIR_NONE gets an immediate
  * callback with no segments.
  */
 int
 bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
 		    bus_dmamap_callback_t *callback, void *callback_arg,
 		    int flags)
 {
 	struct memdesc mem;
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) {
 		callback(callback_arg, NULL, 0, 0);
 		return (0);
 	}
 
 	/* Without BUS_DMA_NOWAIT, record the request for a deferred load. */
 	if ((flags & BUS_DMA_NOWAIT) == 0) {
 		mem = memdesc_ccb(ccb);
 		_bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
 	}
 
 	nsegs = -1;
 	error = _bus_dmamap_load_ccb(dmat, map, ccb, &nsegs, flags);
 	nsegs += 1;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 
 	/* On EINPROGRESS the callback is not invoked from here. */
 	if (error == EINPROGRESS)
 		return (EINPROGRESS);
 
 	/* Report zero segments along with any error. */
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	(*callback)(callback_arg, segs, error != 0 ? 0 : nsegs, error);
 
 	/*
 	 * Only ENOMEM is handed back to the caller so that it can pass
 	 * it up the stack.  This error only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (error == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load the data described by a struct bio and report the result
  * through 'callback'.
  */
 int
 bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
 		    bus_dmamap_callback_t *callback, void *callback_arg,
 		    int flags)
 {
 	struct memdesc mem;
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	/* Without BUS_DMA_NOWAIT, record the request for a deferred load. */
 	if ((flags & BUS_DMA_NOWAIT) == 0) {
 		mem = memdesc_bio(bio);
 		_bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
 	}
 
 	nsegs = -1;
 	error = _bus_dmamap_load_bio(dmat, map, bio, &nsegs, flags);
 	nsegs += 1;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 
 	/* On EINPROGRESS the callback is not invoked from here. */
 	if (error == EINPROGRESS)
 		return (EINPROGRESS);
 
 	/* Report zero segments along with any error. */
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	(*callback)(callback_arg, segs, error != 0 ? 0 : nsegs, error);
 
 	/*
 	 * Only ENOMEM is handed back to the caller so that it can pass
 	 * it up the stack.  This error only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (error == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load the memory described by a struct memdesc, dispatching on its
  * md_type, and report the result through 'callback'.
  */
 int
 bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct memdesc *mem, bus_dmamap_callback_t *callback,
     void *callback_arg, int flags)
 {
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	/* Without BUS_DMA_NOWAIT, record the request for a deferred load. */
 	if ((flags & BUS_DMA_NOWAIT) == 0)
 		_bus_dmamap_waitok(dmat, map, mem, callback, callback_arg);
 
 	error = 0;
 	nsegs = -1;
 	switch (mem->md_type) {
 	case MEMDESC_VADDR:
 		error = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr,
 		    mem->md_opaque, kernel_pmap, flags, NULL, &nsegs);
 		break;
 	case MEMDESC_PADDR:
 		error = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr,
 		    mem->md_opaque, flags, NULL, &nsegs);
 		break;
 	case MEMDESC_VLIST:
 		/* md_opaque is passed as the entry count for list types. */
 		error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list,
 		    mem->md_opaque, kernel_pmap, &nsegs, flags, 0, SIZE_T_MAX);
 		break;
 	case MEMDESC_PLIST:
 		error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list,
 		    mem->md_opaque, &nsegs, flags);
 		break;
 	case MEMDESC_BIO:
 		error = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio,
 		    &nsegs, flags);
 		break;
 	case MEMDESC_UIO:
 		error = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio,
 		    &nsegs, flags);
 		break;
 	case MEMDESC_MBUF:
 		error = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf,
 		    NULL, &nsegs, flags);
 		break;
 	case MEMDESC_CCB:
 		error = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &nsegs,
 		    flags);
 		break;
 	}
 	nsegs += 1;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 
 	/* On EINPROGRESS the callback is not invoked from here. */
 	if (error == EINPROGRESS)
 		return (EINPROGRESS);
 
 	/* Report zero segments along with any error. */
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	(*callback)(callback_arg, segs, error != 0 ? 0 : nsegs, error);
 
 	/*
 	 * Only ENOMEM is handed back to the caller so that it can pass
 	 * it up the stack.  This error only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (error == ENOMEM ? ENOMEM : 0);
 }
 
 /*
  * Load the buffer attached to a crypto request, dispatching on its
  * crp_buf_type, and report the result through 'callback'.
  * BUS_DMA_NOWAIT is always applied.
  */
 int
 bus_dmamap_load_crp(bus_dma_tag_t dmat, bus_dmamap_t map, struct cryptop *crp,
     bus_dmamap_callback_t *callback, void *callback_arg, int flags)
 {
 	bus_dma_segment_t *segs;
 	int error, nsegs;
 
 	flags |= BUS_DMA_NOWAIT;
 	error = 0;
 	nsegs = -1;
 	switch (crp->crp_buf_type) {
 	case CRYPTO_BUF_CONTIG:
 		error = _bus_dmamap_load_buffer(dmat, map, crp->crp_buf,
 		    crp->crp_ilen, kernel_pmap, flags, NULL, &nsegs);
 		break;
 	case CRYPTO_BUF_MBUF:
 		error = _bus_dmamap_load_mbuf_sg(dmat, map, crp->crp_mbuf,
 		    NULL, &nsegs, flags);
 		break;
 	case CRYPTO_BUF_UIO:
 		error = _bus_dmamap_load_uio(dmat, map, crp->crp_uio, &nsegs,
 		    flags);
 		break;
 	}
 	nsegs += 1;
 
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, flags, error, nsegs);
 
 	if (error == EINPROGRESS)
 		return (EINPROGRESS);
 
 	/* Report zero segments along with any error. */
 	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
 	(*callback)(callback_arg, segs, error != 0 ? 0 : nsegs, error);
 
 	/*
 	 * Only ENOMEM is handed back to the caller so that it can pass
 	 * it up the stack.  This error only happens when NOWAIT is set,
 	 * so deferral is disabled.
 	 */
 	return (error == ENOMEM ? ENOMEM : 0);
 }
Index: head/sys/kern/subr_sglist.c
===================================================================
--- head/sys/kern/subr_sglist.c	(revision 360573)
+++ head/sys/kern/subr_sglist.c	(revision 360574)
@@ -1,996 +1,971 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 Yahoo!, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/sglist.h>
 #include <sys/uio.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 #include <sys/ktr.h>
 
 static MALLOC_DEFINE(M_SGLIST, "sglist", "scatter/gather lists");
 
 /*
  * Convenience macros to save the state of an sglist so it can be restored
  * if an append attempt fails.  Since sglists only grow, we only need to
  * save the current count of segments and the length of the ending segment.
  * Earlier segments will not be changed by an append, and the only change
  * that can occur to the ending segment is that it can be extended.
  */
 struct sgsave {
 	u_short sg_nseg;
 	size_t ss_len;
 };
 
 /*
  * Record the segment count of 'sg' and the length of its last segment
  * (0 when the list is empty) in 'sgsave'.
  */
 #define	SGLIST_SAVE(sg, sgsave) do {					\
 	(sgsave).sg_nseg = (sg)->sg_nseg;				\
 	if ((sgsave).sg_nseg > 0)					\
 		(sgsave).ss_len = (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len; \
 	else								\
 		(sgsave).ss_len = 0;					\
 } while (0)
 
 /*
  * Roll 'sg' back to the state recorded in 'sgsave': reset the segment
  * count and, if the list was not empty at save time, put back the saved
  * length of what was then the last segment.
  */
 #define	SGLIST_RESTORE(sg, sgsave) do {					\
 	(sg)->sg_nseg = (sgsave).sg_nseg;				\
 	if ((sgsave).sg_nseg > 0)					\
 		(sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len = (sgsave).ss_len; \
 } while (0)
 
 /*
  * Add one physical range (paddr, len) to the list 'sg'.  '*ssp' is the
  * segment currently at the end of the list.  When the new range starts
  * exactly where that segment ends, the segment is simply extended;
  * otherwise the next slot is filled in and '*ssp' is advanced to it.
  * EFBIG is returned if a new slot is needed but none are left.
  */
 static __inline int
 _sglist_append_range(struct sglist *sg, struct sglist_seg **ssp,
     vm_paddr_t paddr, size_t len)
 {
 	struct sglist_seg *tail;
 
 	tail = *ssp;
 
 	/* Contiguous with the current tail: grow it in place. */
 	if (tail->ss_paddr + tail->ss_len == paddr) {
 		tail->ss_len += len;
 		return (0);
 	}
 
 	/* A fresh segment is required; fail if the list is already full. */
 	if (sg->sg_nseg == sg->sg_maxseg)
 		return (EFBIG);
 
 	tail++;
 	tail->ss_paddr = paddr;
 	tail->ss_len = len;
 	sg->sg_nseg++;
 	*ssp = tail;
 	return (0);
 }
 
 /*
  * Worker routine to append a virtual address range (either kernel or
  * user) to a scatter/gather list.
  *
  * The range is walked one page at a time.  Each page is translated to a
  * physical address with pmap_extract() when 'pmap' is non-NULL and with
  * pmap_kextract() otherwise, and the result is appended to the list.
  *
  * If 'donep' is non-NULL it receives the number of bytes appended so
  * far, which stays meaningful when an error is returned part-way
  * through.  The list is not rolled back here on failure; callers that
  * need that use SGLIST_SAVE/SGLIST_RESTORE around the call.
  *
  * Returns 0 on success or the error from _sglist_append_range().
  */
 static __inline int
 _sglist_append_buf(struct sglist *sg, void *buf, size_t len, pmap_t pmap,
     size_t *donep)
 {
 	struct sglist_seg *ss;
 	vm_offset_t vaddr, offset;
 	vm_paddr_t paddr;
 	size_t seglen;
 	int error;
 
 	if (donep)
 		*donep = 0;
 	if (len == 0)
 		return (0);
 
 	/* Do the first page.  It may have an offset. */
 	vaddr = (vm_offset_t)buf;
 	offset = vaddr & PAGE_MASK;
 	if (pmap != NULL)
 		paddr = pmap_extract(pmap, vaddr);
 	else
 		paddr = pmap_kextract(vaddr);
 	seglen = MIN(len, PAGE_SIZE - offset);
 	if (sg->sg_nseg == 0) {
 		/* Empty list: the first piece becomes segment 0 directly. */
 		ss = sg->sg_segs;
 		ss->ss_paddr = paddr;
 		ss->ss_len = seglen;
 		sg->sg_nseg = 1;
 	} else {
 		/* Otherwise extend the last segment or start a new one. */
 		ss = &sg->sg_segs[sg->sg_nseg - 1];
 		error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error)
 			return (error);
 	}
 	vaddr += seglen;
 	len -= seglen;
 	if (donep)
 		*donep += seglen;
 
 	/* Remaining pages start on a page boundary. */
 	while (len > 0) {
 		seglen = MIN(len, PAGE_SIZE);
 		if (pmap != NULL)
 			paddr = pmap_extract(pmap, vaddr);
 		else
 			paddr = pmap_kextract(vaddr);
 		error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error)
 			return (error);
 		vaddr += seglen;
 		len -= seglen;
 		if (donep)
 			*donep += seglen;
 	}
 
 	return (0);
 }
 
 /*
  * Count how many scatter/gather list elements are needed to describe
  * the kernel virtual address range [buf, buf + len).  A zero-length
  * range needs none.
  */
 int
 sglist_count(void *buf, size_t len)
 {
 	vm_offset_t va, va_end;
 	vm_paddr_t prev, cur;
 	int count;
 
 	if (len == 0)
 		return (0);
 
 	va = trunc_page((vm_offset_t)buf);
 	va_end = (vm_offset_t)buf + len;
 
 	/* The first page always contributes one element. */
 	count = 1;
 	prev = pmap_kextract(va);
 
 	/*
 	 * Every later page adds an element unless it physically follows
 	 * the page before it.
 	 */
 	for (va += PAGE_SIZE; va < va_end; va += PAGE_SIZE) {
 		cur = pmap_kextract(va);
 		if (prev + PAGE_SIZE != cur)
 			count++;
 		prev = cur;
 	}
 	return (count);
 }
 
 /*
  * Count how many scatter/gather list elements are needed to describe a
  * buffer of 'len' bytes that starts 'pgoff' bytes into the first page
  * of the VM page array 'm'.
  */
 int
 sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len)
 {
 	vm_paddr_t prev, cur;
 	size_t span;
 	int count, idx;
 
 	if (len == 0)
 		return (0);
 
 	/* Measure from the start of the first page. */
 	span = len + pgoff;
 	count = 1;
 	prev = VM_PAGE_TO_PHYS(m[0]);
 
 	/* Visit each further page the buffer reaches into. */
 	idx = 1;
 	while (span > PAGE_SIZE) {
 		cur = VM_PAGE_TO_PHYS(m[idx]);
 		if (prev + PAGE_SIZE != cur)
 			count++;
 		prev = cur;
 		span -= PAGE_SIZE;
 		idx++;
 	}
 	return (count);
 }
 
 /*
  * Determine the number of scatter/gather list elements needed to
  * describe an EXT_PGS buffer.
  *
  * 'off' and 'len' select a byte range of the mbuf's data.  The range is
  * walked in three parts: the header bytes in m_epg_hdr, the pages whose
  * physical addresses are listed in m_epg_pa, and the trailer bytes in
  * m_epg_trail.
  */
 int
-sglist_count_ext_pgs(struct mbuf *m, size_t off, size_t len)
+sglist_count_mbuf_epg(struct mbuf *m, size_t off, size_t len)
 {
-	struct mbuf_ext_pgs *ext_pgs = &m->m_ext_pgs;
 	vm_paddr_t nextaddr, paddr;
 	size_t seglen, segoff;
 	int i, nsegs, pglen, pgoff;
 
 	if (len == 0)
 		return (0);
 
 	nsegs = 0;
-	if (ext_pgs->hdr_len != 0) {
-		if (off >= ext_pgs->hdr_len) {
-			off -= ext_pgs->hdr_len;
+	if (m->m_ext_pgs.hdr_len != 0) {
+		if (off >= m->m_ext_pgs.hdr_len) {
+			off -= m->m_ext_pgs.hdr_len;
 		} else {
-			seglen = ext_pgs->hdr_len - off;
+			seglen = m->m_ext_pgs.hdr_len - off;
 			segoff = off;
 			seglen = MIN(seglen, len);
 			off = 0;
 			len -= seglen;
 			nsegs += sglist_count(&m->m_epg_hdr[segoff],
 			    seglen);
 		}
 	}
 	/* 'nextaddr' is the physical address just past the previous piece. */
 	nextaddr = 0;
-	pgoff = ext_pgs->first_pg_off;
-	for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
-		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+	pgoff = m->m_ext_pgs.first_pg_off;
+	for (i = 0; i < m->m_ext_pgs.npgs && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(&m->m_ext_pgs, i, pgoff);
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = MIN(seglen, len);
 		len -= seglen;
 		paddr = m->m_epg_pa[i] + segoff;
 		/* A piece that starts where the last one ended needs no new element. */
 		if (paddr != nextaddr)
 			nsegs++;
 		nextaddr = paddr + seglen;
 		pgoff = 0;
 	};
 	/* Whatever is left of the range lies in the trailer. */
 	if (len != 0) {
-		seglen = MIN(len, ext_pgs->trail_len - off);
+		seglen = MIN(len, m->m_ext_pgs.trail_len - off);
 		len -= seglen;
 		nsegs += sglist_count(&m->m_epg_trail[off], seglen);
 	}
 	KASSERT(len == 0, ("len != 0"));
 	return (nsegs);
 }
 
 /*
- * Determine the number of scatter/gather list elements needed to
- * describe an EXT_PGS mbuf.
- */
-int
-sglist_count_mb_ext_pgs(struct mbuf *m)
-{
-
-	MBUF_EXT_PGS_ASSERT(m);
-	return (sglist_count_ext_pgs(m, mtod(m, vm_offset_t), m->m_len));
-}
-
-/*
  * Allocate a scatter/gather list along with 'nsegs' segments.  The
  * 'mflags' parameters are the same as passed to malloc(9).  The caller
  * should use sglist_free() to free this list.
  */
 struct sglist *
 sglist_alloc(int nsegs, int mflags)
 {
 	struct sglist *sg;
 
 	sg = malloc(sizeof(struct sglist) + nsegs * sizeof(struct sglist_seg),
 	    M_SGLIST, mflags);
 	if (sg == NULL)
 		return (NULL);
 	sglist_init(sg, nsegs, (struct sglist_seg *)(sg + 1));
 	return (sg);
 }
 
 /*
  * Drop a reference on a scatter/gather list allocated via
  * sglist_alloc().  The memory is released once the last reference is
  * gone.  A NULL list is ignored.
  */
 void
 sglist_free(struct sglist *sg)
 {
 
 	if (sg != NULL && refcount_release(&sg->sg_refs))
 		free(sg, M_SGLIST);
 }
 
 /*
  * Append the segments that describe one kernel virtual address range
  * to a scatter/gather list.  A list with no segment slots yields
  * EINVAL.  If the append fails (for example with EFBIG when the list
  * runs out of segments), the list is put back the way it was.
  */
 int
 sglist_append(struct sglist *sg, void *buf, size_t len)
 {
 	struct sgsave snapshot;
 	int rc;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	SGLIST_SAVE(sg, snapshot);
 	rc = _sglist_append_buf(sg, buf, len, NULL, NULL);
 	if (rc == 0)
 		return (0);
 
 	/* Undo any partial progress before reporting the failure. */
 	SGLIST_RESTORE(sg, snapshot);
 	return (rc);
 }
 
 /*
  * Append the segments that describe a bio's data to a scatter/gather
  * list.  Unmapped bios (BIO_UNMAPPED) are described through their page
  * array; all others through bio_data.  The result of the underlying
  * append routine is returned unchanged.
  *
  * NOTE: This function expects bio_bcount to be initialized.
  */
 int
 sglist_append_bio(struct sglist *sg, struct bio *bp)
 {
 
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		return (sglist_append_vmpages(sg, bp->bio_ma,
 		    bp->bio_ma_offset, bp->bio_bcount));
 
 	return (sglist_append(sg, bp->bio_data, bp->bio_bcount));
 }
 
 /*
  * Append one physical address range to a scatter/gather list.  A list
  * with no segment slots yields EINVAL; a zero-length range is accepted
  * and ignored.  If a new segment is needed and none is free, EFBIG is
  * returned and the list is left unchanged.
  */
 int
 sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
 {
 	struct sglist_seg *tail;
 	struct sgsave snapshot;
 	int rc;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 	if (len == 0)
 		return (0);
 
 	/* An empty list simply takes the range as its first segment. */
 	if (sg->sg_nseg == 0) {
 		tail = &sg->sg_segs[0];
 		tail->ss_paddr = paddr;
 		tail->ss_len = len;
 		sg->sg_nseg = 1;
 		return (0);
 	}
 
 	SGLIST_SAVE(sg, snapshot);
 	tail = &sg->sg_segs[sg->sg_nseg - 1];
 	rc = _sglist_append_range(sg, &tail, paddr, len);
 	if (rc != 0)
 		SGLIST_RESTORE(sg, snapshot);
 	return (rc);
 }
 
 /*
- * Append the segments to describe an EXT_PGS buffer to a
- * scatter/gather list.  If there are insufficient segments, then this
- * fails with EFBIG.
+ * Append the segments of single multi-page mbuf.
+ * If there are insufficient segments, then this fails with EFBIG.
  */
 int
-sglist_append_ext_pgs(struct sglist *sg, struct mbuf *m, size_t off, size_t len)
+sglist_append_mbuf_epg(struct sglist *sg, struct mbuf *m, size_t off,
+    size_t len)
 {
-	struct mbuf_ext_pgs *ext_pgs = &m->m_ext_pgs;
 	size_t seglen, segoff;
 	vm_paddr_t paddr;
 	int error, i, pglen, pgoff;
 
+	MBUF_EXT_PGS_ASSERT(m);
+
 	/*
 	 * The byte range [off, off + len) is consumed in three parts: the
 	 * header in m_epg_hdr, the pages listed in m_epg_pa, and the
 	 * trailer in m_epg_trail.  Work stops at the first append error.
 	 */
 	error = 0;
-	if (ext_pgs->hdr_len != 0) {
-		if (off >= ext_pgs->hdr_len) {
-			off -= ext_pgs->hdr_len;
+	if (m->m_ext_pgs.hdr_len != 0) {
+		if (off >= m->m_ext_pgs.hdr_len) {
+			off -= m->m_ext_pgs.hdr_len;
 		} else {
-			seglen = ext_pgs->hdr_len - off;
+			seglen = m->m_ext_pgs.hdr_len - off;
 			segoff = off;
 			seglen = MIN(seglen, len);
 			off = 0;
 			len -= seglen;
 			error = sglist_append(sg,
 			    &m->m_epg_hdr[segoff], seglen);
 		}
 	}
-	pgoff = ext_pgs->first_pg_off;
-	for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
-		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+	pgoff = m->m_ext_pgs.first_pg_off;
+	for (i = 0; i < m->m_ext_pgs.npgs && error == 0 && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(&m->m_ext_pgs, i, pgoff);
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = MIN(seglen, len);
 		len -= seglen;
 		/* Pages are appended by physical address. */
 		paddr = m->m_epg_pa[i] + segoff;
 		error = sglist_append_phys(sg, paddr, seglen);
 		pgoff = 0;
 	};
 	/* Any bytes still outstanding come from the trailer. */
 	if (error == 0 && len > 0) {
-		seglen = MIN(len, ext_pgs->trail_len - off);
+		seglen = MIN(len, m->m_ext_pgs.trail_len - off);
 		len -= seglen;
 		error = sglist_append(sg,
 		    &m->m_epg_trail[off], seglen);
 	}
 	/* On success the whole requested range must have been consumed. */
 	if (error == 0)
 		KASSERT(len == 0, ("len != 0"));
 	return (error);
 }
 
 /*
- * Append the segments to describe an EXT_PGS mbuf to a scatter/gather
- * list.  If there are insufficient segments, then this fails with
- * EFBIG.
- */
-int
-sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
-{
-
-	/* for now, all unmapped mbufs are assumed to be EXT_PGS */
-	MBUF_EXT_PGS_ASSERT(m);
-	return (sglist_append_ext_pgs(sg, m, mtod(m, vm_offset_t), m->m_len));
-}
-
-/*
  * Append the segments that describe a single mbuf chain to a
  * scatter/gather list.  If there are insufficient segments, then this
  * fails with EFBIG.
  */
 int
 sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
 {
 	struct sgsave save;
 	struct mbuf *m;
 	int error;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	error = 0;
 	/* Snapshot the list so a failure part-way down the chain can be undone. */
 	SGLIST_SAVE(sg, save);
 	for (m = m0; m != NULL; m = m->m_next) {
 		/* Empty mbufs contribute nothing. */
 		if (m->m_len > 0) {
 			/*
 			 * M_NOMAP mbufs take the EXT_PGS path; all others
 			 * are appended from m_data.
 			 */
 			if ((m->m_flags & M_NOMAP) != 0)
-				error = sglist_append_mb_ext_pgs(sg, m);
+				error = sglist_append_mbuf_epg(sg, m,
+				    mtod(m, vm_offset_t), m->m_len);
 			else
 				error = sglist_append(sg, m->m_data,
 				    m->m_len);
 			if (error) {
 				SGLIST_RESTORE(sg, save);
 				return (error);
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Append the segments that describe a buffer spanning an array of VM
  * pages.  The buffer begins at an offset of 'pgoff' in the first
  * page and is 'len' bytes long.
  *
  * Returns EINVAL if the list has no segment slots, 0 for a zero-length
  * buffer, and otherwise the result of appending each page's piece.  If
  * an append fails (EFBIG when the list is full), the list is restored
  * to its state on entry.
  */
 int
 sglist_append_vmpages(struct sglist *sg, vm_page_t *m, size_t pgoff,
     size_t len)
 {
 	struct sgsave save;
 	struct sglist_seg *ss;
 	vm_paddr_t paddr;
 	size_t seglen;
 	int error, i;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 	if (len == 0)
 		return (0);
 
 	SGLIST_SAVE(sg, save);
 	i = 0;
 	if (sg->sg_nseg == 0) {
 		/*
 		 * Empty list: the first page's piece becomes segment 0.
 		 * MIN() is used rather than min() so that the size_t
 		 * operands are compared at full width.
 		 */
 		seglen = MIN(PAGE_SIZE - pgoff, len);
 		sg->sg_segs[0].ss_paddr = VM_PAGE_TO_PHYS(m[0]) + pgoff;
 		sg->sg_segs[0].ss_len = seglen;
 		sg->sg_nseg = 1;
 		pgoff = 0;
 		len -= seglen;
 		i++;
 	}
 	ss = &sg->sg_segs[sg->sg_nseg - 1];
 	for (; len > 0; i++, len -= seglen) {
 		seglen = MIN(PAGE_SIZE - pgoff, len);
 		paddr = VM_PAGE_TO_PHYS(m[i]) + pgoff;
 		error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error) {
 			SGLIST_RESTORE(sg, save);
 			return (error);
 		}
 		/* Only the first page can start at a non-zero offset. */
 		pgoff = 0;
 	}
 	return (0);
 }
 
 /*
  * Append the segments that describe one user address range to a
  * scatter/gather list.  Addresses are translated through the pmap of
  * the process that owns thread 'td'.  A list with no segment slots
  * yields EINVAL; if the append fails (EFBIG when segments run out), the
  * list is put back the way it was.
  */
 int
 sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td)
 {
 	struct sgsave snapshot;
 	pmap_t upmap;
 	int rc;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	SGLIST_SAVE(sg, snapshot);
 	upmap = vmspace_pmap(td->td_proc->p_vmspace);
 	rc = _sglist_append_buf(sg, buf, len, upmap, NULL);
 	if (rc != 0)
 		SGLIST_RESTORE(sg, snapshot);
 	return (rc);
 }
 
 /*
  * Append a subset of an existing scatter/gather list 'source' to the
  * scatter/gather list 'sg'.  The subset is the 'length' bytes that
  * start 'offset' bytes into 'source'.
  *
  * Returns EINVAL if 'sg' has no segment slots, if 'length' is zero, or
  * if the requested range is not fully contained in 'source'.  Returns
  * EFBIG if 'sg' runs out of segments.  On any failure 'sg' is restored
  * to its state on entry.
  */
 int
 sglist_append_sglist(struct sglist *sg, struct sglist *source, size_t offset,
     size_t length)
 {
 	struct sgsave save;
 	struct sglist_seg *ss;
 	vm_paddr_t paddr;
 	size_t seglen;
 	int error, i;
 
 	if (sg->sg_maxseg == 0 || length == 0)
 		return (EINVAL);
 	SGLIST_SAVE(sg, save);
 	error = EINVAL;
 
 	/*
 	 * Only a non-empty list has a last segment that could be extended.
 	 * For an empty list 'ss' is set when the first piece is stored.
 	 */
 	ss = NULL;
 	if (sg->sg_nseg != 0)
 		ss = &sg->sg_segs[sg->sg_nseg - 1];
 
 	for (i = 0; i < source->sg_nseg; i++) {
 		/* Skip source segments that lie entirely before 'offset'. */
 		if (offset >= source->sg_segs[i].ss_len) {
 			offset -= source->sg_segs[i].ss_len;
 			continue;
 		}
 		seglen = source->sg_segs[i].ss_len - offset;
 		if (seglen > length)
 			seglen = length;
 		paddr = source->sg_segs[i].ss_paddr + offset;
 		if (sg->sg_nseg == 0) {
 			/* First piece into an empty list fills slot 0. */
 			ss = &sg->sg_segs[0];
 			ss->ss_paddr = paddr;
 			ss->ss_len = seglen;
 			sg->sg_nseg = 1;
 			error = 0;
 		} else
 			error = _sglist_append_range(sg, &ss, paddr, seglen);
 		if (error)
 			break;
 		offset = 0;
 		length -= seglen;
 		if (length == 0)
 			break;
 	}
 
 	/*
 	 * Running off the end of 'source' is an invalid range.  Do not
 	 * overwrite an error (such as EFBIG) that the append reported.
 	 */
 	if (error == 0 && length != 0)
 		error = EINVAL;
 	if (error)
 		SGLIST_RESTORE(sg, save);
 	return (error);
 }
 
 /*
  * Append the segments that describe a single uio to a scatter/gather
  * list.  If there are insufficient segments, then this fails with
  * EFBIG.
  */
 int
 sglist_append_uio(struct sglist *sg, struct uio *uio)
 {
 	struct iovec *iov;
 	struct sgsave save;
 	size_t resid, minlen;
 	pmap_t pmap;
 	int error, i;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	resid = uio->uio_resid;
 	iov = uio->uio_iov;
 
 	if (uio->uio_segflg == UIO_USERSPACE) {
 		KASSERT(uio->uio_td != NULL,
 		    ("sglist_append_uio: USERSPACE but no thread"));
 		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
 	} else
 		pmap = NULL;
 
 	error = 0;
 	SGLIST_SAVE(sg, save);
 	for (i = 0; i < uio->uio_iovcnt && resid != 0; i++) {
 		/*
 		 * Now at the first iovec to load.  Load each iovec
 		 * until we have exhausted the residual count.
 		 */
 		minlen = MIN(resid, iov[i].iov_len);
 		if (minlen > 0) {
 			error = _sglist_append_buf(sg, iov[i].iov_base, minlen,
 			    pmap, NULL);
 			if (error) {
 				SGLIST_RESTORE(sg, save);
 				return (error);
 			}
 			resid -= minlen;
 		}
 	}
 	return (0);
 }
 
 /*
  * Append the segments that describe at most 'resid' bytes from a
  * single uio to a scatter/gather list.  If there are insufficient
  * segments, then only the amount that fits is appended.
  *
  * The uio is advanced past the bytes that were appended: the current
  * iovec's base and length, uio_resid and uio_offset are all updated.
  *
  * Returns EINVAL if the list has no segment slots and 0 otherwise.
  * Running out of segments is not reported as an error; the caller can
  * tell how much was consumed from the updated uio.
  */
 int
 sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid)
 {
 	struct iovec *iov;
 	size_t done, len;
 	pmap_t pmap;
 	int error;
 
 	if (sg->sg_maxseg == 0)
 		return (EINVAL);
 
 	if (uio->uio_segflg == UIO_USERSPACE) {
 		KASSERT(uio->uio_td != NULL,
 		    ("sglist_consume_uio: USERSPACE but no thread"));
 		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
 	} else
 		pmap = NULL;
 
 	error = 0;
 	while (resid > 0 && uio->uio_resid) {
 		iov = uio->uio_iov;
 		/*
 		 * Keep the length in a size_t: iov_len is a size_t and
 		 * narrowing it could truncate or zero a large iovec.
 		 */
 		len = iov->iov_len;
 		if (len == 0) {
 			/* Exhausted iovec: step to the next one. */
 			uio->uio_iov++;
 			uio->uio_iovcnt--;
 			continue;
 		}
 		if (len > resid)
 			len = resid;
 
 		/*
 		 * Try to append this iovec.  If we run out of room,
 		 * then break out of the loop.  'done' holds the bytes
 		 * appended even when the append stops early.
 		 */
 		error = _sglist_append_buf(sg, iov->iov_base, len, pmap, &done);
 		iov->iov_base = (char *)iov->iov_base + done;
 		iov->iov_len -= done;
 		uio->uio_resid -= done;
 		uio->uio_offset += done;
 		resid -= done;
 		if (error)
 			break;
 	}
 	return (0);
 }
 
 /*
  * Allocate a scatter/gather list sized for one kernel virtual address
  * range and fill it in.  'mflags' is passed on to the allocation.
  * NULL is returned for a zero-length range, if the allocation fails,
  * or if the range cannot be appended.
  */
 struct sglist *
 sglist_build(void *buf, size_t len, int mflags)
 {
 	struct sglist *list;
 
 	if (len == 0)
 		return (NULL);
 
 	/* Size the list to exactly what the range needs. */
 	list = sglist_alloc(sglist_count(buf, len), mflags);
 	if (list == NULL)
 		return (NULL);
 
 	if (sglist_append(list, buf, len) == 0)
 		return (list);
 
 	sglist_free(list);
 	return (NULL);
 }
 
 /*
  * Make a separately allocated copy of a scatter/gather list.  The copy
  * has the same capacity as 'sg' and the same segments in use.  NULL is
  * returned if 'sg' is NULL or the allocation fails.
  */
 struct sglist *
 sglist_clone(struct sglist *sg, int mflags)
 {
 	struct sglist *copy;
 
 	if (sg == NULL)
 		return (NULL);
 
 	copy = sglist_alloc(sg->sg_maxseg, mflags);
 	if (copy != NULL) {
 		/* Only the segments currently in use need to be copied. */
 		copy->sg_nseg = sg->sg_nseg;
 		bcopy(sg->sg_segs, copy->sg_segs,
 		    sg->sg_nseg * sizeof(struct sglist_seg));
 	}
 	return (copy);
 }
 
 /*
  * Return the sum of the lengths of all segments currently in a
  * scatter/gather list.
  */
 size_t
 sglist_length(struct sglist *sg)
 {
 	size_t total;
 	int idx;
 
 	total = 0;
 	idx = 0;
 	while (idx < sg->sg_nseg) {
 		total += sg->sg_segs[idx].ss_len;
 		idx++;
 	}
 	return (total);
 }
 
 /*
  * Split a scatter/gather list into two lists.  The scatter/gather
  * entries for the first 'length' bytes of the 'original' list are
  * stored in the '*head' list and are removed from 'original'.
  *
  * If '*head' is NULL, then a new list will be allocated using
  * 'mflags'.  If M_NOWAIT is specified and the allocation fails,
  * ENOMEM will be returned.
  *
  * If '*head' is not NULL, it should point to an empty sglist.  If it
  * does not have enough room for the remaining space, then EFBIG will
  * be returned.  If '*head' is not empty, then EINVAL will be
  * returned.
  *
  * If 'original' is shared (refcount > 1), then EDOOFUS will be
  * returned.
  */
 int
 sglist_split(struct sglist *original, struct sglist **head, size_t length,
     int mflags)
 {
 	struct sglist *sg;
 	size_t space, split;
 	int count, i;
 
 	if (original->sg_refs > 1)
 		return (EDOOFUS);
 
 	/* Figure out how big of a sglist '*head' has to hold. */
 	count = 0;
 	space = 0;
 	split = 0;
 	for (i = 0; i < original->sg_nseg; i++) {
 		space += original->sg_segs[i].ss_len;
 		count++;
 		if (space >= length) {
 			/*
 			 * If 'length' falls in the middle of a
 			 * scatter/gather list entry, then 'split'
 			 * holds how much of that entry will remain in
 			 * 'original'.
 			 */
 			split = space - length;
 			break;
 		}
 	}
 
 	/* Nothing to do, so leave head empty. */
 	if (count == 0)
 		return (0);
 
 	if (*head == NULL) {
 		sg = sglist_alloc(count, mflags);
 		if (sg == NULL)
 			return (ENOMEM);
 		*head = sg;
 	} else {
 		sg = *head;
 		if (sg->sg_maxseg < count)
 			return (EFBIG);
 		if (sg->sg_nseg != 0)
 			return (EINVAL);
 	}
 
 	/* Copy 'count' entries to 'sg' from 'original'. */
 	bcopy(original->sg_segs, sg->sg_segs, count *
 	    sizeof(struct sglist_seg));
 	sg->sg_nseg = count;
 
 	/*
 	 * If we had to split a list entry, fixup the last entry in
 	 * 'sg' and the new first entry in 'original'.  We also
 	 * decrement 'count' by 1 since we will only be removing
 	 * 'count - 1' segments from 'original' now.
 	 */
 	if (split != 0) {
 		count--;
 		sg->sg_segs[count].ss_len -= split;
 		original->sg_segs[count].ss_paddr =
 		    sg->sg_segs[count].ss_paddr + split;
 		original->sg_segs[count].ss_len = split;
 	}
 
 	/*
 	 * Trim 'count' entries from the front of 'original' by sliding
 	 * the entries that remain down to the start of the array.  The
 	 * number of entries to move is the number remaining, not the
 	 * number removed.  The two regions may overlap.
 	 */
 	original->sg_nseg -= count;
 	bcopy(original->sg_segs + count, original->sg_segs,
 	    original->sg_nseg * sizeof(struct sglist_seg));
 	return (0);
 }
 
 /*
  * Append the scatter/gather list elements in 'second' to the
  * scatter/gather list 'first'.  If there is not enough space in
  * 'first', EFBIG is returned.
  */
 int
 sglist_join(struct sglist *first, struct sglist *second)
 {
 	struct sglist_seg *flast, *sfirst;
 	int append;
 
 	/* If 'second' is empty, there is nothing to do. */
 	if (second->sg_nseg == 0)
 		return (0);
 
 	/*
 	 * If the first entry in 'second' can be appended to the last entry
 	 * in 'first' then set append to '1'.
 	 */
 	append = 0;
 	flast = &first->sg_segs[first->sg_nseg - 1];
 	sfirst = &second->sg_segs[0];
 	if (first->sg_nseg != 0 &&
 	    flast->ss_paddr + flast->ss_len == sfirst->ss_paddr)
 		append = 1;
 
 	/* Make sure 'first' has enough room. */
 	if (first->sg_nseg + second->sg_nseg - append > first->sg_maxseg)
 		return (EFBIG);
 
 	/* Merge last in 'first' and first in 'second' if needed. */
 	if (append)
 		flast->ss_len += sfirst->ss_len;
 
 	/* Append new segments from 'second' to 'first'. */
 	bcopy(first->sg_segs + first->sg_nseg, second->sg_segs + append,
 	    (second->sg_nseg - append) * sizeof(struct sglist_seg));
 	first->sg_nseg += second->sg_nseg - append;
 	sglist_reset(second);
 	return (0);
 }
 
 /*
  * Generate a new scatter/gather list from a range of an existing
  * scatter/gather list.  The 'offset' and 'length' parameters specify
  * the logical range of the 'original' list to extract.  If that range
  * is not a subset of the length of 'original', then EINVAL is
  * returned.  The new scatter/gather list is stored in '*slice'.
  * A 'length' of zero does nothing and returns 0.
  *
  * If '*slice' is NULL, then a new list will be allocated using
  * 'mflags'.  If M_NOWAIT is specified and the allocation fails,
  * ENOMEM will be returned.
  *
  * If '*slice' is not NULL, it should point to an empty sglist.  If it
  * does not have enough room for the remaining space, then EFBIG will
  * be returned.  If '*slice' is not empty, then EINVAL will be
  * returned.
  */
 int
 sglist_slice(struct sglist *original, struct sglist **slice, size_t offset,
     size_t length, int mflags)
 {
 	struct sglist *sg;
 	size_t space, end, foffs, loffs;
 	int count, i, fseg;
 
 	/* Nothing to do. */
 	if (length == 0)
 		return (0);
 
 	/*
 	 * Figure out how many segments '*slice' needs to have.  A range
 	 * whose end wraps around cannot be a subset of 'original'.
 	 */
 	end = offset + length;
 	if (end < offset)
 		return (EINVAL);
 	space = 0;
 	count = 0;
 	fseg = 0;
 	foffs = loffs = 0;
 	for (i = 0; i < original->sg_nseg; i++) {
 		space += original->sg_segs[i].ss_len;
 		if (space > offset) {
 			/*
 			 * When we hit the first segment, store its index
 			 * in 'fseg' and the offset into the first segment
 			 * of 'offset' in 'foffs'.
 			 */
 			if (count == 0) {
 				fseg = i;
 				foffs = offset - (space -
 				    original->sg_segs[i].ss_len);
 				CTR1(KTR_DEV, "sglist_slice: foffs = %08lx",
 				    foffs);
 			}
 			count++;
 
 			/*
 			 * When we hit the last segment, break out of
 			 * the loop.  Store the amount of extra space
 			 * at the end of this segment in 'loffs'.
 			 */
 			if (space >= end) {
 				loffs = space - end;
 				CTR1(KTR_DEV, "sglist_slice: loffs = %08lx",
 				    loffs);
 				break;
 			}
 		}
 	}
 
 	/* If we never hit 'end', then 'length' ran off the end, so fail. */
 	if (space < end)
 		return (EINVAL);
 
 	if (*slice == NULL) {
 		sg = sglist_alloc(count, mflags);
 		if (sg == NULL)
 			return (ENOMEM);
 		*slice = sg;
 	} else {
 		sg = *slice;
 		if (sg->sg_maxseg < count)
 			return (EFBIG);
 		if (sg->sg_nseg != 0)
 			return (EINVAL);
 	}
 
 	/*
 	 * Copy over 'count' segments from 'original' starting at
 	 * 'fseg' to 'sg'.
 	 */
 	bcopy(original->sg_segs + fseg, sg->sg_segs,
 	    count * sizeof(struct sglist_seg));
 	sg->sg_nseg = count;
 
 	/* Fixup first and last segments if needed. */
 	if (foffs != 0) {
 		sg->sg_segs[0].ss_paddr += foffs;
 		sg->sg_segs[0].ss_len -= foffs;
 		CTR2(KTR_DEV, "sglist_slice seg[0]: %08lx:%08lx",
 		    (long)sg->sg_segs[0].ss_paddr, sg->sg_segs[0].ss_len);
 	}
 	if (loffs != 0) {
 		sg->sg_segs[count - 1].ss_len -= loffs;
 		/* ss_len is a size_t; use the same width as the traces above. */
 		CTR2(KTR_DEV, "sglist_slice seg[%d]: len %08lx", count - 1,
 		    sg->sg_segs[count - 1].ss_len);
 	}
 	return (0);
 }
Index: head/sys/sys/sglist.h
===================================================================
--- head/sys/sys/sglist.h	(revision 360573)
+++ head/sys/sys/sglist.h	(revision 360574)
@@ -1,118 +1,116 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 Yahoo!, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * A scatter/gather list describes a group of physical address ranges.
  * Each physical address range consists of a starting address and a
  * length.
  */
 
 #ifndef __SGLIST_H__
 #define	__SGLIST_H__
 
 #include <sys/refcount.h>
 
 /* One physical address range: a starting address and a length. */
 struct sglist_seg {
 	vm_paddr_t	ss_paddr;	/* starting physical address */
 	size_t		ss_len;		/* length of the range in bytes */
 };
 
 /* A scatter/gather list: an array of segments plus bookkeeping. */
 struct sglist {
 	struct sglist_seg *sg_segs;	/* segment array */
 	u_int		sg_refs;	/* reference count */
 	u_short		sg_nseg;	/* segments currently in use */
 	u_short		sg_maxseg;	/* capacity of sg_segs */
 };
 
 struct bio;
 struct mbuf;
 struct uio;
 
 /*
  * Initialize a scatter/gather list header to use the segment array
  * 'segs', which has room for 'maxsegs' entries.  The list starts out
  * empty with a reference count of 1.
  */
 static __inline void
 sglist_init(struct sglist *sg, u_short maxsegs, struct sglist_seg *segs)
 {
 
 	refcount_init(&sg->sg_refs, 1);
 	sg->sg_maxseg = maxsegs;
 	sg->sg_nseg = 0;
 	sg->sg_segs = segs;
 }
 
 /*
  * Empty a scatter/gather list by setting its in-use segment count to
  * zero.  The segment array and capacity are left as they are.
  */
 static __inline void
 sglist_reset(struct sglist *sg)
 {
 
 	sg->sg_nseg = 0;
 }
 
 /*
  * Acquire an additional reference on a scatter/gather list and return
  * the same list pointer.
  */
 static __inline struct sglist *
 sglist_hold(struct sglist *sg)
 {
 
 	refcount_acquire(&sg->sg_refs);
 	return (sg);
 }
 
 struct sglist *sglist_alloc(int nsegs, int mflags);
 int	sglist_append(struct sglist *sg, void *buf, size_t len);
 int	sglist_append_bio(struct sglist *sg, struct bio *bp);
-int	sglist_append_ext_pgs(struct sglist *sg, struct mbuf *m, size_t off,
-	    size_t len);
-int	sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m);
 int	sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
+int	sglist_append_mbuf_epg(struct sglist *sg, struct mbuf *m0, size_t off,
+	    size_t len);
 int	sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
 	    size_t len);
 int	sglist_append_sglist(struct sglist *sg, struct sglist *source,
 	    size_t offset, size_t length);
 int	sglist_append_uio(struct sglist *sg, struct uio *uio);
 int	sglist_append_user(struct sglist *sg, void *buf, size_t len,
 	    struct thread *td);
 int	sglist_append_vmpages(struct sglist *sg, vm_page_t *m, size_t pgoff,
 	    size_t len);
 struct sglist *sglist_build(void *buf, size_t len, int mflags);
 struct sglist *sglist_clone(struct sglist *sg, int mflags);
 int	sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
 int	sglist_count(void *buf, size_t len);
-int	sglist_count_ext_pgs(struct mbuf *m, size_t off, size_t len);
-int	sglist_count_mb_ext_pgs(struct mbuf *m);
+int	sglist_count_mbuf_epg(struct mbuf *m, size_t off, size_t len);
 int	sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
 void	sglist_free(struct sglist *sg);
 int	sglist_join(struct sglist *first, struct sglist *second);
 size_t	sglist_length(struct sglist *sg);
 int	sglist_slice(struct sglist *original, struct sglist **slice,
 	    size_t offset, size_t length, int mflags);
 int	sglist_split(struct sglist *original, struct sglist **head,
 	    size_t length, int mflags);
 
 #endif	/* !__SGLIST_H__ */