Index: head/sys/dev/cxgb/cxgb_adapter.h =================================================================== --- head/sys/dev/cxgb/cxgb_adapter.h (revision 278976) +++ head/sys/dev/cxgb/cxgb_adapter.h (revision 278977) @@ -1,580 +1,579 @@ /************************************************************************** Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. $FreeBSD$ ***************************************************************************/ #ifndef _CXGB_ADAPTER_H_ #define _CXGB_ADAPTER_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include struct adapter; struct sge_qset; extern int cxgb_debug; #ifdef DEBUG_LOCKING #define MTX_INIT(lock, lockname, class, flags) \ do { \ printf("initializing %s at %s:%d\n", lockname, __FILE__, __LINE__); \ mtx_init((lock), lockname, class, flags); \ } while (0) #define MTX_DESTROY(lock) \ do { \ printf("destroying %s at %s:%d\n", (lock)->lock_object.lo_name, __FILE__, __LINE__); \ mtx_destroy((lock)); \ } while (0) #else #define MTX_INIT mtx_init #define MTX_DESTROY mtx_destroy #endif enum { LF_NO = 0, LF_MAYBE, LF_YES }; struct port_info { struct adapter *adapter; struct ifnet *ifp; int if_flags; int flags; const struct port_type_info *port_type; struct cphy phy; struct cmac mac; struct timeval last_refreshed; struct link_config link_config; struct ifmedia media; struct mtx lock; uint32_t port_id; uint32_t tx_chan; uint32_t txpkt_intf; uint32_t first_qset; uint32_t nqsets; int link_fault; uint8_t hw_addr[ETHER_ADDR_LEN]; struct callout link_check_ch; struct task link_check_task; struct task timer_reclaim_task; struct cdev *port_cdev; #define PORT_LOCK_NAME_LEN 32 #define PORT_NAME_LEN 32 char lockbuf[PORT_LOCK_NAME_LEN]; char namebuf[PORT_NAME_LEN]; } __aligned(L1_CACHE_BYTES); enum { /* adapter flags */ FULL_INIT_DONE = (1 << 0), USING_MSI = (1 << 1), USING_MSIX = (1 << 2), QUEUES_BOUND = (1 << 3), FW_UPTODATE = (1 << 4), TPS_UPTODATE = (1 << 5), CXGB_SHUTDOWN = (1 << 6), CXGB_OFLD_INIT = (1 << 7), TP_PARITY_INIT = (1 << 8), CXGB_BUSY = (1 << 9), TOM_INIT_DONE = (1 << 10), /* port flags */ DOOMED = (1 << 0), }; #define IS_DOOMED(p) (p->flags & DOOMED) #define SET_DOOMED(p) do {p->flags |= 
DOOMED;} while (0) #define IS_BUSY(sc) (sc->flags & CXGB_BUSY) #define SET_BUSY(sc) do {sc->flags |= CXGB_BUSY;} while (0) #define CLR_BUSY(sc) do {sc->flags &= ~CXGB_BUSY;} while (0) #define FL_Q_SIZE 4096 #define JUMBO_Q_SIZE 1024 #define RSPQ_Q_SIZE 2048 #define TX_ETH_Q_SIZE 1024 #define TX_OFLD_Q_SIZE 1024 #define TX_CTRL_Q_SIZE 256 enum { TXQ_ETH = 0, TXQ_OFLD = 1, TXQ_CTRL = 2, }; /* * work request size in bytes */ #define WR_LEN (WR_FLITS * 8) #define PIO_LEN (WR_LEN - sizeof(struct cpl_tx_pkt_lso)) struct lro_state { unsigned short enabled; struct lro_ctrl ctrl; }; #define RX_BUNDLE_SIZE 8 struct rsp_desc; struct sge_rspq { uint32_t credits; uint32_t size; uint32_t cidx; uint32_t gen; uint32_t polling; uint32_t holdoff_tmr; uint32_t next_holdoff; uint32_t imm_data; uint32_t async_notif; uint32_t cntxt_id; uint32_t offload_pkts; uint32_t pure_rsps; uint32_t unhandled_irqs; uint32_t starved; bus_addr_t phys_addr; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; struct t3_mbuf_hdr rspq_mh; struct rsp_desc *desc; struct mtx lock; #define RSPQ_NAME_LEN 32 char lockbuf[RSPQ_NAME_LEN]; uint32_t rspq_dump_start; uint32_t rspq_dump_count; }; struct rx_desc; struct rx_sw_desc; struct sge_fl { uint32_t buf_size; uint32_t credits; uint32_t size; uint32_t cidx; uint32_t pidx; uint32_t gen; uint32_t db_pending; bus_addr_t phys_addr; uint32_t cntxt_id; uint32_t empty; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_dma_tag_t entry_tag; uma_zone_t zone; struct rx_desc *desc; struct rx_sw_desc *sdesc; int type; }; struct tx_desc; struct tx_sw_desc; #define TXQ_TRANSMITTING 0x1 struct sge_txq { uint64_t flags; uint32_t in_use; uint32_t size; uint32_t processed; uint32_t cleaned; uint32_t stop_thres; uint32_t cidx; uint32_t pidx; uint32_t gen; uint32_t unacked; uint32_t db_pending; struct tx_desc *desc; struct tx_sw_desc *sdesc; uint32_t token; bus_addr_t phys_addr; struct task qresume_task; struct task qreclaim_task; uint32_t cntxt_id; uint64_t stops; uint64_t restarts; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_dma_tag_t entry_tag; - struct mbuf_head sendq; + struct mbufq sendq; struct buf_ring *txq_mr; struct ifaltq *txq_ifq; struct callout txq_timer; struct callout txq_watchdog; uint64_t txq_coalesced; uint32_t txq_skipped; uint32_t txq_enqueued; uint32_t txq_dump_start; uint32_t txq_dump_count; uint64_t txq_direct_packets; uint64_t txq_direct_bytes; uint64_t txq_frees; struct sg_ent txq_sgl[TX_MAX_SEGS / 2 + 1]; }; #define SGE_PSTAT_MAX (SGE_PSTAT_VLANINS+1) #define QS_EXITING 0x1 #define QS_RUNNING 0x2 #define QS_BOUND 0x4 #define QS_FLUSHING 0x8 #define QS_TIMEOUT 0x10 struct sge_qset { struct sge_rspq rspq; struct sge_fl fl[SGE_RXQ_PER_SET]; struct lro_state lro; struct sge_txq txq[SGE_TXQ_PER_SET]; uint32_t txq_stopped; /* which Tx queues are stopped */ struct port_info *port; struct adapter *adap; int idx; /* qset # */ int qs_flags; int coalescing; struct cv qs_cv; struct mtx lock; #define QS_NAME_LEN 32 char namebuf[QS_NAME_LEN]; }; struct sge { struct sge_qset qs[SGE_QSETS]; struct mtx reg_lock; }; struct filter_info; typedef int (*cpl_handler_t)(struct sge_qset *, struct rsp_desc *, struct mbuf *); struct adapter { SLIST_ENTRY(adapter) link; device_t dev; int flags; /* PCI register resources */ int regs_rid; struct resource *regs_res; int udbs_rid; struct resource *udbs_res; bus_space_handle_t bh; bus_space_tag_t bt; bus_size_t mmio_len; uint32_t link_width; /* DMA resources */ bus_dma_tag_t parent_dmat; bus_dma_tag_t rx_dmat; bus_dma_tag_t rx_jumbo_dmat; bus_dma_tag_t tx_dmat; 
/* Interrupt resources */ struct resource *irq_res; int irq_rid; void *intr_tag; uint32_t msix_regs_rid; struct resource *msix_regs_res; struct resource *msix_irq_res[SGE_QSETS]; int msix_irq_rid[SGE_QSETS]; void *msix_intr_tag[SGE_QSETS]; uint8_t rxpkt_map[8]; /* maps RX_PKT interface values to port ids */ uint8_t rrss_map[SGE_QSETS]; /* revers RSS map table */ uint16_t rspq_map[RSS_TABLE_SIZE]; /* maps 7-bit cookie to qidx */ union { uint8_t fill[SGE_QSETS]; uint64_t coalesce; } u; #define tunq_fill u.fill #define tunq_coalesce u.coalesce struct filter_info *filters; /* Tasks */ struct task slow_intr_task; struct task tick_task; struct taskqueue *tq; struct callout cxgb_tick_ch; struct callout sge_timer_ch; /* Register lock for use by the hardware layer */ struct mtx mdio_lock; struct mtx elmer_lock; /* Bookkeeping for the hardware layer */ struct adapter_params params; unsigned int slow_intr_mask; unsigned long irq_stats[IRQ_NUM_STATS]; struct sge sge; struct mc7 pmrx; struct mc7 pmtx; struct mc7 cm; struct mc5 mc5; struct port_info port[MAX_NPORTS]; device_t portdev[MAX_NPORTS]; #ifdef TCP_OFFLOAD void *tom_softc; void *iwarp_softc; #endif char fw_version[64]; char port_types[MAX_NPORTS + 1]; uint32_t open_device_map; #ifdef TCP_OFFLOAD int offload_map; #endif struct mtx lock; driver_intr_t *cxgb_intr; int msi_count; #define ADAPTER_LOCK_NAME_LEN 32 char lockbuf[ADAPTER_LOCK_NAME_LEN]; char reglockbuf[ADAPTER_LOCK_NAME_LEN]; char mdiolockbuf[ADAPTER_LOCK_NAME_LEN]; char elmerlockbuf[ADAPTER_LOCK_NAME_LEN]; int timestamp; #ifdef TCP_OFFLOAD #define NUM_CPL_HANDLERS 0xa7 cpl_handler_t cpl_handler[NUM_CPL_HANDLERS] __aligned(CACHE_LINE_SIZE); #endif }; struct t3_rx_mode { uint32_t idx; struct port_info *port; }; #define MDIO_LOCK(adapter) mtx_lock(&(adapter)->mdio_lock) #define MDIO_UNLOCK(adapter) mtx_unlock(&(adapter)->mdio_lock) #define ELMR_LOCK(adapter) mtx_lock(&(adapter)->elmer_lock) #define ELMR_UNLOCK(adapter) mtx_unlock(&(adapter)->elmer_lock) #define PORT_LOCK(port) mtx_lock(&(port)->lock); #define PORT_UNLOCK(port) mtx_unlock(&(port)->lock); #define PORT_LOCK_INIT(port, name) mtx_init(&(port)->lock, name, 0, MTX_DEF) #define PORT_LOCK_DEINIT(port) mtx_destroy(&(port)->lock) #define PORT_LOCK_ASSERT_NOTOWNED(port) mtx_assert(&(port)->lock, MA_NOTOWNED) #define PORT_LOCK_ASSERT_OWNED(port) mtx_assert(&(port)->lock, MA_OWNED) #define ADAPTER_LOCK(adap) mtx_lock(&(adap)->lock); #define ADAPTER_UNLOCK(adap) mtx_unlock(&(adap)->lock); #define ADAPTER_LOCK_INIT(adap, name) mtx_init(&(adap)->lock, name, 0, MTX_DEF) #define ADAPTER_LOCK_DEINIT(adap) mtx_destroy(&(adap)->lock) #define ADAPTER_LOCK_ASSERT_NOTOWNED(adap) mtx_assert(&(adap)->lock, MA_NOTOWNED) #define ADAPTER_LOCK_ASSERT_OWNED(adap) mtx_assert(&(adap)->lock, MA_OWNED) static __inline uint32_t t3_read_reg(adapter_t *adapter, uint32_t reg_addr) { return (bus_space_read_4(adapter->bt, adapter->bh, reg_addr)); } static __inline void t3_write_reg(adapter_t *adapter, uint32_t reg_addr, uint32_t val) { bus_space_write_4(adapter->bt, adapter->bh, reg_addr, val); } static __inline void t3_os_pci_read_config_4(adapter_t *adapter, int reg, uint32_t *val) { *val = pci_read_config(adapter->dev, reg, 4); } static __inline void t3_os_pci_write_config_4(adapter_t *adapter, int reg, uint32_t val) { pci_write_config(adapter->dev, reg, val, 4); } static __inline void t3_os_pci_read_config_2(adapter_t *adapter, int reg, uint16_t *val) { *val = pci_read_config(adapter->dev, reg, 2); } static __inline void t3_os_pci_write_config_2(adapter_t 
*adapter, int reg, uint16_t val) { pci_write_config(adapter->dev, reg, val, 2); } static __inline uint8_t * t3_get_next_mcaddr(struct t3_rx_mode *rm) { uint8_t *macaddr = NULL; struct ifnet *ifp = rm->port->ifp; struct ifmultiaddr *ifma; int i = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (i == rm->idx) { macaddr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); break; } i++; } if_maddr_runlock(ifp); rm->idx++; return (macaddr); } static __inline void t3_init_rx_mode(struct t3_rx_mode *rm, struct port_info *port) { rm->idx = 0; rm->port = port; } static __inline struct port_info * adap2pinfo(struct adapter *adap, int idx) { return &adap->port[idx]; } int t3_os_find_pci_capability(adapter_t *adapter, int cap); int t3_os_pci_save_state(struct adapter *adapter); int t3_os_pci_restore_state(struct adapter *adapter); void t3_os_link_intr(struct port_info *); void t3_os_link_changed(adapter_t *adapter, int port_id, int link_status, int speed, int duplex, int fc, int mac_was_reset); void t3_os_phymod_changed(struct adapter *adap, int port_id); void t3_sge_err_intr_handler(adapter_t *adapter); #ifdef TCP_OFFLOAD int t3_offload_tx(struct adapter *, struct mbuf *); #endif void t3_os_set_hw_addr(adapter_t *adapter, int port_idx, u8 hw_addr[]); int t3_mgmt_tx(adapter_t *adap, struct mbuf *m); int t3_register_cpl_handler(struct adapter *, int, cpl_handler_t); int t3_sge_alloc(struct adapter *); int t3_sge_free(struct adapter *); int t3_sge_alloc_qset(adapter_t *, uint32_t, int, int, const struct qset_params *, int, struct port_info *); void t3_free_sge_resources(adapter_t *, int); void t3_sge_start(adapter_t *); void t3_sge_stop(adapter_t *); void t3b_intr(void *data); void t3_intr_msi(void *data); void t3_intr_msix(void *data); int t3_sge_init_adapter(adapter_t *); int t3_sge_reset_adapter(adapter_t *); int t3_sge_init_port(struct port_info *); void t3_free_tx_desc(struct sge_qset *qs, int n, int qid); void t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad); void t3_add_attach_sysctls(adapter_t *sc); void t3_add_configured_sysctls(adapter_t *sc); int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data); void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p); /* * XXX figure out how we can return this to being private to sge */ #define desc_reclaimable(q) ((int)((q)->processed - (q)->cleaned - TX_MAX_DESC)) #define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) static __inline struct sge_qset * fl_to_qset(struct sge_fl *q, int qidx) { return container_of(q, struct sge_qset, fl[qidx]); } static __inline struct sge_qset * rspq_to_qset(struct sge_rspq *q) { return container_of(q, struct sge_qset, rspq); } static __inline struct sge_qset * txq_to_qset(struct sge_txq *q, int qidx) { return container_of(q, struct sge_qset, txq[qidx]); } #undef container_of #define OFFLOAD_DEVMAP_BIT (1 << MAX_NPORTS) static inline int offload_running(adapter_t *adapter) { return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT); } void cxgb_tx_watchdog(void *arg); int cxgb_transmit(struct ifnet *ifp, struct mbuf *m); void cxgb_qflush(struct ifnet *ifp); void t3_iterate(void (*)(struct adapter *, void *), void *); void cxgb_refresh_stats(struct port_info *); #endif Index: head/sys/dev/cxgb/cxgb_sge.c =================================================================== --- head/sys/dev/cxgb/cxgb_sge.c (revision 278976) +++ 
head/sys/dev/cxgb/cxgb_sge.c (revision 278977) @@ -1,3720 +1,3721 @@ /************************************************************************** Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int txq_fills = 0; int multiq_tx_enable = 1; #ifdef TCP_OFFLOAD CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS); #endif extern struct sysctl_oid_list sysctl__hw_cxgb_children; int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE; SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0, "size of per-queue mbuf ring"); static int cxgb_tx_coalesce_force = 0; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN, &cxgb_tx_coalesce_force, 0, "coalesce small packets into a single work request regardless of ring state"); #define COALESCE_START_DEFAULT TX_ETH_Q_SIZE>>1 #define COALESCE_START_MAX (TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3)) #define COALESCE_STOP_DEFAULT TX_ETH_Q_SIZE>>2 #define COALESCE_STOP_MIN TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_DEFAULT TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_MAX TX_ETH_Q_SIZE>>2 #define TX_RECLAIM_MIN TX_ETH_Q_SIZE>>6 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_start, 0, "coalesce enable threshold"); static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_stop, 0, "coalesce disable threshold"); static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN, &cxgb_tx_reclaim_threshold, 0, "tx cleaning minimum threshold"); /* * XXX don't re-enable this until TOE stops assuming * we have an m_ext */ static int recycle_enable = 0; extern int cxgb_use_16k_clusters; extern int nmbjumbop; extern int 
nmbjumbo9; extern int nmbjumbo16; #define USE_GTS 0 #define SGE_RX_SM_BUF_SIZE 1536 #define SGE_RX_DROP_THRES 16 #define SGE_RX_COPY_THRES 128 /* * Period of the Tx buffer reclaim timer. This timer does not need to run * frequently as Tx buffers are usually reclaimed by new Tx packets. */ #define TX_RECLAIM_PERIOD (hz >> 1) /* * Values for sge_txq.flags */ enum { TXQ_RUNNING = 1 << 0, /* fetch engine is running */ TXQ_LAST_PKT_DB = 1 << 1, /* last packet rang the doorbell */ }; struct tx_desc { uint64_t flit[TX_DESC_FLITS]; } __packed; struct rx_desc { uint32_t addr_lo; uint32_t len_gen; uint32_t gen2; uint32_t addr_hi; } __packed; struct rsp_desc { /* response queue descriptor */ struct rss_header rss_hdr; uint32_t flags; uint32_t len_cq; uint8_t imm_data[47]; uint8_t intr_gen; } __packed; #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) #define RSPQ_NSOP_NEOP G_RSPD_SOP_EOP(0) #define RSPQ_EOP G_RSPD_SOP_EOP(F_RSPD_EOP) #define RSPQ_SOP G_RSPD_SOP_EOP(F_RSPD_SOP) #define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP) struct tx_sw_desc { /* SW state per Tx descriptor */ struct mbuf *m; bus_dmamap_t map; int flags; }; struct rx_sw_desc { /* SW state per Rx descriptor */ caddr_t rxsd_cl; struct mbuf *m; bus_dmamap_t map; int flags; }; struct txq_state { unsigned int compl; unsigned int gen; unsigned int pidx; }; struct refill_fl_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; /* * Maps a number of flits to the number of Tx descriptors that can hold them. * The formula is * * desc = 1 + (flits - 2) / (WR_FLITS - 1). * * HW allows up to 4 descriptors to be combined into a WR. */ static uint8_t flit_desc_map[] = { 0, #if SGE_NUM_GENBITS == 1 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 #elif SGE_NUM_GENBITS == 2 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, #else # error "SGE_NUM_GENBITS must be 1 or 2" #endif }; #define TXQ_LOCK_ASSERT(qs) mtx_assert(&(qs)->lock, MA_OWNED) #define TXQ_TRYLOCK(qs) mtx_trylock(&(qs)->lock) #define TXQ_LOCK(qs) mtx_lock(&(qs)->lock) #define TXQ_UNLOCK(qs) mtx_unlock(&(qs)->lock) #define TXQ_RING_EMPTY(qs) drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_NEEDS_ENQUEUE(qs) \ drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_FLUSH(qs) drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_DEQUEUE_COND(qs, func, arg) \ drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg) #define TXQ_RING_DEQUEUE(qs) \ drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) int cxgb_debug = 0; static void sge_timer_cb(void *arg); static void sge_timer_reclaim(void *arg, int ncount); static void sge_txq_reclaim_handler(void *arg, int ncount); static void cxgb_start_locked(struct sge_qset *qs); /* * XXX need to cope with bursty scheduling by looking at a wider * window than we are now for determining the need for coalescing * */ static __inline uint64_t check_pkt_coalesce(struct sge_qset *qs) { struct adapter *sc; struct sge_txq *txq; uint8_t *fill; if (__predict_false(cxgb_tx_coalesce_force)) return (1); txq = &qs->txq[TXQ_ETH]; sc = qs->port->adapter; fill = &sc->tunq_fill[qs->idx]; if (cxgb_tx_coalesce_enable_start > 
COALESCE_START_MAX) cxgb_tx_coalesce_enable_start = COALESCE_START_MAX; if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN) cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN; /* * if the hardware transmit queue is more than 1/8 full * we mark it as coalescing - we drop back from coalescing * when we go below 1/32 full and there are no packets enqueued, * this provides us with some degree of hysteresis */ if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs) && (qs->coalescing == 0)) *fill = 0; else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) *fill = 1; return (sc->tunq_coalesce); } #ifdef __LP64__ static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { uint64_t wr_hilo; #if _BYTE_ORDER == _LITTLE_ENDIAN wr_hilo = wr_hi; wr_hilo |= (((uint64_t)wr_lo)<<32); #else wr_hilo = wr_lo; wr_hilo |= (((uint64_t)wr_hi)<<32); #endif wrp->wrh_hilo = wr_hilo; } #else static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { wrp->wrh_hi = wr_hi; wmb(); wrp->wrh_lo = wr_lo; } #endif struct coalesce_info { int count; int nbytes; }; static int coalesce_check(struct mbuf *m, void *arg) { struct coalesce_info *ci = arg; int *count = &ci->count; int *nbytes = &ci->nbytes; if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) && (*count < 7) && (m->m_next == NULL))) { *count += 1; *nbytes += m->m_len; return (1); } return (0); } static struct mbuf * cxgb_dequeue(struct sge_qset *qs) { struct mbuf *m, *m_head, *m_tail; struct coalesce_info ci; if (check_pkt_coalesce(qs) == 0) return TXQ_RING_DEQUEUE(qs); m_head = m_tail = NULL; ci.count = ci.nbytes = 0; do { m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci); if (m_head == NULL) { m_tail = m_head = m; } else if (m != NULL) { m_tail->m_nextpkt = m; m_tail = m; } } while (m != NULL); if (ci.count > 7) panic("trying to coalesce %d packets in to one WR", ci.count); return (m_head); } /** * reclaim_completed_tx - reclaims completed Tx descriptors * @adapter: the adapter * @q: the Tx queue to reclaim completed descriptors from * * Reclaims Tx descriptors that the SGE has indicated it has processed, * and frees the associated buffers if possible. Called with the Tx * queue's lock held. */ static __inline int reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue) { struct sge_txq *q = &qs->txq[queue]; int reclaim = desc_reclaimable(q); if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) || (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN)) cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; if (reclaim < reclaim_min) return (0); mtx_assert(&qs->lock, MA_OWNED); if (reclaim > 0) { t3_free_tx_desc(qs, reclaim, queue); q->cleaned += reclaim; q->in_use -= reclaim; } if (isset(&qs->txq_stopped, TXQ_ETH)) clrbit(&qs->txq_stopped, TXQ_ETH); return (reclaim); } /** * should_restart_tx - are there enough resources to restart a Tx queue? * @q: the Tx queue * * Checks if there are enough descriptors to restart a suspended Tx queue. */ static __inline int should_restart_tx(const struct sge_txq *q) { unsigned int r = q->processed - q->cleaned; return q->in_use - r < (q->size >> 1); } /** * t3_sge_init - initialize SGE * @adap: the adapter * @p: the SGE parameters * * Performs SGE initialization needed every time after a chip reset. * We do not initialize any of the queue sets here, instead the driver * top-level must request those individually. We also do not enable DMA * here, that should be done after the queues have been set up. 
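 *
 *	Editorial note, not part of the original comment: the coalescing
 *	hysteresis implemented by check_pkt_coalesce() above works off two
 *	tunables.  With the defaults (TX_ETH_Q_SIZE = 1024, so
 *	tx_coalesce_enable_start = 512 and tx_coalesce_enable_stop = 256),
 *	a queue set is marked for coalescing once txq->in_use reaches 512,
 *	and the mark is only cleared again when in_use has dropped to 256 or
 *	less, the buf_ring is empty, and the qset is no longer coalescing.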
*/ void t3_sge_init(adapter_t *adap, struct sge_params *p) { u_int ctrl, ups; ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */ ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL | F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN | V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS | V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING; #if SGE_NUM_GENBITS == 1 ctrl |= F_EGRGENCTRL; #endif if (adap->params.rev > 0) { if (!(adap->flags & (USING_MSIX | USING_MSI))) ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ; } t3_write_reg(adap, A_SG_CONTROL, ctrl); t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) | V_LORCQDRBTHRSH(512)); t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10); t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) | V_TIMEOUT(200 * core_ticks_per_usec(adap))); t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, adap->params.rev < T3_REV_C ? 1000 : 500); t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000); t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff)); t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024); } /** * sgl_len - calculates the size of an SGL of the given capacity * @n: the number of SGL entries * * Calculates the number of flits needed for a scatter/gather list that * can hold the given number of entries. */ static __inline unsigned int sgl_len(unsigned int n) { return ((3 * n) / 2 + (n & 1)); } /** * get_imm_packet - return the next ingress packet buffer from a response * @resp: the response descriptor containing the packet data * * Return a packet containing the immediate data of the given response. */ static int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m) { if (resp->rss_hdr.opcode == CPL_RX_DATA) { const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else if (resp->rss_hdr.opcode == CPL_RX_PKT) { const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else m->m_len = IMMED_PKT_SIZE; m->m_ext.ext_buf = NULL; m->m_ext.ext_type = 0; memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len); return (0); } static __inline u_int flits_to_desc(u_int n) { return (flit_desc_map[n]); } #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ F_HIRCQPARITYERROR) #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR) #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \ F_RSPQDISABLED) /** * t3_sge_err_intr_handler - SGE async event interrupt handler * @adapter: the adapter * * Interrupt handler for SGE asynchronous (non-data) events. 
*/ void t3_sge_err_intr_handler(adapter_t *adapter) { unsigned int v, status; status = t3_read_reg(adapter, A_SG_INT_CAUSE); if (status & SGE_PARERR) CH_ALERT(adapter, "SGE parity error (0x%x)\n", status & SGE_PARERR); if (status & SGE_FRAMINGERR) CH_ALERT(adapter, "SGE framing error (0x%x)\n", status & SGE_FRAMINGERR); if (status & F_RSPQCREDITOVERFOW) CH_ALERT(adapter, "SGE response queue credit overflow\n"); if (status & F_RSPQDISABLED) { v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS); CH_ALERT(adapter, "packet delivered to disabled response queue (0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff); } t3_write_reg(adapter, A_SG_INT_CAUSE, status); if (status & SGE_FATALERR) t3_fatal_err(adapter); } void t3_sge_prep(adapter_t *adap, struct sge_params *p) { int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size; nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus); nqsets *= adap->params.nports; fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE); while (!powerof2(fl_q_size)) fl_q_size--; use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters : is_offload(adap); #if __FreeBSD_version >= 700111 if (use_16k) { jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM16BYTES; } else { jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM9BYTES; } #else jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUMPAGESIZE; #endif while (!powerof2(jumbo_q_size)) jumbo_q_size--; if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2)) device_printf(adap->dev, "Insufficient clusters and/or jumbo buffers.\n"); p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data); for (i = 0; i < SGE_QSETS; ++i) { struct qset_params *q = p->qset + i; if (adap->params.nports > 2) { q->coalesce_usecs = 50; } else { #ifdef INVARIANTS q->coalesce_usecs = 10; #else q->coalesce_usecs = 5; #endif } q->polling = 0; q->rspq_size = RSPQ_Q_SIZE; q->fl_size = fl_q_size; q->jumbo_size = jumbo_q_size; q->jumbo_buf_size = jumbo_buf_size; q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE; q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16; q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE; q->cong_thres = 0; } } int t3_sge_alloc(adapter_t *sc) { /* The parent tag. */ if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ BUS_SPACE_UNRESTRICTED, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lock, lockarg */ &sc->parent_dmat)) { device_printf(sc->dev, "Cannot allocate parent DMA tag\n"); return (ENOMEM); } /* * DMA tag for normal sized RX frames */ if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) { device_printf(sc->dev, "Cannot allocate RX DMA tag\n"); return (ENOMEM); } /* * DMA tag for jumbo sized RX frames. */ if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) { device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n"); return (ENOMEM); } /* * DMA tag for TX frames. 
*/ if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->tx_dmat)) { device_printf(sc->dev, "Cannot allocate TX DMA tag\n"); return (ENOMEM); } return (0); } int t3_sge_free(struct adapter * sc) { if (sc->tx_dmat != NULL) bus_dma_tag_destroy(sc->tx_dmat); if (sc->rx_jumbo_dmat != NULL) bus_dma_tag_destroy(sc->rx_jumbo_dmat); if (sc->rx_dmat != NULL) bus_dma_tag_destroy(sc->rx_dmat); if (sc->parent_dmat != NULL) bus_dma_tag_destroy(sc->parent_dmat); return (0); } void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) { qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); qs->rspq.polling = 0 /* p->polling */; } #if !defined(__i386__) && !defined(__amd64__) static void refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct refill_fl_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #endif /** * refill_fl - refill an SGE free-buffer list * @sc: the controller softc * @q: the free-list to refill * @n: the number of new buffers to allocate * * (Re)populate an SGE free-buffer list with up to @n new packet buffers. * The caller must assure that @n does not exceed the queue's capacity. */ static void refill_fl(adapter_t *sc, struct sge_fl *q, int n) { struct rx_sw_desc *sd = &q->sdesc[q->pidx]; struct rx_desc *d = &q->desc[q->pidx]; struct refill_fl_cb_arg cb_arg; struct mbuf *m; caddr_t cl; int err; cb_arg.error = 0; while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. */ if (q->zone == zone_pack) { if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL) break; cl = m->m_ext.ext_buf; } else { if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL) break; if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { uma_zfree(q->zone, cl); break; } } if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) { if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); uma_zfree(q->zone, cl); goto done; } sd->flags |= RX_SW_DESC_MAP_CREATED; } #if !defined(__i386__) && !defined(__amd64__) err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size, refill_fl_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { if (q->zone == zone_pack) uma_zfree(q->zone, cl); m_free(m); goto done; } #else cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl); #endif sd->flags |= RX_SW_DESC_INUSE; sd->rxsd_cl = cl; sd->m = m; d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff); d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff); d->len_gen = htobe32(V_FLD_GEN1(q->gen)); d->gen2 = htobe32(V_FLD_GEN2(q->gen)); d++; sd++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; sd = q->sdesc; d = q->desc; } q->credits++; q->db_pending++; } done: if (q->db_pending >= 32) { q->db_pending = 0; t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } } /** * free_rx_bufs - free the Rx buffers on an SGE free list * @sc: the controle softc * @q: the SGE free list to clean up * * Release the buffers on an SGE free-buffer Rx queue. HW fetching from * this queue should be stopped before calling this function. 
*/ static void free_rx_bufs(adapter_t *sc, struct sge_fl *q) { u_int cidx = q->cidx; while (q->credits--) { struct rx_sw_desc *d = &q->sdesc[cidx]; if (d->flags & RX_SW_DESC_INUSE) { bus_dmamap_unload(q->entry_tag, d->map); bus_dmamap_destroy(q->entry_tag, d->map); if (q->zone == zone_pack) { m_init(d->m, zone_pack, MCLBYTES, M_NOWAIT, MT_DATA, M_EXT); uma_zfree(zone_pack, d->m); } else { m_init(d->m, zone_mbuf, MLEN, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, d->m); uma_zfree(q->zone, d->rxsd_cl); } } d->rxsd_cl = NULL; d->m = NULL; if (++cidx == q->size) cidx = 0; } } static __inline void __refill_fl(adapter_t *adap, struct sge_fl *fl) { refill_fl(adap, fl, min(16U, fl->size - fl->credits)); } static __inline void __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max) { uint32_t reclaimable = fl->size - fl->credits; if (reclaimable > 0) refill_fl(adap, fl, min(max, reclaimable)); } /** * recycle_rx_buf - recycle a receive buffer * @adapter: the adapter * @q: the SGE free list * @idx: index of buffer to recycle * * Recycles the specified buffer on the given free list by adding it at * the next available slot on the list. */ static void recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx) { struct rx_desc *from = &q->desc[idx]; struct rx_desc *to = &q->desc[q->pidx]; q->sdesc[q->pidx] = q->sdesc[idx]; to->addr_lo = from->addr_lo; // already big endian to->addr_hi = from->addr_hi; // likewise wmb(); /* necessary ? */ to->len_gen = htobe32(V_FLD_GEN1(q->gen)); to->gen2 = htobe32(V_FLD_GEN2(q->gen)); q->credits++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; } t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } static void alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { uint32_t *addr; addr = arg; *addr = segs[0].ds_addr; } static int alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size, bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag) { size_t len = nelem * elem_size; void *s = NULL; void *p = NULL; int err; if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor tag\n"); return (ENOMEM); } if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT, map)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor memory\n"); return (ENOMEM); } bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0); bzero(p, len); *(void **)desc = p; if (sw_size) { len = nelem * sw_size; s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); *(void **)sdesc = s; } if (parent_entry_tag == NULL) return (0); if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, entry_tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor entry tag\n"); return (ENOMEM); } return (0); } static void sge_slow_intr_handler(void *arg, int ncount) { adapter_t *sc = arg; t3_slow_intr_handler(sc); t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask); (void) t3_read_reg(sc, A_PL_INT_ENABLE0); } /** * sge_timer_cb - perform periodic maintenance of an SGE qset * @data: the SGE queue set to maintain * * Runs periodically from a timer to perform maintenance of an SGE queue * set. It performs two tasks: * * a) Cleans up any completed Tx descriptors that may still be pending. 
* Normal descriptor cleanup happens when new packets are added to a Tx * queue so this timer is relatively infrequent and does any cleanup only * if the Tx queue has not seen any new packets in a while. We make a * best effort attempt to reclaim descriptors, in that we don't wait * around if we cannot get a queue's lock (which most likely is because * someone else is queueing new packets and so will also handle the clean * up). Since control queues use immediate data exclusively we don't * bother cleaning them up here. * * b) Replenishes Rx queues that have run out due to memory shortage. * Normally new Rx buffers are added when existing ones are consumed but * when out of memory a queue can become empty. We try to add only a few * buffers here, the queue will be replenished fully as these new buffers * are used up if memory shortage has subsided. * * c) Return coalesced response queue credits in case a response queue is * starved. * * d) Ring doorbells for T304 tunnel queues since we have seen doorbell * fifo overflows and the FW doesn't implement any recovery scheme yet. */ static void sge_timer_cb(void *arg) { adapter_t *sc = arg; if ((sc->flags & USING_MSIX) == 0) { struct port_info *pi; struct sge_qset *qs; struct sge_txq *txq; int i, j; int reclaim_ofl, refill_rx; if (sc->open_device_map == 0) return; for (i = 0; i < sc->params.nports; i++) { pi = &sc->port[i]; for (j = 0; j < pi->nqsets; j++) { qs = &sc->sge.qs[pi->first_qset + j]; txq = &qs->txq[0]; reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned; refill_rx = ((qs->fl[0].credits < qs->fl[0].size) || (qs->fl[1].credits < qs->fl[1].size)); if (reclaim_ofl || refill_rx) { taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task); break; } } } } if (sc->params.nports > 2) { int i; for_each_port(sc, i) { struct port_info *pi = &sc->port[i]; t3_write_reg(sc, A_SG_KDOORBELL, F_SELEGRCNTX | (FW_TUNNEL_SGEEC_START + pi->first_qset)); } } if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) && sc->open_device_map != 0) callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); } /* * This is meant to be a catch-all function to keep sge state private * to sge.c * */ int t3_sge_init_adapter(adapter_t *sc) { callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE); callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc); return (0); } int t3_sge_reset_adapter(adapter_t *sc) { callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); return (0); } int t3_sge_init_port(struct port_info *pi) { TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi); return (0); } /** * refill_rspq - replenish an SGE response queue * @adapter: the adapter * @q: the response queue to replenish * @credits: how many new responses to make available * * Replenishes a response queue by making the supplied number of responses * available to HW. */ static __inline void refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits) { /* mbufs are allocated on demand when a rspq entry is processed. 
*/ t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN, V_RSPQ(q->cntxt_id) | V_CREDITS(credits)); } static void sge_txq_reclaim_handler(void *arg, int ncount) { struct sge_qset *qs = arg; int i; for (i = 0; i < 3; i++) reclaim_completed_tx(qs, 16, i); } static void sge_timer_reclaim(void *arg, int ncount) { struct port_info *pi = arg; int i, nqsets = pi->nqsets; adapter_t *sc = pi->adapter; struct sge_qset *qs; struct mtx *lock; KASSERT((sc->flags & USING_MSIX) == 0, ("can't call timer reclaim for msi-x")); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[pi->first_qset + i]; reclaim_completed_tx(qs, 16, TXQ_OFLD); lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; if (mtx_trylock(lock)) { /* XXX currently assume that we are *NOT* polling */ uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS); if (qs->fl[0].credits < qs->fl[0].size - 16) __refill_fl(sc, &qs->fl[0]); if (qs->fl[1].credits < qs->fl[1].size - 16) __refill_fl(sc, &qs->fl[1]); if (status & (1 << qs->rspq.cntxt_id)) { if (qs->rspq.credits) { refill_rspq(sc, &qs->rspq, 1); qs->rspq.credits--; t3_write_reg(sc, A_SG_RSPQ_FL_STATUS, 1 << qs->rspq.cntxt_id); } } mtx_unlock(lock); } } } /** * init_qset_cntxt - initialize an SGE queue set context info * @qs: the queue set * @id: the queue set id * * Initializes the TIDs and context ids for the queues of a queue set. */ static void init_qset_cntxt(struct sge_qset *qs, u_int id) { qs->rspq.cntxt_id = id; qs->fl[0].cntxt_id = 2 * id; qs->fl[1].cntxt_id = 2 * id + 1; qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id; qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id; qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; - mbufq_init(&qs->txq[TXQ_ETH].sendq); - mbufq_init(&qs->txq[TXQ_OFLD].sendq); - mbufq_init(&qs->txq[TXQ_CTRL].sendq); + /* XXX: a sane limit is needed instead of INT_MAX */ + mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX); + mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX); + mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX); } static void txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs) { txq->in_use += ndesc; /* * XXX we don't handle stopping of queue * presumably start handles this when we bump against the end */ txqs->gen = txq->gen; txq->unacked += ndesc; txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5); txq->unacked &= 31; txqs->pidx = txq->pidx; txq->pidx += ndesc; #ifdef INVARIANTS if (((txqs->pidx > txq->cidx) && (txq->pidx < txqs->pidx) && (txq->pidx >= txq->cidx)) || ((txqs->pidx < txq->cidx) && (txq->pidx >= txq-> cidx)) || ((txqs->pidx < txq->cidx) && (txq->cidx < txqs->pidx))) panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d", txqs->pidx, txq->pidx, txq->cidx); #endif if (txq->pidx >= txq->size) { txq->pidx -= txq->size; txq->gen ^= 1; } } /** * calc_tx_descs - calculate the number of Tx descriptors for a packet * @m: the packet mbufs * @nsegs: the number of segments * * Returns the number of Tx descriptors needed for the given Ethernet * packet. Ethernet packets require addition of WR and CPL headers. 
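 *
 *	Editorial worked example, not part of the original comment: a packet
 *	whose payload maps to nsegs = 5 DMA segments needs
 *	sgl_len(5) = (3 * 5) / 2 + (5 & 1) = 8 flits of SGL, plus 2 flits for
 *	the WR and CPL headers (one more if CSUM_TSO is set), i.e. 10 or 11
 *	flits in total; flits_to_desc() then maps that to
 *	1 + (flits - 2) / (WR_FLITS - 1) descriptors via flit_desc_map[].
 *	Packets no longer than PIO_LEN skip the SGL entirely and always fit
 *	in a single descriptor, because their payload is copied inline.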
*/ static __inline unsigned int calc_tx_descs(const struct mbuf *m, int nsegs) { unsigned int flits; if (m->m_pkthdr.len <= PIO_LEN) return 1; flits = sgl_len(nsegs) + 2; if (m->m_pkthdr.csum_flags & CSUM_TSO) flits++; return flits_to_desc(flits); } /** * make_sgl - populate a scatter/gather list for a packet * @sgp: the SGL to populate * @segs: the packet dma segments * @nsegs: the number of segments * * Generates a scatter/gather list for the buffers that make up a packet * and returns the SGL size in 8-byte words. The caller must size the SGL * appropriately. */ static __inline void make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs) { int i, idx; for (idx = 0, i = 0; i < nsegs; i++) { /* * firmware doesn't like empty segments */ if (segs[i].ds_len == 0) continue; if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ds_len); sgp->addr[idx] = htobe64(segs[i].ds_addr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } } /** * check_ring_tx_db - check and potentially ring a Tx queue's doorbell * @adap: the adapter * @q: the Tx queue * * Ring the doorbell if a Tx queue is asleep. There is a natural race, * where the HW is going to sleep just after we checked, however, * then the interrupt handler will detect the outstanding TX packet * and ring the doorbell for us. * * When GTS is disabled we unconditionally ring the doorbell. */ static __inline void check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring) { #if USE_GTS clear_bit(TXQ_LAST_PKT_DB, &q->flags); if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) { set_bit(TXQ_LAST_PKT_DB, &q->flags); #ifdef T3_TRACE T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d", q->cntxt_id); #endif t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } #else if (mustring || ++q->db_pending >= 32) { wmb(); /* write descriptors before telling HW */ t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); q->db_pending = 0; } #endif } static __inline void wr_gen2(struct tx_desc *d, unsigned int gen) { #if SGE_NUM_GENBITS == 2 d->flit[TX_DESC_FLITS - 1] = htobe64(gen); #endif } /** * write_wr_hdr_sgl - write a WR header and, optionally, SGL * @ndesc: number of Tx descriptors spanned by the SGL * @txd: first Tx descriptor to be written * @txqs: txq state (generation and producer index) * @txq: the SGE Tx queue * @sgl: the SGL * @flits: number of flits to the start of the SGL in the first descriptor * @sgl_flits: the SGL size in flits * @wr_hi: top 32 bits of WR header based on WR type (big endian) * @wr_lo: low 32 bits of WR header based on WR type (big endian) * * Write a work request header and an associated SGL. If the SGL is * small enough to fit into one Tx descriptor it has already been written * and we just need to write the WR header. Otherwise we distribute the * SGL across the number of descriptors it spans. 
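 *
 *	Editorial note, not part of the original comment: make_sgl() above
 *	packs two length/address pairs into each struct sg_ent and skips
 *	zero-length segments, which the firmware does not like per the
 *	in-line comment there.  For example, segments of 1500, 0, 800 and
 *	60 bytes become sg_ent[0] = {1500, 800} and sg_ent[1] = {60, 0},
 *	with the unused second slot of the last entry zeroed out as the
 *	terminator.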
*/ static void write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs, const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits, unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo) { struct work_request_hdr *wrp = (struct work_request_hdr *)txd; struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx]; if (__predict_true(ndesc == 1)) { set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi, htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) | wr_lo); wr_gen2(txd, txqs->gen); } else { unsigned int ogen = txqs->gen; const uint64_t *fp = (const uint64_t *)sgl; struct work_request_hdr *wp = wrp; wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi; while (sgl_flits) { unsigned int avail = WR_FLITS - flits; if (avail > sgl_flits) avail = sgl_flits; memcpy(&txd->flit[flits], fp, avail * sizeof(*fp)); sgl_flits -= avail; ndesc--; if (!sgl_flits) break; fp += avail; txd++; txsd++; if (++txqs->pidx == txq->size) { txqs->pidx = 0; txqs->gen ^= 1; txd = txq->desc; txsd = txq->sdesc; } /* * when the head of the mbuf chain * is freed all clusters will be freed * with it */ wrp = (struct work_request_hdr *)txd; wrp->wrh_hi = htonl(V_WR_DATATYPE(1) | V_WR_SGLSFLT(1)) | wr_hi; wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS, sgl_flits + 1)) | V_WR_GEN(txqs->gen)) | wr_lo; wr_gen2(txd, txqs->gen); flits = 1; } wrp->wrh_hi |= htonl(F_WR_EOP); wmb(); wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo; wr_gen2((struct tx_desc *)wp, ogen); } } /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */ #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20) #define GET_VTAG(cntrl, m) \ do { \ if ((m)->m_flags & M_VLANTAG) \ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \ } while (0) static int t3_encap(struct sge_qset *qs, struct mbuf **m) { adapter_t *sc; struct mbuf *m0; struct sge_txq *txq; struct txq_state txqs; struct port_info *pi; unsigned int ndesc, flits, cntrl, mlen; int err, nsegs, tso_info = 0; struct work_request_hdr *wrp; struct tx_sw_desc *txsd; struct sg_ent *sgp, *sgl; uint32_t wr_hi, wr_lo, sgl_flits; bus_dma_segment_t segs[TX_MAX_SEGS]; struct tx_desc *txd; pi = qs->port; sc = pi->adapter; txq = &qs->txq[TXQ_ETH]; txd = &txq->desc[txq->pidx]; txsd = &txq->sdesc[txq->pidx]; sgl = txq->txq_sgl; prefetch(txd); m0 = *m; mtx_assert(&qs->lock, MA_OWNED); cntrl = V_TXPKT_INTF(pi->txpkt_intf); KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n")); if (m0->m_nextpkt == NULL && m0->m_next != NULL && m0->m_pkthdr.csum_flags & (CSUM_TSO)) tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz); if (m0->m_nextpkt != NULL) { busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs); ndesc = 1; mlen = 0; } else { if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map, &m0, segs, &nsegs))) { if (cxgb_debug) printf("failed ... 
err=%d\n", err); return (err); } mlen = m0->m_pkthdr.len; ndesc = calc_tx_descs(m0, nsegs); } txq_prod(txq, ndesc, &txqs); KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs)); txsd->m = m0; if (m0->m_nextpkt != NULL) { struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd; int i, fidx; if (nsegs > 7) panic("trying to coalesce %d packets in to one WR", nsegs); txq->txq_coalesced += nsegs; wrp = (struct work_request_hdr *)txd; flits = nsegs*2 + 1; for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) { struct cpl_tx_pkt_batch_entry *cbe; uint64_t flit; uint32_t *hflit = (uint32_t *)&flit; int cflags = m0->m_pkthdr.csum_flags; cntrl = V_TXPKT_INTF(pi->txpkt_intf); GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(cflags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; hflit[0] = htonl(cntrl); hflit[1] = htonl(segs[i].ds_len | 0x80000000); flit |= htobe64(1 << 24); cbe = &cpl_batch->pkt_entry[i]; cbe->cntrl = hflit[0]; cbe->len = hflit[1]; cbe->addr = htobe64(segs[i].ds_addr); } wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token)); set_wr_hdr(wrp, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); return (0); } else if (tso_info) { uint16_t eth_type; struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd; struct ether_header *eh; void *l3hdr; struct tcphdr *tcp; txd->flit[2] = 0; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); hdr->len = htonl(mlen | 0x80000000); if (__predict_false(mlen < TCPPKTHDRSIZE)) { printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x", m0, mlen, m0->m_pkthdr.tso_segsz, (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags); panic("tx tso packet too small"); } /* Make sure that ether, ip, tcp headers are all in m0 */ if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) { m0 = m_pullup(m0, TCPPKTHDRSIZE); if (__predict_false(m0 == NULL)) { /* XXX panic probably an overreaction */ panic("couldn't fit header into mbuf"); } } eh = mtod(m0, struct ether_header *); eth_type = eh->ether_type; if (eth_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = (void *)eh; tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN); l3hdr = evh + 1; eth_type = evh->evl_proto; } else { tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II); l3hdr = eh + 1; } if (eth_type == htons(ETHERTYPE_IP)) { struct ip *ip = l3hdr; tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl); tcp = (struct tcphdr *)(ip + 1); } else if (eth_type == htons(ETHERTYPE_IPV6)) { struct ip6_hdr *ip6 = l3hdr; KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO with ip6_nxt %d", __func__, ip6->ip6_nxt)); tso_info |= F_LSO_IPV6; tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2); tcp = (struct tcphdr *)(ip6 + 1); } else panic("%s: CSUM_TSO but neither ip nor ip6", __func__); tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off); hdr->lso_info = htonl(tso_info); if (__predict_false(mlen <= PIO_LEN)) { /* * pkt not undersized but fits in PIO_LEN * Indicates a TSO bug at the higher levels. 
*/ txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]); flits = (mlen + 7) / 8 + 3; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&hdr->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 3; } else { struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; cpl->cntrl = htonl(cntrl); cpl->len = htonl(mlen | 0x80000000); if (mlen <= PIO_LEN) { txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]); flits = (mlen + 7) / 8 + 2; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&cpl->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 2; } wrp = (struct work_request_hdr *)txd; sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl; make_sgl(sgp, segs, nsegs); sgl_flits = sgl_len(nsegs); ETHER_BPF_MTAP(pi->ifp, m0); KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc)); wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_TID(txq->token)); write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo); check_ring_tx_db(sc, txq, 0); return (0); } void cxgb_tx_watchdog(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs)) qs->coalescing = 0; else if (qs->coalescing == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_FLUSHING; cxgb_start_locked(qs); qs->qs_flags &= ~QS_FLUSHING; TXQ_UNLOCK(qs); } if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog, qs, txq->txq_watchdog.c_cpu); } static void cxgb_tx_timeout(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3))) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_TIMEOUT; cxgb_start_locked(qs); qs->qs_flags &= ~QS_TIMEOUT; TXQ_UNLOCK(qs); } } static void cxgb_start_locked(struct sge_qset *qs) { struct mbuf *m_head = NULL; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct port_info *pi = qs->port; struct ifnet *ifp = pi->ifp; if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT)) reclaim_completed_tx(qs, 0, TXQ_ETH); if (!pi->link_config.link_ok) { TXQ_RING_FLUSH(qs); return; } TXQ_LOCK_ASSERT(qs); while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && pi->link_config.link_ok) { reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (txq->size - txq->in_use <= TX_MAX_DESC) break; if ((m_head = cxgb_dequeue(qs)) == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. 
*/ if (t3_encap(qs, &m_head) || m_head == NULL) break; m_head = NULL; } if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 && pi->link_config.link_ok) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); if (m_head != NULL) m_freem(m_head); } static int cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m) { struct port_info *pi = qs->port; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct buf_ring *br = txq->txq_mr; int error, avail; avail = txq->size - txq->in_use; TXQ_LOCK_ASSERT(qs); /* * We can only do a direct transmit if the following are true: * - we aren't coalescing (ring < 3/4 full) * - the link is up -- checked in caller * - there are no packets enqueued already * - there is space in hardware transmit queue */ if (check_pkt_coalesce(qs) == 0 && !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) { if (t3_encap(qs, &m)) { if (m != NULL && (error = drbr_enqueue(ifp, br, m)) != 0) return (error); } else { if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); /* * We've bypassed the buf ring so we need to update * the stats directly */ txq->txq_direct_packets++; txq->txq_direct_bytes += m->m_pkthdr.len; } } else if ((error = drbr_enqueue(ifp, br, m)) != 0) return (error); reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok && (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7))) cxgb_start_locked(qs); else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer)) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); return (0); } int cxgb_transmit(struct ifnet *ifp, struct mbuf *m) { struct sge_qset *qs; struct port_info *pi = ifp->if_softc; int error, qidx = pi->first_qset; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||(!pi->link_config.link_ok)) { m_freem(m); return (0); } /* check if flowid is set */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset; qs = &pi->adapter->sge.qs[qidx]; if (TXQ_TRYLOCK(qs)) { /* XXX running */ error = cxgb_transmit_locked(ifp, qs, m); TXQ_UNLOCK(qs); } else error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m); return (error); } void cxgb_qflush(struct ifnet *ifp) { /* * flush any enqueued mbufs in the buf_rings * and in the transmit queues * no-op for now */ return; } /** * write_imm - write a packet into a Tx descriptor as immediate data * @d: the Tx descriptor to write * @m: the packet * @len: the length of packet data to write as immediate data * @gen: the generation bit value to write * * Writes a packet as immediate data into a Tx descriptor. The packet * contains a work request at its beginning. We must write the packet * carefully so the SGE doesn't read accidentally before it's written in * its entirety. 
*/ static __inline void write_imm(struct tx_desc *d, caddr_t src, unsigned int len, unsigned int gen) { struct work_request_hdr *from = (struct work_request_hdr *)src; struct work_request_hdr *to = (struct work_request_hdr *)d; uint32_t wr_hi, wr_lo; KASSERT(len <= WR_LEN && len >= sizeof(*from), ("%s: invalid len %d", __func__, len)); memcpy(&to[1], &from[1], len - sizeof(*from)); wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP | V_WR_BCNTLFLT(len & 7)); wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8)); set_wr_hdr(to, wr_hi, wr_lo); wmb(); wr_gen2(d, gen); } /** * check_desc_avail - check descriptor availability on a send queue * @adap: the adapter * @q: the TX queue * @m: the packet needing the descriptors * @ndesc: the number of Tx descriptors needed * @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL) * * Checks if the requested number of Tx descriptors is available on an * SGE send queue. If the queue is already suspended or not enough * descriptors are available the packet is queued for later transmission. * Must be called with the Tx queue locked. * * Returns 0 if enough descriptors are available, 1 if there aren't * enough descriptors and the packet has been queued, and 2 if the caller * needs to retry because there weren't enough descriptors at the * beginning of the call but some freed up in the mean time. */ static __inline int check_desc_avail(adapter_t *adap, struct sge_txq *q, struct mbuf *m, unsigned int ndesc, unsigned int qid) { /* * XXX We currently only use this for checking the control queue * the control queue is only used for binding qsets which happens * at init time so we are guaranteed enough descriptors */ - if (__predict_false(!mbufq_empty(&q->sendq))) { -addq_exit: mbufq_tail(&q->sendq, m); + if (__predict_false(mbufq_len(&q->sendq))) { +addq_exit: (void )mbufq_enqueue(&q->sendq, m); return 1; } if (__predict_false(q->size - q->in_use < ndesc)) { struct sge_qset *qs = txq_to_qset(q, qid); setbit(&qs->txq_stopped, qid); if (should_restart_tx(q) && test_and_clear_bit(qid, &qs->txq_stopped)) return 2; q->stops++; goto addq_exit; } return 0; } /** * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs * @q: the SGE control Tx queue * * This is a variant of reclaim_completed_tx() that is used for Tx queues * that send only immediate data (presently just the control queues) and * thus do not have any mbufs */ static __inline void reclaim_completed_tx_imm(struct sge_txq *q) { unsigned int reclaim = q->processed - q->cleaned; q->in_use -= reclaim; q->cleaned += reclaim; } /** * ctrl_xmit - send a packet through an SGE control Tx queue * @adap: the adapter * @q: the control queue * @m: the packet * * Send a packet through an SGE control Tx queue. Packets sent through * a control queue must fit entirely as immediate data in a single Tx * descriptor and have no page fragments. 
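 *
 * Note: as of this change the per-queue sendq used by check_desc_avail()
 * above is the generic mbufq from <sys/mbuf.h> rather than the old
 * driver-private mbuf_head, so deferred packets are queued with
 * mbufq_enqueue() and drained with mbufq_first()/mbufq_dequeue().  The
 * sendqs are initialized with a maximum length of INT_MAX, so enqueue
 * effectively cannot fail here.  A minimal usage sketch of the generic
 * API (illustrative only, not driver code):
 *
 *	struct mbufq q;
 *	struct mbuf *m;
 *
 *	mbufq_init(&q, INT_MAX);
 *	m = m_gethdr(M_NOWAIT, MT_DATA);
 *	if (m != NULL && mbufq_enqueue(&q, m) != 0)
 *		m_freem(m);
 *	while ((m = mbufq_dequeue(&q)) != NULL)
 *		m_freem(m);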
*/ static int ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *); struct sge_txq *q = &qs->txq[TXQ_CTRL]; KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__)); wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP); wrp->wrh_lo = htonl(V_WR_TID(q->token)); TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (ENOSPC); } goto again; } write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); q->in_use++; if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); m_free(m); return (0); } /** * restart_ctrlq - restart a suspended control queue * @qs: the queue set cotaining the control queue * * Resumes transmission on a suspended Tx control queue. */ static void restart_ctrlq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = (struct sge_qset *)data; struct sge_txq *q = &qs->txq[TXQ_CTRL]; adapter_t *adap = qs->port->adapter; TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); while (q->in_use < q->size && (m = mbufq_dequeue(&q->sendq)) != NULL) { write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); m_free(m); if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } q->in_use++; } - if (!mbufq_empty(&q->sendq)) { + if (mbufq_len(&q->sendq)) { setbit(&qs->txq_stopped, TXQ_CTRL); if (should_restart_tx(q) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) goto again; q->stops++; } TXQ_UNLOCK(qs); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /* * Send a management message through control queue 0 */ int t3_mgmt_tx(struct adapter *adap, struct mbuf *m) { return ctrl_xmit(adap, &adap->sge.qs[0], m); } /** * free_qset - free the resources of an SGE queue set * @sc: the controller owning the queue set * @q: the queue set * * Release the HW and SW resources associated with an SGE queue set, such * as HW contexts, packet buffers, and descriptor rings. Traffic to the * queue set must be quiesced prior to calling this. 
*/ static void t3_free_qset(adapter_t *sc, struct sge_qset *q) { int i; reclaim_completed_tx(q, 0, TXQ_ETH); if (q->txq[TXQ_ETH].txq_mr != NULL) buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF); if (q->txq[TXQ_ETH].txq_ifq != NULL) { ifq_delete(q->txq[TXQ_ETH].txq_ifq); free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF); } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { if (q->fl[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_fl(sc, q->fl[i].cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map); bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc, q->fl[i].desc_map); bus_dma_tag_destroy(q->fl[i].desc_tag); bus_dma_tag_destroy(q->fl[i].entry_tag); } if (q->fl[i].sdesc) { free_rx_bufs(sc, &q->fl[i]); free(q->fl[i].sdesc, M_DEVBUF); } } mtx_unlock(&q->lock); MTX_DESTROY(&q->lock); for (i = 0; i < SGE_TXQ_PER_SET; i++) { if (q->txq[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->txq[i].desc_tag, q->txq[i].desc_map); bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc, q->txq[i].desc_map); bus_dma_tag_destroy(q->txq[i].desc_tag); bus_dma_tag_destroy(q->txq[i].entry_tag); } if (q->txq[i].sdesc) { free(q->txq[i].sdesc, M_DEVBUF); } } if (q->rspq.desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map); bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc, q->rspq.desc_map); bus_dma_tag_destroy(q->rspq.desc_tag); MTX_DESTROY(&q->rspq.lock); } #if defined(INET6) || defined(INET) tcp_lro_free(&q->lro.ctrl); #endif bzero(q, sizeof(*q)); } /** * t3_free_sge_resources - free SGE resources * @sc: the adapter softc * * Frees resources used by the SGE queue sets. */ void t3_free_sge_resources(adapter_t *sc, int nqsets) { int i; for (i = 0; i < nqsets; ++i) { TXQ_LOCK(&sc->sge.qs[i]); t3_free_qset(sc, &sc->sge.qs[i]); } } /** * t3_sge_start - enable SGE * @sc: the controller softc * * Enables the SGE for DMAs. This is the last step in starting packet * transfers. */ void t3_sge_start(adapter_t *sc) { t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE); } /** * t3_sge_stop - disable SGE operation * @sc: the adapter * * Disables the DMA engine. This can be called in emeregencies (e.g., * from error interrupts) or from normal process context. In the latter * case it also disables any pending queue restart tasklets. Note that * if it is called in interrupt context it cannot disable the restart * tasklets as it cannot wait, however the tasklets will have no effect * since the doorbells are disabled and the driver will call this again * later from process context, at which time the tasklets will be stopped * if they are still running. 
*/ void t3_sge_stop(adapter_t *sc) { int i, nqsets; t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0); if (sc->tq == NULL) return; for (nqsets = i = 0; i < (sc)->params.nports; i++) nqsets += sc->port[i].nqsets; #ifdef notyet /* * * XXX */ for (i = 0; i < nqsets; ++i) { struct sge_qset *qs = &sc->sge.qs[i]; taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } #endif } /** * t3_free_tx_desc - reclaims Tx descriptors and their buffers * @adapter: the adapter * @q: the Tx queue to reclaim descriptors from * @reclaimable: the number of descriptors to reclaim * @m_vec_size: maximum number of buffers to reclaim * @desc_reclaimed: returns the number of descriptors reclaimed * * Reclaims Tx descriptors from an SGE Tx queue and frees the associated * Tx buffers. Called with the Tx queue lock held. * * Returns number of buffers of reclaimed */ void t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue) { struct tx_sw_desc *txsd; unsigned int cidx, mask; struct sge_txq *q = &qs->txq[queue]; #ifdef T3_TRACE T3_TRACE2(sc->tb[q->cntxt_id & 7], "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx); #endif cidx = q->cidx; mask = q->size - 1; txsd = &q->sdesc[cidx]; mtx_assert(&qs->lock, MA_OWNED); while (reclaimable--) { prefetch(q->sdesc[(cidx + 1) & mask].m); prefetch(q->sdesc[(cidx + 2) & mask].m); if (txsd->m != NULL) { if (txsd->flags & TX_SW_DESC_MAPPED) { bus_dmamap_unload(q->entry_tag, txsd->map); txsd->flags &= ~TX_SW_DESC_MAPPED; } m_freem_list(txsd->m); txsd->m = NULL; } else q->txq_skipped++; ++txsd; if (++cidx == q->size) { cidx = 0; txsd = q->sdesc; } } q->cidx = cidx; } /** * is_new_response - check if a response is newly written * @r: the response descriptor * @q: the response queue * * Returns true if a response descriptor contains a yet unprocessed * response. */ static __inline int is_new_response(const struct rsp_desc *r, const struct sge_rspq *q) { return (r->intr_gen & F_RSPD_GEN2) == q->gen; } #define RSPD_GTS_MASK (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS) #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \ V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \ V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \ V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR)) /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ #define NOMEM_INTR_DELAY 2500 #ifdef TCP_OFFLOAD /** * write_ofld_wr - write an offload work request * @adap: the adapter * @m: the packet to send * @q: the Tx queue * @pidx: index of the first Tx descriptor to write * @gen: the generation value to use * @ndesc: number of descriptors the packet will occupy * * Write an offload work request to send the supplied packet. The packet * data already carry the work request with most fields populated. */ static void write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q, unsigned int pidx, unsigned int gen, unsigned int ndesc) { unsigned int sgl_flits, flits; int i, idx, nsegs, wrlen; struct work_request_hdr *from; struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1]; struct tx_desc *d = &q->desc[pidx]; struct txq_state txqs; struct sglist_seg *segs; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sglist *sgl; from = (void *)(oh + 1); /* Start of WR within mbuf */ wrlen = m->m_len - sizeof(*oh); if (!(oh->flags & F_HDR_SGL)) { write_imm(d, (caddr_t)from, wrlen, gen); /* * mbuf with "real" immediate tx data will be enqueue_wr'd by * t3_push_frames and freed in wr_ack. Others, like those sent * down by close_conn, t3_send_reset, etc. should be freed here. 
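 * (F_HDR_DF appears to mark mbufs whose data is still owned by the TOE
 * send path, so only mbufs without that flag are freed here.)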
*/ if (!(oh->flags & F_HDR_DF)) m_free(m); return; } memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from)); sgl = oh->sgl; flits = wrlen / 8; sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl; nsegs = sgl->sg_nseg; segs = sgl->sg_segs; for (idx = 0, i = 0; i < nsegs; i++) { KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__)); if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ss_len); sgp->addr[idx] = htobe64(segs[i].ss_paddr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } sgl_flits = sgl_len(nsegs); txqs.gen = gen; txqs.pidx = pidx; txqs.compl = 0; write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits, from->wrh_hi, from->wrh_lo); } /** * ofld_xmit - send a packet through an offload queue * @adap: the adapter * @q: the Tx offload queue * @m: the packet * * Send an offload packet through an SGE offload queue. */ static int ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; unsigned int ndesc; unsigned int pidx, gen; struct sge_txq *q = &qs->txq[TXQ_OFLD]; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); ndesc = G_HDR_NDESC(oh->flags); TXQ_LOCK(qs); again: reclaim_completed_tx(qs, 16, TXQ_OFLD); ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (EINTR); } goto again; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } write_ofld_wr(adap, m, q, pidx, gen, ndesc); check_ring_tx_db(adap, q, 1); TXQ_UNLOCK(qs); return (0); } /** * restart_offloadq - restart a suspended offload queue * @qs: the queue set cotaining the offload queue * * Resumes transmission on a suspended Tx offload queue. */ static void restart_offloadq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = data; struct sge_txq *q = &qs->txq[TXQ_OFLD]; adapter_t *adap = qs->port->adapter; int cleaned; TXQ_LOCK(qs); again: cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD); - while ((m = mbufq_peek(&q->sendq)) != NULL) { + while ((m = mbufq_first(&q->sendq)) != NULL) { unsigned int gen, pidx; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); unsigned int ndesc = G_HDR_NDESC(oh->flags); if (__predict_false(q->size - q->in_use < ndesc)) { setbit(&qs->txq_stopped, TXQ_OFLD); if (should_restart_tx(q) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) goto again; q->stops++; break; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } (void)mbufq_dequeue(&q->sendq); TXQ_UNLOCK(qs); write_ofld_wr(adap, m, q, pidx, gen, ndesc); TXQ_LOCK(qs); } #if USE_GTS set_bit(TXQ_RUNNING, &q->flags); set_bit(TXQ_LAST_PKT_DB, &q->flags); #endif TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /** * t3_offload_tx - send an offload packet * @m: the packet * * Sends an offload packet. We use the packet priority to select the * appropriate Tx queue as follows: bit 0 indicates whether the packet * should be sent as regular or control, bits 1-3 select the queue set. 
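 *
 * (In this version the selection actually comes from the ofld_hdr prepended
 * to the mbuf: F_HDR_CTRL routes the packet to the control queue and
 * G_HDR_QSET() selects the queue set, as the code below shows.)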
*/ int t3_offload_tx(struct adapter *sc, struct mbuf *m) { struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)]; if (oh->flags & F_HDR_CTRL) { m_adj(m, sizeof (*oh)); /* trim ofld_hdr off */ return (ctrl_xmit(sc, qs, m)); } else return (ofld_xmit(sc, qs, m)); } #endif static void restart_tx(struct sge_qset *qs) { struct adapter *sc = qs->port->adapter; if (isset(&qs->txq_stopped, TXQ_OFLD) && should_restart_tx(&qs->txq[TXQ_OFLD]) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { qs->txq[TXQ_OFLD].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); } if (isset(&qs->txq_stopped, TXQ_CTRL) && should_restart_tx(&qs->txq[TXQ_CTRL]) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { qs->txq[TXQ_CTRL].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } } /** * t3_sge_alloc_qset - initialize an SGE queue set * @sc: the controller softc * @id: the queue set id * @nports: how many Ethernet ports will be using this queue set * @irq_vec_idx: the IRQ vector index for response queue interrupts * @p: configuration parameters for this queue set * @ntxq: number of Tx queues for the queue set * @pi: port info for queue set * * Allocate resources and initialize an SGE queue set. A queue set * comprises a response queue, two Rx free-buffer queues, and up to 3 * Tx queues. The Tx queues are assigned roles in the order Ethernet * queue, offload queue, and control queue. */ int t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, const struct qset_params *p, int ntxq, struct port_info *pi) { struct sge_qset *q = &sc->sge.qs[id]; int i, ret = 0; MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF); q->port = pi; q->adap = sc; if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size, M_DEVBUF, M_WAITOK, &q->lock)) == NULL) { device_printf(sc->dev, "failed to allocate mbuf ring\n"); goto err; } if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) { device_printf(sc->dev, "failed to allocate ifq\n"); goto err; } ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp); callout_init(&q->txq[TXQ_ETH].txq_timer, 1); callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1); q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus; q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus; init_qset_cntxt(q, id); q->idx = id; if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[0].phys_addr, &q->fl[0].desc, &q->fl[0].sdesc, &q->fl[0].desc_tag, &q->fl[0].desc_map, sc->rx_dmat, &q->fl[0].entry_tag)) != 0) { printf("error %d from alloc ring fl0\n", ret); goto err; } if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[1].phys_addr, &q->fl[1].desc, &q->fl[1].sdesc, &q->fl[1].desc_tag, &q->fl[1].desc_map, sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) { printf("error %d from alloc ring fl1\n", ret); goto err; } if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0, &q->rspq.phys_addr, &q->rspq.desc, NULL, &q->rspq.desc_tag, &q->rspq.desc_map, NULL, NULL)) != 0) { printf("error %d from alloc ring rspq\n", ret); goto err; } snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d", device_get_unit(sc->dev), irq_vec_idx); MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF); for (i = 0; i < ntxq; ++i) { size_t sz = i == TXQ_CTRL ? 
0 : sizeof(struct tx_sw_desc); if ((ret = alloc_ring(sc, p->txq_size[i], sizeof(struct tx_desc), sz, &q->txq[i].phys_addr, &q->txq[i].desc, &q->txq[i].sdesc, &q->txq[i].desc_tag, &q->txq[i].desc_map, sc->tx_dmat, &q->txq[i].entry_tag)) != 0) { printf("error %d from alloc ring tx %i\n", ret, i); goto err; } - mbufq_init(&q->txq[i].sendq); + mbufq_init(&q->txq[i].sendq, INT_MAX); q->txq[i].gen = 1; q->txq[i].size = p->txq_size[i]; } #ifdef TCP_OFFLOAD TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q); #endif TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q); TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q); TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q); q->fl[0].gen = q->fl[1].gen = 1; q->fl[0].size = p->fl_size; q->fl[1].size = p->jumbo_size; q->rspq.gen = 1; q->rspq.cidx = 0; q->rspq.size = p->rspq_size; q->txq[TXQ_ETH].stop_thres = nports * flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3); q->fl[0].buf_size = MCLBYTES; q->fl[0].zone = zone_pack; q->fl[0].type = EXT_PACKET; if (p->jumbo_buf_size == MJUM16BYTES) { q->fl[1].zone = zone_jumbo16; q->fl[1].type = EXT_JUMBO16; } else if (p->jumbo_buf_size == MJUM9BYTES) { q->fl[1].zone = zone_jumbo9; q->fl[1].type = EXT_JUMBO9; } else if (p->jumbo_buf_size == MJUMPAGESIZE) { q->fl[1].zone = zone_jumbop; q->fl[1].type = EXT_JUMBOP; } else { KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size)); ret = EDOOFUS; goto err; } q->fl[1].buf_size = p->jumbo_buf_size; /* Allocate and setup the lro_ctrl structure */ q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO); #if defined(INET6) || defined(INET) ret = tcp_lro_init(&q->lro.ctrl); if (ret) { printf("error %d from tcp_lro_init\n", ret); goto err; } #endif q->lro.ctrl.ifp = pi->ifp; mtx_lock_spin(&sc->sge.reg_lock); ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx, q->rspq.phys_addr, q->rspq.size, q->fl[0].buf_size, 1, 0); if (ret) { printf("error %d from t3_sge_init_rspcntxt\n", ret); goto err_unlock; } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0, q->fl[i].phys_addr, q->fl[i].size, q->fl[i].buf_size, p->cong_thres, 1, 0); if (ret) { printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i); goto err_unlock; } } ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS, SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr, q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } if (ntxq > 1) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id, USE_GTS, SGE_CNTXT_OFLD, id, q->txq[TXQ_OFLD].phys_addr, q->txq[TXQ_OFLD].size, 0, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } if (ntxq > 2) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0, SGE_CNTXT_CTRL, id, q->txq[TXQ_CTRL].phys_addr, q->txq[TXQ_CTRL].size, q->txq[TXQ_CTRL].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } mtx_unlock_spin(&sc->sge.reg_lock); t3_update_qset_coalesce(q, p); refill_fl(sc, &q->fl[0], q->fl[0].size); refill_fl(sc, &q->fl[1], q->fl[1].size); refill_rspq(sc, &q->rspq, q->rspq.size - 1); t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) | V_NEWTIMER(q->rspq.holdoff_tmr)); return (0); err_unlock: mtx_unlock_spin(&sc->sge.reg_lock); err: TXQ_LOCK(q); t3_free_qset(sc, q); return (ret); } /* * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with * ethernet data. 
Hardware assistance with various checksums and any vlan tag * will also be taken into account here. */ void t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad) { struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad); struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]]; struct ifnet *ifp = pi->ifp; if (cpl->vlan_valid) { m->m_pkthdr.ether_vtag = ntohs(cpl->vlan); m->m_flags |= M_VLANTAG; } m->m_pkthdr.rcvif = ifp; /* * adjust after conversion to mbuf chain */ m->m_pkthdr.len -= (sizeof(*cpl) + ethpad); m->m_len -= (sizeof(*cpl) + ethpad); m->m_data += (sizeof(*cpl) + ethpad); if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) { struct ether_header *eh = mtod(m, void *); uint16_t eh_type; if (eh->ether_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = mtod(m, void *); eh_type = evh->evl_proto; } else eh_type = eh->ether_type; if (ifp->if_capenable & IFCAP_RXCSUM && eh_type == htons(ETHERTYPE_IP)) { m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && eh_type == htons(ETHERTYPE_IPV6)) { m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } } /** * get_packet - return the next ingress packet buffer from a free list * @adap: the adapter that received the packet * @drop_thres: # of remaining buffers before we start dropping packets * @qs: the qset that the SGE free list holding the packet belongs to * @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain * @r: response descriptor * * Get the next packet from a free list and complete setup of the * sk_buff. If the packet is small we make a copy and recycle the * original buffer, otherwise we use the original buffer itself. If a * positive drop threshold is supplied packets are dropped and their * buffers recycled if (a) the number of remaining buffers is under the * threshold and the packet is too big to copy, or (b) the packet should * be copied but there is no memory for the copy. */ static int get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, struct t3_mbuf_hdr *mh, struct rsp_desc *r) { unsigned int len_cq = ntohl(r->len_cq); struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? 
&qs->fl[1] : &qs->fl[0]; int mask, cidx = fl->cidx; struct rx_sw_desc *sd = &fl->sdesc[cidx]; uint32_t len = G_RSPD_LEN(len_cq); uint32_t flags = M_EXT; uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags)); caddr_t cl; struct mbuf *m; int ret = 0; mask = fl->size - 1; prefetch(fl->sdesc[(cidx + 1) & mask].m); prefetch(fl->sdesc[(cidx + 2) & mask].m); prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl); prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl); fl->credits--; bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD); if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) { if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) goto skip_recycle; cl = mtod(m, void *); memcpy(cl, sd->rxsd_cl, len); recycle_rx_buf(adap, fl, fl->cidx); m->m_pkthdr.len = m->m_len = len; m->m_flags = 0; mh->mh_head = mh->mh_tail = m; ret = 1; goto done; } else { skip_recycle: bus_dmamap_unload(fl->entry_tag, sd->map); cl = sd->rxsd_cl; m = sd->m; if ((sopeop == RSPQ_SOP_EOP) || (sopeop == RSPQ_SOP)) flags |= M_PKTHDR; m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags); if (fl->zone == zone_pack) { /* * restore clobbered data pointer */ m->m_data = m->m_ext.ext_buf; } else { m_cljset(m, cl, fl->type); } m->m_len = len; } switch(sopeop) { case RSPQ_SOP_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_SOP: mh->mh_head = mh->mh_tail = m; m->m_pkthdr.len = len; break; case RSPQ_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_NSOP_NEOP: if (mh->mh_tail == NULL) { log(LOG_ERR, "discarding intermediate descriptor entry\n"); m_freem(m); break; } mh->mh_tail->m_next = m; mh->mh_tail = m; mh->mh_head->m_pkthdr.len += len; break; } if (cxgb_debug) printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len); done: if (++fl->cidx == fl->size) fl->cidx = 0; return (ret); } /** * handle_rsp_cntrl_info - handles control information in a response * @qs: the queue set corresponding to the response * @flags: the response control flags * * Handles the control information of an SGE response, such as GTS * indications and completion credits for the queue set's Tx queues. * HW coalesces credits, we don't do any extra SW coalescing. */ static __inline void handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags) { unsigned int credits; #if USE_GTS if (flags & F_RSPD_TXQ0_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags); #endif credits = G_RSPD_TXQ0_CR(flags); if (credits) qs->txq[TXQ_ETH].processed += credits; credits = G_RSPD_TXQ2_CR(flags); if (credits) qs->txq[TXQ_CTRL].processed += credits; # if USE_GTS if (flags & F_RSPD_TXQ1_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags); # endif credits = G_RSPD_TXQ1_CR(flags); if (credits) qs->txq[TXQ_OFLD].processed += credits; } static void check_ring_db(adapter_t *adap, struct sge_qset *qs, unsigned int sleeping) { ; } /** * process_responses - process responses from an SGE response queue * @adap: the adapter * @qs: the queue set to which the response queue belongs * @budget: how many responses can be processed in this round * * Process responses from an SGE response queue up to the supplied budget. * Responses include received packets as well as credits and other events * for the queues that belong to the response queue's queue set. * A negative budget is effectively unlimited. * * Additionally choose the interrupt holdoff time for the next interrupt * on this queue. If the system is under memory shortage use a fairly * long delay to help recovery. 
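 *
 * The memory-shortage case is the no_mem path below: when an mbuf cannot be
 * allocated, processing stops early and next_holdoff is set to
 * NOMEM_INTR_DELAY (250us) so the queue is revisited after a longer delay.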
*/ static int process_responses(adapter_t *adap, struct sge_qset *qs, int budget) { struct sge_rspq *rspq = &qs->rspq; struct rsp_desc *r = &rspq->desc[rspq->cidx]; int budget_left = budget; unsigned int sleeping = 0; #if defined(INET6) || defined(INET) int lro_enabled = qs->lro.enabled; int skip_lro; struct lro_ctrl *lro_ctrl = &qs->lro.ctrl; #endif struct t3_mbuf_hdr *mh = &rspq->rspq_mh; #ifdef DEBUG static int last_holdoff = 0; if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) { printf("next_holdoff=%d\n", rspq->holdoff_tmr); last_holdoff = rspq->holdoff_tmr; } #endif rspq->next_holdoff = rspq->holdoff_tmr; while (__predict_true(budget_left && is_new_response(r, rspq))) { int eth, eop = 0, ethpad = 0; uint32_t flags = ntohl(r->flags); uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); uint8_t opcode = r->rss_hdr.opcode; eth = (opcode == CPL_RX_PKT); if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) { struct mbuf *m; if (cxgb_debug) printf("async notification\n"); if (mh->mh_head == NULL) { mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA); m = mh->mh_head; } else { m = m_gethdr(M_NOWAIT, MT_DATA); } if (m == NULL) goto no_mem; memcpy(mtod(m, char *), r, AN_PKT_SIZE); m->m_len = m->m_pkthdr.len = AN_PKT_SIZE; *mtod(m, char *) = CPL_ASYNC_NOTIF; opcode = CPL_ASYNC_NOTIF; eop = 1; rspq->async_notif++; goto skip; } else if (flags & F_RSPD_IMM_DATA_VALID) { struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { no_mem: rspq->next_holdoff = NOMEM_INTR_DELAY; budget_left--; break; } if (mh->mh_head == NULL) mh->mh_head = m; else mh->mh_tail->m_next = m; mh->mh_tail = m; get_imm_packet(adap, r, m); mh->mh_head->m_pkthdr.len += m->m_len; eop = 1; rspq->imm_data++; } else if (r->len_cq) { int drop_thresh = eth ? SGE_RX_DROP_THRES : 0; eop = get_packet(adap, drop_thresh, qs, mh, r); if (eop) { if (r->rss_hdr.hash_type && !adap->timestamp) { M_HASHTYPE_SET(mh->mh_head, M_HASHTYPE_OPAQUE); mh->mh_head->m_pkthdr.flowid = rss_hash; } } ethpad = 2; } else { rspq->pure_rsps++; } skip: if (flags & RSPD_CTRL_MASK) { sleeping |= flags & RSPD_GTS_MASK; handle_rsp_cntrl_info(qs, flags); } if (!eth && eop) { rspq->offload_pkts++; #ifdef TCP_OFFLOAD adap->cpl_handler[opcode](qs, r, mh->mh_head); #else m_freem(mh->mh_head); #endif mh->mh_head = NULL; } else if (eth && eop) { struct mbuf *m = mh->mh_head; t3_rx_eth(adap, m, ethpad); /* * The T304 sends incoming packets on any qset. If LRO * is also enabled, we could end up sending packet up * lro_ctrl->ifp's input. That is incorrect. * * The mbuf's rcvif was derived from the cpl header and * is accurate. Skip LRO and just use that. */ #if defined(INET6) || defined(INET) skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif); if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro && (tcp_lro_rx(lro_ctrl, m, 0) == 0) ) { /* successfully queue'd for LRO */ } else #endif { /* * LRO not enabled, packet unsuitable for LRO, * or unable to queue. Pass it up right now in * either case. 
*/ struct ifnet *ifp = m->m_pkthdr.rcvif; (*ifp->if_input)(ifp, m); } mh->mh_head = NULL; } r++; if (__predict_false(++rspq->cidx == rspq->size)) { rspq->cidx = 0; rspq->gen ^= 1; r = rspq->desc; } if (++rspq->credits >= 64) { refill_rspq(adap, rspq, rspq->credits); rspq->credits = 0; } __refill_fl_lt(adap, &qs->fl[0], 32); __refill_fl_lt(adap, &qs->fl[1], 32); --budget_left; } #if defined(INET6) || defined(INET) /* Flush LRO */ while (!SLIST_EMPTY(&lro_ctrl->lro_active)) { struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active); SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next); tcp_lro_flush(lro_ctrl, queued); } #endif if (sleeping) check_ring_db(adap, qs, sleeping); mb(); /* commit Tx queue processed updates */ if (__predict_false(qs->txq_stopped > 1)) restart_tx(qs); __refill_fl_lt(adap, &qs->fl[0], 512); __refill_fl_lt(adap, &qs->fl[1], 512); budget -= budget_left; return (budget); } /* * A helper function that processes responses and issues GTS. */ static __inline int process_responses_gts(adapter_t *adap, struct sge_rspq *rq) { int work; static int last_holdoff = 0; work = process_responses(adap, rspq_to_qset(rq), -1); if (cxgb_debug && (rq->next_holdoff != last_holdoff)) { printf("next_holdoff=%d\n", rq->next_holdoff); last_holdoff = rq->next_holdoff; } t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) | V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx)); return (work); } /* * Interrupt handler for legacy INTx interrupts for T3B-based cards. * Handles data events from SGE response queues as well as error and other * async events as they all use the same interrupt pin. We use one SGE * response queue per port in this mode and protect all response queues with * queue 0's lock. */ void t3b_intr(void *data) { uint32_t i, map; adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; t3_write_reg(adap, A_PL_CLI, 0); map = t3_read_reg(adap, A_SG_DATA_INTR); if (!map) return; if (__predict_false(map & F_ERRINTR)) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } mtx_lock(&q0->lock); for_each_port(adap, i) if (map & (1 << i)) process_responses_gts(adap, &adap->sge.qs[i].rspq); mtx_unlock(&q0->lock); } /* * The MSI interrupt handler. This needs to handle data events from SGE * response queues as well as error and other async events as they all use * the same MSI vector. We use one SGE response queue per port in this mode * and protect all response queues with queue 0's lock. 
*/ void t3_intr_msi(void *data) { adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; int i, new_packets = 0; mtx_lock(&q0->lock); for_each_port(adap, i) if (process_responses_gts(adap, &adap->sge.qs[i].rspq)) new_packets = 1; mtx_unlock(&q0->lock); if (new_packets == 0) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } } void t3_intr_msix(void *data) { struct sge_qset *qs = data; adapter_t *adap = qs->port->adapter; struct sge_rspq *rspq = &qs->rspq; if (process_responses_gts(adap, rspq) == 0) rspq->unhandled_irqs++; } #define QDUMP_SBUF_SIZE 32 * 400 static int t3_dump_rspq(SYSCTL_HANDLER_ARGS) { struct sge_rspq *rspq; struct sge_qset *qs; int i, err, dump_end, idx; struct sbuf *sb; struct rsp_desc *rspd; uint32_t data[4]; rspq = arg1; qs = rspq_to_qset(rspq); if (rspq->rspq_dump_count == 0) return (0); if (rspq->rspq_dump_count > RSPQ_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", rspq->rspq_dump_count); rspq->rspq_dump_count = 0; return (EINVAL); } if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", rspq->rspq_dump_start); rspq->rspq_dump_start = 0; return (EINVAL); } err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n", (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f), ((data[2] >> 26) & 1), ((data[2] >> 27) & 1)); sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n", ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]); sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start, (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1)); dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count; for (i = rspq->rspq_dump_start; i < dump_end; i++) { idx = i & (RSPQ_Q_SIZE-1); rspd = &rspq->desc[idx]; sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n", idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx, rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx)); sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n", rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags), be32toh(rspd->len_cq), rspd->intr_gen); } err = sbuf_finish(sb); /* Output a trailing NUL. 
*/ if (err == 0) err = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (err); } static int t3_dump_txq_eth(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; uint32_t data[4]; txq = arg1; qs = txq_to_qset(txq, TXQ_ETH); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > TX_ETH_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n", (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16), (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1)); sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n", ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1), ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1)); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1)); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); /* Output a trailing NUL. */ if (err == 0) err = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (err); } static int t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; txq = arg1; qs = txq_to_qset(txq, TXQ_CTRL); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > 256) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > 255) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = sysctl_wire_old_buffer(req, 0); if (err != 0) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & 255); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (255)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); /* Output a trailing NUL. 
*/ if (err == 0) err = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (err); } static int t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; struct qset_params *qsp = &sc->params.sge.qset[0]; int coalesce_usecs; struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); coalesce_usecs = qsp->coalesce_usecs; err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req); if (err != 0) { return (err); } if (coalesce_usecs == qsp->coalesce_usecs) return (0); for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) nqsets++; coalesce_usecs = max(1, coalesce_usecs); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; qsp = &sc->params.sge.qset[i]; qsp->coalesce_usecs = coalesce_usecs; lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; mtx_lock(lock); t3_update_qset_coalesce(qs, qsp); t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) | V_NEWTIMER(qs->rspq.holdoff_tmr)); mtx_unlock(lock); } return (0); } static int t3_pkt_timestamp(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; int rc, timestamp; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); timestamp = sc->timestamp; rc = sysctl_handle_int(oidp, ×tamp, arg2, req); if (rc != 0) return (rc); if (timestamp != sc->timestamp) { t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS, timestamp ? F_ENABLERXPKTTMSTPRSS : 0); sc->timestamp = timestamp; } return (0); } void t3_add_attach_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); /* random information */ SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, &sc->params.rev, 0, "chip model"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "port_types", CTLFLAG_RD, sc->port_types, 0, "type of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "enable_debug", CTLFLAG_RW, &cxgb_debug, 0, "enable verbose debugging output"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce", CTLFLAG_RD, &sc->tunq_coalesce, "#tunneled packets freed"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "txq_overrun", CTLFLAG_RD, &txq_fills, 0, "#times txq overrun"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, &sc->params.vpd.cclk, 0, "core clock frequency (in KHz)"); } static const char *rspq_name = "rspq"; static const char *txq_names[] = { "txq_eth", "txq_ofld", "txq_ctrl" }; static int sysctl_handle_macstat(SYSCTL_HANDLER_ARGS) { struct port_info *p = arg1; uint64_t *parg; if (!p) return (EINVAL); cxgb_refresh_stats(p); parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2); return (sysctl_handle_64(oidp, parg, 0, req)); } void t3_add_configured_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; int i, j; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal", CTLTYPE_INT|CTLFLAG_RW, sc, 0, t3_set_coalesce_usecs, "I", "interrupt coalescing timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pkt_timestamp", CTLTYPE_INT | CTLFLAG_RW, sc, 0, t3_pkt_timestamp, "I", "provide packet timestamp instead of connection hash"); for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; struct sysctl_oid *poid; struct sysctl_oid_list *poidlist; struct mac_stats 
*mstats = &pi->mac.stats; snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i); poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, pi->namebuf, CTLFLAG_RD, NULL, "port statistics"); poidlist = SYSCTL_CHILDREN(poid); SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO, "nqsets", CTLFLAG_RD, &pi->nqsets, 0, "#queue sets"); for (j = 0; j < pi->nqsets; j++) { struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j]; struct sysctl_oid *qspoid, *rspqpoid, *txqpoid, *ctrlqpoid, *lropoid; struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist, *ctrlqpoidlist, *lropoidlist; struct sge_txq *txq = &qs->txq[TXQ_ETH]; snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j); qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, qs->namebuf, CTLFLAG_RD, NULL, "qset statistics"); qspoidlist = SYSCTL_CHILDREN(qspoid); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty", CTLFLAG_RD, &qs->fl[0].empty, 0, "freelist #0 empty"); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty", CTLFLAG_RD, &qs->fl[1].empty, 0, "freelist #1 empty"); rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, rspq_name, CTLFLAG_RD, NULL, "rspq statistics"); rspqpoidlist = SYSCTL_CHILDREN(rspqpoid); txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[0], CTLFLAG_RD, NULL, "txq statistics"); txqpoidlist = SYSCTL_CHILDREN(txqpoid); ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics"); ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid); lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, "lro_stats", CTLFLAG_RD, NULL, "LRO statistics"); lropoidlist = SYSCTL_CHILDREN(lropoid); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size", CTLFLAG_RD, &qs->rspq.size, 0, "#entries in response queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx", CTLFLAG_RD, &qs->rspq.cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits", CTLFLAG_RD, &qs->rspq.credits, 0, "#credits"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved", CTLFLAG_RD, &qs->rspq.starved, 0, "#times starved"); SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &qs->rspq.phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->rspq.rspq_dump_start, 0, "start rspq dump entry"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->rspq.rspq_dump_count, 0, "#rspq entries to dump"); SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq, 0, t3_dump_rspq, "A", "dump of the response queue"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped", CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops, "#tunneled packets dropped"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen", - CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen, + CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len, 0, "#tunneled packets waiting to be sent"); #if 0 SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod, 0, "#tunneled packets queue producer index"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons, 0, "#tunneled packets queue consumer index"); #endif SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed", CTLFLAG_RD, &qs->txq[TXQ_ETH].processed, 0, "#tunneled packets processed by the card"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned", CTLFLAG_RD, &txq->cleaned, 0, "#tunneled packets cleaned"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use", 
CTLFLAG_RD, &txq->in_use, 0, "#tunneled packet slots in use"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees", CTLFLAG_RD, &txq->txq_frees, "#tunneled packets freed"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped", CTLFLAG_RD, &txq->txq_skipped, 0, "#tunneled packet descriptors skipped"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced", CTLFLAG_RD, &txq->txq_coalesced, "#tunneled packets coalesced"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued", CTLFLAG_RD, &txq->txq_enqueued, 0, "#tunneled packets enqueued to hardware"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags", CTLFLAG_RD, &qs->txq_stopped, 0, "tx queues stopped"); SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &txq->phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen", CTLFLAG_RW, &qs->txq[TXQ_ETH].gen, 0, "txq generation"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx", CTLFLAG_RD, &txq->cidx, 0, "hardware queue cidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx", CTLFLAG_RD, &txq->pidx, 0, "hardware queue pidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start, 0, "txq start idx for dump"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count, 0, "txq #entries to dump"); SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH], 0, t3_dump_txq_eth, "A", "dump of the transmit queue"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start, 0, "ctrlq start idx for dump"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count, 0, "ctrl #entries to dump"); SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL], 0, t3_dump_txq_ctrl, "A", "dump of the transmit queue"); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued", CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed", CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum", CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt", CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL); } /* Now add a node for mac stats. */ poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC statistics"); poidlist = SYSCTL_CHILDREN(poid); /* * We (ab)use the length argument (arg2) to pass on the offset * of the data that we are interested in. This is only required * for the quad counters that are updated from the hardware (we * make sure that we return the latest value). * sysctl_handle_macstat first updates *all* the counters from * the hardware, and then returns the latest value of the * requested counter. Best would be to update only the * requested counter from hardware, but t3_mac_update_stats() * hides all the register details and we don't want to dive into * all that here. 
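 *
 * For example, the tx_octets node below is registered with
 * arg2 = offsetof(struct mac_stats, tx_octets); the handler refreshes the
 * whole stats block and then returns the 64-bit counter at that offset.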
*/ #define CXGB_SYSCTL_ADD_QUAD(a) SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \ (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \ sysctl_handle_macstat, "QU", 0) CXGB_SYSCTL_ADD_QUAD(tx_octets); CXGB_SYSCTL_ADD_QUAD(tx_octets_bad); CXGB_SYSCTL_ADD_QUAD(tx_frames); CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_pause); CXGB_SYSCTL_ADD_QUAD(tx_deferred); CXGB_SYSCTL_ADD_QUAD(tx_late_collisions); CXGB_SYSCTL_ADD_QUAD(tx_total_collisions); CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions); CXGB_SYSCTL_ADD_QUAD(tx_underrun); CXGB_SYSCTL_ADD_QUAD(tx_len_errs); CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral); CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(tx_frames_64); CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max); CXGB_SYSCTL_ADD_QUAD(rx_octets); CXGB_SYSCTL_ADD_QUAD(rx_octets_bad); CXGB_SYSCTL_ADD_QUAD(rx_frames); CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_pause); CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(rx_align_errs); CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs); CXGB_SYSCTL_ADD_QUAD(rx_data_errs); CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs); CXGB_SYSCTL_ADD_QUAD(rx_runt); CXGB_SYSCTL_ADD_QUAD(rx_jabber); CXGB_SYSCTL_ADD_QUAD(rx_short); CXGB_SYSCTL_ADD_QUAD(rx_too_long); CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(rx_cong_drops); CXGB_SYSCTL_ADD_QUAD(rx_frames_64); CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max); #undef CXGB_SYSCTL_ADD_QUAD #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \ CTLFLAG_RD, &mstats->a, 0) CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun); CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl); CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change); CXGB_SYSCTL_ADD_ULONG(num_toggled); CXGB_SYSCTL_ADD_ULONG(num_resets); CXGB_SYSCTL_ADD_ULONG(link_faults); #undef CXGB_SYSCTL_ADD_ULONG } } /** * t3_get_desc - dump an SGE descriptor for debugging purposes * @qs: the queue set * @qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx) * @idx: the descriptor index in the queue * @data: where to dump the descriptor contents * * Dumps the contents of a HW descriptor of an SGE queue. Returns the * size of the descriptor. 
*/ int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data) { if (qnum >= 6) return (EINVAL); if (qnum < 3) { if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size) return -EINVAL; memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc)); return sizeof(struct tx_desc); } if (qnum == 3) { if (!qs->rspq.desc || idx >= qs->rspq.size) return (EINVAL); memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc)); return sizeof(struct rsp_desc); } qnum -= 4; if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size) return (EINVAL); memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc)); return sizeof(struct rx_desc); } Index: head/sys/dev/cxgb/sys/mbufq.h =================================================================== --- head/sys/dev/cxgb/sys/mbufq.h (revision 278976) +++ head/sys/dev/cxgb/sys/mbufq.h (nonexistent) @@ -1,123 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -$FreeBSD$ - -***************************************************************************/ - -#ifndef CXGB_MBUFQ_H_ -#define CXGB_MBUFQ_H_ - -struct mbuf_head { - struct mbuf *head; - struct mbuf *tail; - uint32_t qlen; - uint32_t qsize; - struct mtx lock; -}; - -static __inline void -mbufq_init(struct mbuf_head *l) -{ - l->head = l->tail = NULL; - l->qlen = l->qsize = 0; -} - -static __inline int -mbufq_empty(struct mbuf_head *l) -{ - return (l->head == NULL); -} - -static __inline int -mbufq_len(struct mbuf_head *l) -{ - return (l->qlen); -} - -static __inline int -mbufq_size(struct mbuf_head *l) -{ - return (l->qsize); -} - -static __inline int -mbufq_head_size(struct mbuf_head *l) -{ - return (l->head ? 
l->head->m_pkthdr.len : 0); -} - -static __inline void -mbufq_tail(struct mbuf_head *l, struct mbuf *m) -{ - l->qlen++; - if (l->head == NULL) - l->head = m; - else - l->tail->m_nextpkt = m; - l->tail = m; - l->qsize += m->m_pkthdr.len; -} - -static __inline struct mbuf * -mbufq_dequeue(struct mbuf_head *l) -{ - struct mbuf *m; - - m = l->head; - if (m) { - if (m == l->tail) - l->head = l->tail = NULL; - else - l->head = m->m_nextpkt; - m->m_nextpkt = NULL; - l->qlen--; - l->qsize -= m->m_pkthdr.len; - } - - return (m); -} - -static __inline struct mbuf * -mbufq_peek(const struct mbuf_head *l) -{ - return (l->head); -} - -static __inline void -mbufq_append(struct mbuf_head *a, struct mbuf_head *b) -{ - if (a->tail) - a->tail->m_nextpkt = b->head; - if (b->tail) - a->tail = b->tail; - a->qlen += b->qlen; - a->qsize += b->qsize; - - -} -#endif /* CXGB_MBUFQ_H_ */ Property changes on: head/sys/dev/cxgb/sys/mbufq.h ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c =================================================================== --- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 278976) +++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 278977) @@ -1,1813 +1,1813 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include "cxgb_include.h" #include "ulp/tom/cxgb_l2t.h" #include "ulp/tom/cxgb_tom.h" #include "ulp/tom/cxgb_toepcb.h" VNET_DECLARE(int, tcp_do_autosndbuf); #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) VNET_DECLARE(int, tcp_autosndbuf_inc); #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) VNET_DECLARE(int, tcp_autosndbuf_max); #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) VNET_DECLARE(int, tcp_do_autorcvbuf); #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) VNET_DECLARE(int, tcp_autorcvbuf_inc); #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) VNET_DECLARE(int, tcp_autorcvbuf_max); #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) extern int always_keepalive; /* * For ULP connections HW may add headers, e.g., for digests, that aren't part * of the messages sent by the host but that are part of the TCP payload and * therefore consume TCP sequence space. Tx connection parameters that * operate in TCP sequence space are affected by the HW additions and need to * compensate for them to accurately track TCP sequence numbers. This array * contains the compensating extra lengths for ULP packets. It is indexed by * a packet's ULP submode. */ const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. */ #define MAX_RCV_WND ((1U << 27) - 1) /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) static void t3_release_offload_resources(struct toepcb *); static void send_reset(struct toepcb *toep); /* * Called after the last CPL for the toepcb has been received. * * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the * time this function exits. */ static int toepcb_release(struct toepcb *toep) { struct inpcb *inp = toep->tp_inp; struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); int rc; INP_WLOCK_ASSERT(inp); KASSERT(!(toep->tp_flags & TP_CPL_DONE), ("%s: double release?", __func__)); CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid); toep->tp_flags |= TP_CPL_DONE; toep->tp_inp = NULL; mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); if (!(toep->tp_flags & TP_ATTACHED)) t3_release_offload_resources(toep); rc = in_pcbrele_wlocked(inp); if (!rc) INP_WUNLOCK(inp); return (rc); } /* * One sided detach. The tcpcb is going away and we need to unhook the toepcb * hanging off it. If the TOE driver is also done with the toepcb we'll release * all offload resources. 
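The two-sided teardown described in this comment is easy to misread: toepcb_release() above runs when the last CPL arrives and toepcb_detach() below runs when the kernel tcpcb lets go, and whichever of the two runs second frees the offload resources. A stand-alone model of that hand-off, assuming nothing beyond the TP_CPL_DONE/TP_ATTACHED checks visible in the real functions (illustrative only):

struct teardown {
	int cpl_done;		/* mirrors TP_CPL_DONE */
	int attached;		/* mirrors TP_ATTACHED */
};

/* Final CPL processed; release resources only if already detached. */
static int
last_cpl_arrived(struct teardown *t)
{
	t->cpl_done = 1;
	return (t->attached == 0);
}

/* Kernel detached its tcpcb; release resources only if the CPLs are done. */
static int
kernel_let_go(struct teardown *t)
{
	t->attached = 0;
	return (t->cpl_done != 0);
}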
*/ static void toepcb_detach(struct inpcb *inp) { struct toepcb *toep; struct tcpcb *tp; KASSERT(inp, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); toep = tp->t_toe; KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__)); CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__, tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid, toep, inp, tp); tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->tp_flags &= ~TP_ATTACHED; if (toep->tp_flags & TP_CPL_DONE) t3_release_offload_resources(toep); } void t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { toepcb_detach(tp->t_inpcb); } static int alloc_atid(struct tid_info *t, void *ctx) { int atid = -1; mtx_lock(&t->atid_lock); if (t->afree) { union active_open_entry *p = t->afree; atid = (p - t->atid_tab) + t->atid_base; t->afree = p->next; p->ctx = ctx; t->atids_in_use++; } mtx_unlock(&t->atid_lock); return (atid); } static void free_atid(struct tid_info *t, int atid) { union active_open_entry *p = atid2entry(t, atid); mtx_lock(&t->atid_lock); p->next = t->afree; t->afree = p; t->atids_in_use--; mtx_unlock(&t->atid_lock); } void insert_tid(struct tom_data *td, void *ctx, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = ctx; atomic_add_int(&t->tids_in_use, 1); } void update_tid(struct tom_data *td, void *ctx, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = ctx; } void remove_tid(struct tom_data *td, unsigned int tid) { struct tid_info *t = &td->tid_maps; t->tid_tab[tid] = NULL; atomic_add_int(&t->tids_in_use, -1); } /* use ctx as a next pointer in the tid release list */ void queue_tid_release(struct toedev *tod, unsigned int tid) { struct tom_data *td = t3_tomdata(tod); void **p = &td->tid_maps.tid_tab[tid]; struct adapter *sc = tod->tod_softc; mtx_lock(&td->tid_release_lock); *p = td->tid_release_list; td->tid_release_list = p; if (!*p) taskqueue_enqueue(sc->tq, &td->tid_release_task); mtx_unlock(&td->tid_release_lock); } /* * Populate a TID_RELEASE WR. */ static inline void mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid) { cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); } void release_tid(struct toedev *tod, unsigned int tid, int qset) { struct tom_data *td = t3_tomdata(tod); struct adapter *sc = tod->tod_softc; struct mbuf *m; struct cpl_tid_release *cpl; #ifdef INVARIANTS struct tid_info *t = &td->tid_maps; #endif KASSERT(tid >= 0 && tid < t->ntids, ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids)); m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); if (m) { mk_tid_release(cpl, tid); t3_offload_tx(sc, m); remove_tid(td, tid); } else queue_tid_release(tod, tid); } void t3_process_tid_release_list(void *data, int pending) { struct mbuf *m; struct tom_data *td = data; struct adapter *sc = td->tod.tod_softc; mtx_lock(&td->tid_release_lock); while (td->tid_release_list) { void **p = td->tid_release_list; unsigned int tid = p - td->tid_maps.tid_tab; struct cpl_tid_release *cpl; td->tid_release_list = (void **)*p; m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */ if (m == NULL) break; /* XXX: who reschedules the release task? 
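The trick used by queue_tid_release() above, and undone by t3_process_tid_release_list(), is compact enough to miss: the tid_tab[] slot of a TID awaiting release doubles as the "next" pointer of a singly linked list, and the TID itself is recovered later from the slot's position in the array. A stand-alone sketch of just that chaining (illustrative only; names are made up):

static void *tab[8];		/* stand-in for tid_maps.tid_tab */
static void **pending;		/* stand-in for td->tid_release_list */

static void
defer_release(unsigned int tid)
{
	void **slot = &tab[tid];

	*slot = pending;	/* the old list head becomes our "next" */
	pending = slot;
}

static unsigned int
pop_deferred(void)		/* caller checks pending != NULL first */
{
	void **slot = pending;

	pending = (void **)*slot;
	return ((unsigned int)(slot - tab));	/* the TID is the slot's index */
}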
*/ mtx_unlock(&td->tid_release_lock); mk_tid_release(cpl, tid); t3_offload_tx(sc, m); remove_tid(td, tid); mtx_lock(&td->tid_release_lock); } mtx_unlock(&td->tid_release_lock); } static void close_conn(struct adapter *sc, struct toepcb *toep) { struct mbuf *m; struct cpl_close_con_req *req; if (toep->tp_flags & TP_FIN_SENT) return; m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); if (m == NULL) CXGB_UNIMPLEMENTED(); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid)); req->rsvd = 0; toep->tp_flags |= TP_FIN_SENT; t3_offload_tx(sc, m); } static inline void make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len, struct mbuf *tail) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct sockbuf *snd; inp_lock_assert(tp->t_inpcb); snd = so_sockbuf_snd(so); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid)); /* len includes the length of any HW ULP additions */ req->len = htonl(len); req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); /* V_TX_ULP_SUBMODE sets both the mode and submode */ req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) | V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1))); req->sndseq = htonl(tp->snd_nxt); if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { struct adapter *sc = toep->tp_tod->tod_softc; int cpu_idx = sc->rrss_map[toep->tp_qset]; req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | V_TX_CPU_IDX(cpu_idx)); /* Sendbuffer is in units of 32KB. */ if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15)); else req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); toep->tp_flags |= TP_DATASENT; } } /* * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc. * TOM_XXX_MOVE to some common header file. */ /* * IMM_LEN: # of bytes that can be tx'd as immediate data. There are 16 flits * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more * for the second gen bit flit. This leaves us with 12 flits. * * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs. * The first desc has a tx_data_wr (which includes the WR header), the rest have * the WR header only. All descs have the second gen bit flit. * * sgllen_to_descs: # of tx descs used up by an sgl of given length. The first * desc has a tx_data_wr (which includes the WR header), the rest have the WR * header only. All descs have the second gen bit flit. * * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits. 
* */ #define IMM_LEN 96 static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35}; static int sgllen_to_descs[TX_MAX_SEGS] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, /* 0 - 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, /* 10 - 19 */ 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, /* 20 - 29 */ 4, 4, 4, 4, 4, 4 /* 30 - 35 */ }; #if 0 static int flits_to_sgllen[TX_DESC_FLITS + 1] = { 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10 }; #endif #if SGE_NUM_GENBITS != 2 #error "SGE_NUM_GENBITS really must be 2" #endif int t3_push_frames(struct socket *so, int req_completion) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m0, *sndptr, *m; struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; int bytes, ndesc, total_bytes = 0, mlen; struct sockbuf *snd; struct sglist *sgl; struct ofld_hdr *oh; caddr_t dst; struct tx_data_wr *wr; inp_lock_assert(tp->t_inpcb); snd = so_sockbuf_snd(so); SOCKBUF_LOCK(snd); /* * Autosize the send buffer. */ if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) { if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) && sbused(snd) < VNET(tcp_autosndbuf_max)) { if (!sbreserve_locked(snd, min(snd->sb_hiwat + VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)), so, curthread)) snd->sb_flags &= ~SB_AUTOSIZE; } } if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr) sndptr = toep->tp_m_last->m_next; else sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; /* Nothing to send or no WRs available for sending data */ if (toep->tp_wr_avail == 0 || sndptr == NULL) goto out; /* Something to send and at least 1 WR available */ while (toep->tp_wr_avail && sndptr != NULL) { m0 = m_gethdr(M_NOWAIT, MT_DATA); if (m0 == NULL) break; oh = mtod(m0, struct ofld_hdr *); wr = (void *)(oh + 1); dst = (void *)(wr + 1); m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr); oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF | V_HDR_QSET(toep->tp_qset); /* * Try to construct an immediate data WR if possible. Stuff as * much data into it as possible, one whole mbuf at a time. 
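The two tables above are inverses of each other: sgllen_to_descs[n] is the smallest descriptor count d with descs_to_sgllen[d] >= n, so 8 segments fit in one descriptor, 9 need two, and 18 need three, matching the entries listed. A sketch of that relation for nseg >= 1 (illustrative only, not driver code):

static int
segs_to_descs(int nseg)
{
	int d;

	for (d = 1; d <= TX_MAX_DESC; d++) {
		if (descs_to_sgllen[d] >= nseg)
			return (d);
	}
	return (-1);	/* would not fit; nseg must stay below TX_MAX_SEGS */
}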
*/ mlen = sndptr->m_len; ndesc = bytes = 0; while (mlen <= IMM_LEN - bytes) { bcopy(sndptr->m_data, dst, mlen); bytes += mlen; dst += mlen; if (!(sndptr = sndptr->m_next)) break; mlen = sndptr->m_len; } if (bytes) { /* Was able to fit 'bytes' bytes in an immediate WR */ ndesc = 1; make_tx_data_wr(so, wr, bytes, sndptr); m0->m_len += bytes; m0->m_pkthdr.len = m0->m_len; } else { int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC); /* Need to make an SGL */ sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT); if (sgl == NULL) break; for (m = sndptr; m != NULL; m = m->m_next) { if ((mlen = m->m_len) > 0) { if (sglist_append(sgl, m->m_data, mlen)) break; } bytes += mlen; } sndptr = m; if (bytes == 0) { sglist_free(sgl); break; } ndesc = sgllen_to_descs[sgl->sg_nseg]; oh->flags |= F_HDR_SGL; oh->sgl = sgl; make_tx_data_wr(so, wr, bytes, sndptr); } oh->flags |= V_HDR_NDESC(ndesc); oh->plen = bytes; snd->sb_sndptr = sndptr; snd->sb_sndptroff += bytes; if (sndptr == NULL) { snd->sb_sndptr = snd->sb_mbtail; snd->sb_sndptroff -= snd->sb_mbtail->m_len; toep->tp_m_last = snd->sb_mbtail; } else toep->tp_m_last = NULL; total_bytes += bytes; toep->tp_wr_avail -= ndesc; toep->tp_wr_unacked += ndesc; if ((req_completion && toep->tp_wr_unacked == ndesc) || toep->tp_wr_unacked >= toep->tp_wr_max / 2) { wr->wr.wrh_hi |= htonl(F_WR_COMPL); toep->tp_wr_unacked = 0; } enqueue_wr(toep, m0); l2t_send(sc, m0, toep->tp_l2t); } out: SOCKBUF_UNLOCK(snd); if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN)) close_conn(sc, toep); return (total_bytes); } static int send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct mbuf *m; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req); if (m == NULL) return (0); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wrh_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); t3_offload_tx(sc, m); return (credits); } void t3_rcvd(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; struct sockbuf *so_rcv = &so->so_rcv; struct toepcb *toep = tp->t_toe; int must_send; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(so_rcv); KASSERT(toep->tp_enqueued >= sbused(so_rcv), ("%s: sbused(so_rcv) > enqueued", __func__)); toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv); toep->tp_enqueued = sbused(so_rcv); SOCKBUF_UNLOCK(so_rcv); must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd; if (must_send || toep->tp_rx_credits >= 15 * 1024) { int credits; credits = send_rx_credits(sc, toep, toep->tp_rx_credits); toep->tp_rx_credits -= credits; tp->rcv_wnd += credits; tp->rcv_adv += credits; } } static int do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rx_urg_notify *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp); m_freem(m); return (0); } int t3_send_fin(struct toedev *tod, struct tcpcb *tp) { struct toepcb *toep = tp->t_toe; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp_inpcbtosocket(inp); #if defined(KTR) unsigned int tid = toep->tp_tid; #endif INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, 
tid, toep, toep->tp_flags); toep->tp_flags |= TP_SEND_FIN; t3_push_frames(so, 1); return (0); } int t3_tod_output(struct toedev *tod, struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; t3_push_frames(so, 1); return (0); } /* What mtu_idx to use, given a 4-tuple and/or an MSS cap */ int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) { unsigned short *mtus = &sc->params.mtus[0]; int i = 0, mss; KASSERT(inc != NULL || pmss > 0, ("%s: at least one of inc/pmss must be specified", __func__)); mss = inc ? tcp_mssopt(inc) : pmss; if (pmss > 0 && mss > pmss) mss = pmss; while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40) ++i; return (i); } static inline void purge_wr_queue(struct toepcb *toep) { struct mbuf *m; struct ofld_hdr *oh; while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) { oh = mtod(m, struct ofld_hdr *); if (oh->flags & F_HDR_SGL) sglist_free(oh->sgl); m_freem(m); } } /* * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T * entry, etc.) */ static void t3_release_offload_resources(struct toepcb *toep) { struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); /* * The TOM explicitly detaches its toepcb from the system's inp before * it releases the offload resources. */ if (toep->tp_inp) { panic("%s: inp %p still attached to toepcb %p", __func__, toep->tp_inp, toep); } if (toep->tp_wr_avail != toep->tp_wr_max) purge_wr_queue(toep); if (toep->tp_l2t) { l2t_release(td->l2t, toep->tp_l2t); toep->tp_l2t = NULL; } if (toep->tp_tid >= 0) release_tid(tod, toep->tp_tid, toep->tp_qset); toepcb_free(toep); } /* * Determine the receive window size for a socket. */ unsigned long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); INP_WLOCK_ASSERT(inp); /* Update socket */ SOCKBUF_LOCK(&so->so_snd); so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(&so->so_rcv); /* Update TCP PCB */ tp->tod = toep->tp_tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->tp_inp = inp; toep->tp_flags |= TP_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* This is _not_ the normal way to "unoffload" a socket. 
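One concrete consequence of select_rcv_wscale() above: the usable space is capped at MAX_RCV_WND (2^27 - 1), and 65535 << 11 still falls slightly short of that, so the largest window shift this driver ever requests is 12, comfortably inside TCP_MAX_WINSHIFT. A small check of that arithmetic (illustrative only):

static void
wscale_check(void)
{
	unsigned long space = MAX_RCV_WND;	/* (1U << 27) - 1 */
	int wscale = 0;

	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;

	/* 65535 << 11 == 134215680 < 134217727, so one more pass is taken. */
	KASSERT(wscale == 12, ("expected a shift of 12, got %d", wscale));
}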
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct toedev *tod = toep->tp_tod; struct tom_data *td = t3_tomdata(tod); INP_WLOCK_ASSERT(inp); so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE; so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE; tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->tp_inp = NULL; toep->tp_flags &= ~TP_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } /* * Socket could be a listening socket, and we may not have a toepcb at all at * this time. */ uint32_t calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e) { uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx); if (so != NULL) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int keepalive = always_keepalive || so_options_get(so) & SO_KEEPALIVE; opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); opt0h |= V_KEEP_ALIVE(keepalive != 0); } if (e != NULL) opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx); return (htobe32(opt0h)); } uint32_t calc_opt0l(struct socket *so, int rcv_bufsize) { uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize); KASSERT(rcv_bufsize <= M_RCV_BUFSIZ, ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize)); if (so != NULL) /* optional because noone cares about IP TOS */ opt0l |= V_TOS(INP_TOS(sotoinpcb(so))); return (htobe32(opt0l)); } /* * Convert an ACT_OPEN_RPL status to an errno. */ static int act_open_rpl_status_to_errno(int status) { switch (status) { case CPL_ERR_CONN_RESET: return (ECONNREFUSED); case CPL_ERR_ARP_MISS: return (EHOSTUNREACH); case CPL_ERR_CONN_TIMEDOUT: return (ETIMEDOUT); case CPL_ERR_TCAM_FULL: return (EAGAIN); case CPL_ERR_CONN_EXIST: log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); return (EAGAIN); default: return (EIO); } } /* * Return whether a failed active open has allocated a TID */ static inline int act_open_has_tid(int status) { return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && status != CPL_ERR_ARP_MISS; } /* * Active open failed. */ static int do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; struct cpl_act_open_rpl *rpl = mtod(m, void *); unsigned int atid = G_TID(ntohl(rpl->atid)); struct toepcb *toep = lookup_atid(&td->tid_maps, atid); struct inpcb *inp = toep->tp_inp; int s = rpl->status, rc; CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s); free_atid(&td->tid_maps, atid); toep->tp_tid = -1; if (act_open_has_tid(s)) queue_tid_release(tod, GET_TID(rpl)); rc = act_open_rpl_status_to_errno(s); if (rc != EAGAIN) INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, rc); toepcb_release(toep); /* unlocks inp */ if (rc != EAGAIN) INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* * Send an active open request. * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (this has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. 
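A detail of t3_connect() below that is easy to miss: the initial receive window travels in opt0's RCV_BUFSIZ field in 1 KB units and is therefore clamped to M_RCV_BUFSIZ; make_established() later converts it back to bytes with << 10, and whatever did not fit is kept in tp_rx_credits and handed to the chip afterwards via CPL_RX_DATA_ACK. A one-line helper expressing that clamp (illustrative only, not in the driver):

static unsigned int
opt0_rcv_bufsize(unsigned long wnd_bytes)
{
	/* 1 KB units, limited by the width of the RCV_BUFSIZ field. */
	return (min(wnd_bytes >> 10, M_RCV_BUFSIZ));
}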
*/ int t3_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct mbuf *m = NULL; struct l2t_entry *e = NULL; struct tom_data *td = t3_tomdata(tod); struct adapter *sc = tod->tod_softc; struct cpl_act_open_req *cpl; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep; int atid = -1, mtu_idx, rscale, cpu_idx, qset; struct sockaddr *gw; struct ifnet *ifp = rt->rt_ifp; struct port_info *pi = ifp->if_softc; /* XXX wrong for VLAN etc. */ INP_WLOCK_ASSERT(inp); toep = toepcb_alloc(tod); if (toep == NULL) goto failed; atid = alloc_atid(&td->tid_maps, toep); if (atid < 0) goto failed; qset = pi->first_qset + (arc4random() % pi->nqsets); m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl); if (m == NULL) goto failed; gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam; e = t3_l2t_get(pi, ifp, gw); if (e == NULL) goto failed; toep->tp_l2t = e; toep->tp_tid = atid; /* used to double check response */ toep->tp_qset = qset; SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); offload_socket(so, toep); /* * The kernel sets request_r_scale based on sb_max whereas we need to * take hardware's MAX_RCV_WND into account too. This is normally a * no-op as MAX_RCV_WND is much larger than the default sb_max. */ if (tp->t_flags & TF_REQ_SCALE) rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); cpu_idx = sc->rrss_map[qset]; cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD)); cpl->wr.wrh_lo = 0; OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e); cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits); cpl->params = 0; cpl->opt2 = calc_opt2(cpu_idx); CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__, toep->tp_tid, tcpstates[tp->t_state], toep, inp); if (l2t_send(sc, m, e) == 0) return (0); undo_offload_socket(so); failed: CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p", __func__, atid, toep, e, m); if (atid >= 0) free_atid(&td->tid_maps, atid); if (e) l2t_release(td->l2t, e); if (toep) toepcb_free(toep); m_freem(m); return (ENOMEM); } /* * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do not * send multiple ABORT_REQs for the same connection and also that we do not try * to send a message after the connection has closed. 
*/ static void send_reset(struct toepcb *toep) { struct cpl_abort_req *req; unsigned int tid = toep->tp_tid; struct inpcb *inp = toep->tp_inp; struct socket *so = inp->inp_socket; struct tcpcb *tp = intotcpcb(inp); struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; struct mbuf *m; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep, toep->tp_flags); if (toep->tp_flags & TP_ABORT_SHUTDOWN) return; toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN); /* Purge the send queue */ sbflush(so_sockbuf_snd(so)); purge_wr_queue(toep); m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); if (m == NULL) CXGB_UNIMPLEMENTED(); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); req->rsvd0 = htonl(tp->snd_nxt); req->rsvd1 = !(toep->tp_flags & TP_DATASENT); req->cmd = CPL_ABORT_SEND_RST; if (tp->t_state == TCPS_SYN_SENT) - mbufq_tail(&toep->out_of_order_queue, m); /* defer */ + (void )mbufq_enqueue(&toep->out_of_order_queue, m); /* defer */ else l2t_send(sc, m, toep->tp_l2t); } int t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp) { send_reset(tp->t_toe); return (0); } /* * Handler for RX_DATA CPL messages. */ static int do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rx_data *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; struct sockbuf *so_rcv; /* Advance over CPL */ m_adj(m, sizeof(*hdr)); /* XXX: revisit. This comes from the T4 TOM */ if (__predict_false(inp == NULL)) { /* * do_pass_establish failed and must be attempting to abort the * connection. Meanwhile, the T4 has sent us data for such a * connection. 
*/ #ifdef notyet KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), ("%s: inp NULL and tid isn't being aborted", __func__)); #endif m_freem(m); return (0); } INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, m->m_pkthdr.len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) toep->tp_delack_mode = hdr->dack_mode; tp = intotcpcb(inp); #ifdef INVARIANTS if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) { log(LOG_ERR, "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt); } #endif tp->rcv_nxt += m->m_pkthdr.len; KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, ("%s: negative window size", __func__)); tp->rcv_wnd -= m->m_pkthdr.len; tp->t_rcvtime = ticks; so = inp->inp_socket; so_rcv = &so->so_rcv; SOCKBUF_LOCK(so_rcv); if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)", __func__, tid, m->m_pkthdr.len); SOCKBUF_UNLOCK(so_rcv); INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* receive buffer autosize */ if (so_rcv->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) { unsigned int hiwat = so_rcv->sb_hiwat; unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so_rcv, newsize, so, NULL)) so_rcv->sb_flags &= ~SB_AUTOSIZE; else toep->tp_rx_credits += newsize - hiwat; } toep->tp_enqueued += m->m_pkthdr.len; sbappendstream_locked(so_rcv, m, 0); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(so_rcv); INP_WUNLOCK(inp); return (0); } /* * Handler for PEER_CLOSE CPL messages. */ static int do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_peer_close *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp); if (toep->tp_flags & TP_ABORT_RPL_PENDING) goto done; so = inp_inpcbtosocket(inp); socantrcvmore(so); tp->rcv_nxt++; switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_WUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ m_freem(m); return (0); default: log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", __func__, toep->tp_tid, tp->t_state); } done: INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return (0); } /* * Handler for CLOSE_CON_RPL CPL messages. peer ACK to our FIN received. 
*/ static int do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_close_con_rpl *rpl = mtod(m, void *); unsigned int tid = GET_TID(rpl); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags); if ((toep->tp_flags & TP_ABORT_RPL_PENDING)) goto done; so = inp_inpcbtosocket(inp); tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ switch (tp->t_state) { case TCPS_CLOSING: tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_WUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ m_freem(m); return (0); case TCPS_LAST_ACK: if (tcp_close(tp)) INP_WUNLOCK(inp); goto release; case TCPS_FIN_WAIT_1: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) soisdisconnected(so); tp->t_state = TCPS_FIN_WAIT_2; break; default: log(LOG_ERR, "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", __func__, toep->tp_tid, tp->t_state); } done: INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return (0); } static int do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct cpl_smt_write_rpl *rpl = mtod(m, void *); if (rpl->status != CPL_ERR_NONE) { log(LOG_ERR, "Unexpected SMT_WRITE_RPL status %u for entry %u\n", rpl->status, GET_TID(rpl)); } m_freem(m); return (0); } static int do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct cpl_set_tcb_rpl *rpl = mtod(m, void *); if (rpl->status != CPL_ERR_NONE) { log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n", rpl->status, GET_TID(rpl)); } m_freem(m); return (0); } /* * Handle an ABORT_RPL_RSS CPL message. */ static int do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); unsigned int tid = GET_TID(rpl); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; /* * Ignore replies to post-close aborts indicating that the abort was * requested too late. These connections are terminated when we get * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss * arrives the TID is either no longer used or it has been recycled. */ if (rpl->status == CPL_ERR_ABORT_FAILED) { m_freem(m); return (0); } if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) return (do_abort_rpl_synqe(qs, r, m)); CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, rpl->status); inp = toep->tp_inp; INP_WLOCK(inp); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { toep->tp_flags |= TP_ABORT_RPL_RCVD; INP_WUNLOCK(inp); } else { toep->tp_flags &= ~TP_ABORT_RPL_RCVD; toep->tp_flags &= TP_ABORT_RPL_PENDING; toepcb_release(toep); /* no more CPLs expected */ } } m_freem(m); return (0); } /* * Convert the status code of an ABORT_REQ into a FreeBSD error code. */ static int abort_status_to_errno(struct tcpcb *tp, int abort_reason) { switch (abort_reason) { case CPL_ERR_BAD_SYN: case CPL_ERR_CONN_RESET: return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: return (ETIMEDOUT); default: return (EIO); } } /* * Returns whether an ABORT_REQ_RSS message is a negative advice. */ static inline int is_neg_adv_abort(unsigned int status) { return status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE; } void send_abort_rpl(struct toedev *tod, int tid, int qset) { struct mbuf *reply; struct cpl_abort_rpl *rpl; struct adapter *sc = tod->tod_softc; reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); if (!reply) CXGB_UNIMPLEMENTED(); rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); rpl->cmd = CPL_ABORT_NO_RST; t3_offload_tx(sc, reply); } /* * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we * ignore this request except that we need to reply to it. */ static int do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; const struct cpl_abort_req_rss *req = mtod(m, void *); unsigned int tid = GET_TID(req); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; struct tcpcb *tp; struct socket *so; int qset = toep->tp_qset; if (is_neg_adv_abort(req->status)) { CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)", __func__, req->status, tid, toep->tp_flags); m_freem(m); return (0); } if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) return (do_abort_req_synqe(qs, r, m)); inp = toep->tp_inp; INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); so = inp->inp_socket; CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d", __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags, req->status); if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) { toep->tp_flags |= TP_ABORT_REQ_RCVD; toep->tp_flags |= TP_ABORT_SHUTDOWN; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return (0); } toep->tp_flags &= ~TP_ABORT_REQ_RCVD; /* * If we'd sent a reset on this toep, we'll ignore this and clean up in * the T3's reply to our reset instead. */ if (toep->tp_flags & TP_ABORT_RPL_PENDING) { toep->tp_flags |= TP_ABORT_RPL_SENT; INP_WUNLOCK(inp); } else { so_error_set(so, abort_status_to_errno(tp, req->status)); tp = tcp_close(tp); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ toepcb_release(toep); /* no more CPLs expected */ } INP_INFO_WUNLOCK(&V_tcbinfo); send_abort_rpl(tod, tid, qset); m_freem(m); return (0); } static void assign_rxopt(struct tcpcb *tp, uint16_t tcpopt) { struct toepcb *toep = tp->t_toe; struct adapter *sc = toep->tp_tod->tod_softc; tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; if (G_TCPOPT_TSTAMP(tcpopt)) { tp->t_flags |= TF_RCVD_TSTMP; tp->t_flags |= TF_REQ_TSTMP; /* forcibly set */ tp->ts_recent = 0; /* XXX */ tp->ts_recent_age = tcp_ts_getticks(); tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; } if (G_TCPOPT_SACK(tcpopt)) tp->t_flags |= TF_SACK_PERMIT; else tp->t_flags &= ~TF_SACK_PERMIT; if (G_TCPOPT_WSCALE_OK(tcpopt)) tp->t_flags |= TF_RCVD_SCALE; if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt); } } /* * The ISS and IRS are from after the exchange of SYNs and are off by 1. 
*/ void make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs, uint16_t cpl_tcpopt) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; long bufsize; uint32_t iss = be32toh(cpl_iss) - 1; /* true ISS */ uint32_t irs = be32toh(cpl_irs) - 1; /* true IRS */ uint16_t tcpopt = be16toh(cpl_tcpopt); INP_WLOCK_ASSERT(inp); tp->t_state = TCPS_ESTABLISHED; tp->t_starttime = ticks; TCPSTAT_INC(tcps_connects); CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state], toep->tp_tid, toep, inp); tp->irs = irs; tcp_rcvseqinit(tp); tp->rcv_wnd = toep->tp_rx_credits << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; /* * If we were unable to send all rx credits via opt0, save the remainder * in rx_credits so that they can be handed over with the next credit * update. */ SOCKBUF_LOCK(&so->so_rcv); bufsize = select_rcv_wnd(so); SOCKBUF_UNLOCK(&so->so_rcv); toep->tp_rx_credits = bufsize - tp->rcv_wnd; tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; tp->snd_nxt = iss + 1; tp->snd_max = iss + 1; assign_rxopt(tp, tcpopt); soisconnected(so); } /* * Fill in the right TID for CPL messages waiting in the out-of-order queue * and send them to the TOE. */ static void fixup_and_send_ofo(struct toepcb *toep) { struct mbuf *m; struct toedev *tod = toep->tp_tod; struct adapter *sc = tod->tod_softc; struct inpcb *inp = toep->tp_inp; unsigned int tid = toep->tp_tid; inp_lock_assert(inp); while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { struct ofld_hdr *oh = mtod(m, void *); /* * A variety of messages can be waiting but the fields we'll * be touching are common to all so any message type will do. */ struct cpl_close_con_req *p = (void *)(oh + 1); p->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); t3_offload_tx(sc, m); } } /* * Process a CPL_ACT_ESTABLISH message. */ static int do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_act_establish *req = mtod(m, void *); unsigned int tid = GET_TID(req); unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); struct toepcb *toep = lookup_atid(&td->tid_maps, atid); struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct socket *so; CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(&td->tid_maps, atid); INP_WLOCK(inp); tp = intotcpcb(inp); KASSERT(toep->tp_qset == qs->idx, ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx)); KASSERT(toep->tp_tid == atid, ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid)); toep->tp_tid = tid; insert_tid(td, toep, tid); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_reset(toep); goto done; } KASSERT(tp->t_state == TCPS_SYN_SENT, ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state)); so = inp->inp_socket; make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt); /* * Now that we finally have a TID send any CPL messages that we had to * defer for lack of a TID. */ if (mbufq_len(&toep->out_of_order_queue)) fixup_and_send_ofo(toep); done: INP_WUNLOCK(inp); m_freem(m); return (0); } /* * Process an acknowledgment of WR completion. Advance snd_una and send the * next batch of work requests from the write queue. 
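The accounting performed by wr_ack() below can be summarized: each pending work request on the wr_list owns a descriptor count (G_HDR_NDESC) and a payload length (plen); acknowledged descriptor credits retire whole WRs front to back, and the summed plen is what gets dropped from the socket send buffer. The real handler additionally asserts that credits arrive on WR boundaries. A stand-alone model of that loop (illustrative only):

struct pending_wr {
	int	ndesc;	/* descriptors consumed, G_HDR_NDESC() in the real code */
	int	plen;	/* payload bytes carried by the WR */
};

static int
retire_wrs(struct pending_wr *wrs, int nwrs, int credits)
{
	int i, bytes = 0;

	for (i = 0; i < nwrs && credits >= wrs[i].ndesc; i++) {
		credits -= wrs[i].ndesc;
		bytes += wrs[i].plen;
	}
	return (bytes);	/* amount to sbdrop() from the send buffer */
}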
*/ static void wr_ack(struct toepcb *toep, struct mbuf *m) { struct inpcb *inp = toep->tp_inp; struct tcpcb *tp; struct cpl_wr_ack *hdr = mtod(m, void *); struct socket *so; unsigned int credits = ntohs(hdr->credits); u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; struct sockbuf *snd; struct mbuf *p; struct ofld_hdr *oh; inp_wlock(inp); tp = intotcpcb(inp); so = inp->inp_socket; toep->tp_wr_avail += credits; if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; while (credits) { p = peek_wr(toep); if (__predict_false(!p)) { CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, " "tid %u, state %u, wr_avail %u", __func__, credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); log(LOG_ERR, "%u WR_ACK credits for TID %u with " "nothing pending, state %u wr_avail=%u\n", credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } oh = mtod(p, struct ofld_hdr *); KASSERT(credits >= G_HDR_NDESC(oh->flags), ("%s: partial credits? %d %d", __func__, credits, G_HDR_NDESC(oh->flags))); dequeue_wr(toep); credits -= G_HDR_NDESC(oh->flags); bytes += oh->plen; if (oh->flags & F_HDR_SGL) sglist_free(oh->sgl); m_freem(p); } if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) goto out_free; if (tp->snd_una != snd_una) { tp->snd_una = snd_una; tp->ts_recent_age = tcp_ts_getticks(); if (tp->snd_una == tp->snd_nxt) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } snd = so_sockbuf_snd(so); if (bytes) { SOCKBUF_LOCK(snd); sbdrop_locked(snd, bytes); so_sowwakeup_locked(so); } if (snd->sb_sndptroff < sbused(snd)) t3_push_frames(so, 0); out_free: inp_wunlock(tp->t_inpcb); m_freem(m); } /* * Handler for TX_DATA_ACK CPL messages. */ static int do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_wr_ack *hdr = mtod(m, void *); unsigned int tid = GET_TID(hdr); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); /* XXX bad race */ if (toep) wr_ack(toep, m); return (0); } void t3_init_cpl_io(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack); t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl); t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl); } #endif Index: head/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h =================================================================== --- head/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h (revision 278976) +++ head/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h (revision 278977) @@ -1,95 +1,95 @@ /*- * Copyright (c) 2007-2009, Chelsio Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Neither the name of the Chelsio Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef CXGB_TOEPCB_H_ #define CXGB_TOEPCB_H_ #include #include -#include +#include #define TP_DATASENT (1 << 0) #define TP_TX_WAIT_IDLE (1 << 1) #define TP_FIN_SENT (1 << 2) #define TP_ABORT_RPL_PENDING (1 << 3) #define TP_ABORT_SHUTDOWN (1 << 4) #define TP_ABORT_RPL_RCVD (1 << 5) #define TP_ABORT_REQ_RCVD (1 << 6) #define TP_ATTACHED (1 << 7) #define TP_CPL_DONE (1 << 8) #define TP_IS_A_SYNQ_ENTRY (1 << 9) #define TP_ABORT_RPL_SENT (1 << 10) #define TP_SEND_FIN (1 << 11) #define TP_SYNQE_EXPANDED (1 << 12) struct toepcb { TAILQ_ENTRY(toepcb) link; /* toep_list */ int tp_flags; struct toedev *tp_tod; struct l2t_entry *tp_l2t; int tp_tid; int tp_wr_max; int tp_wr_avail; int tp_wr_unacked; int tp_delack_mode; int tp_ulp_mode; int tp_qset; int tp_enqueued; int tp_rx_credits; struct inpcb *tp_inp; struct mbuf *tp_m_last; - struct mbuf_head wr_list; - struct mbuf_head out_of_order_queue; + struct mbufq wr_list; + struct mbufq out_of_order_queue; }; static inline void reset_wr_list(struct toepcb *toep) { - mbufq_init(&toep->wr_list); + mbufq_init(&toep->wr_list, INT_MAX); /* XXX: sane limit needed */ } static inline void enqueue_wr(struct toepcb *toep, struct mbuf *m) { - mbufq_tail(&toep->wr_list, m); + (void )mbufq_enqueue(&toep->wr_list, m); } static inline struct mbuf * peek_wr(const struct toepcb *toep) { - return (mbufq_peek(&toep->wr_list)); + return (mbufq_first(&toep->wr_list)); } static inline struct mbuf * dequeue_wr(struct toepcb *toep) { return (mbufq_dequeue(&toep->wr_list)); } #endif Index: head/sys/dev/xen/netfront/mbufq.h =================================================================== --- head/sys/dev/xen/netfront/mbufq.h (revision 278976) +++ head/sys/dev/xen/netfront/mbufq.h (nonexistent) @@ -1,123 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -$FreeBSD$ - -***************************************************************************/ - -#ifndef CXGB_MBUFQ_H_ -#define CXGB_MBUFQ_H_ - -struct mbuf_head { - struct mbuf *head; - struct mbuf *tail; - uint32_t qlen; - uint32_t qsize; - struct mtx lock; -}; - -static __inline void -mbufq_init(struct mbuf_head *l) -{ - l->head = l->tail = NULL; - l->qlen = l->qsize = 0; -} - -static __inline int -mbufq_empty(struct mbuf_head *l) -{ - return (l->head == NULL); -} - -static __inline int -mbufq_len(struct mbuf_head *l) -{ - return (l->qlen); -} - -static __inline int -mbufq_size(struct mbuf_head *l) -{ - return (l->qsize); -} - -static __inline int -mbufq_head_size(struct mbuf_head *l) -{ - return (l->head ? l->head->m_pkthdr.len : 0); -} - -static __inline void -mbufq_tail(struct mbuf_head *l, struct mbuf *m) -{ - l->qlen++; - if (l->head == NULL) - l->head = m; - else - l->tail->m_nextpkt = m; - l->tail = m; - l->qsize += m->m_pkthdr.len; -} - -static __inline struct mbuf * -mbufq_dequeue(struct mbuf_head *l) -{ - struct mbuf *m; - - m = l->head; - if (m) { - if (m == l->tail) - l->head = l->tail = NULL; - else - l->head = m->m_nextpkt; - m->m_nextpkt = NULL; - l->qlen--; - l->qsize -= m->m_pkthdr.len; - } - - return (m); -} - -static __inline struct mbuf * -mbufq_peek(struct mbuf_head *l) -{ - return (l->head); -} - -static __inline void -mbufq_append(struct mbuf_head *a, struct mbuf_head *b) -{ - if (a->tail) - a->tail->m_nextpkt = b->head; - if (b->tail) - a->tail = b->tail; - a->qlen += b->qlen; - a->qsize += b->qsize; - - -} -#endif /* CXGB_MBUFQ_H_ */ Property changes on: head/sys/dev/xen/netfront/mbufq.h ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: head/sys/dev/xen/netfront/netfront.c =================================================================== --- head/sys/dev/xen/netfront/netfront.c (revision 278976) +++ head/sys/dev/xen/netfront/netfront.c (revision 278977) @@ -1,2232 +1,2231 @@ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 700000 #include #include #endif #include #include #include /* for DELAY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include - #include "xenbus_if.h" /* Features supported by all backends. TSO and LRO can be negotiated */ #define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP) #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) #if __FreeBSD_version >= 700000 /* * Should the driver do LRO on the RX end * this can be toggled on the fly, but the * interface must be reset (down/up) for it * to take effect. */ static int xn_enable_lro = 1; TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro); #else #define IFCAP_TSO4 0 #define CSUM_TSO 0 #endif #ifdef CONFIG_XEN static int MODPARM_rx_copy = 0; module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)"); static int MODPARM_rx_flip = 0; module_param_named(rx_flip, MODPARM_rx_flip, bool, 0); MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)"); #else static const int MODPARM_rx_copy = 1; static const int MODPARM_rx_flip = 0; #endif /** * \brief The maximum allowed data fragments in a single transmit * request. * * This limit is imposed by the backend driver. We assume here that * we are dealing with a Linux driver domain and have set our limit * to mirror the Linux MAX_SKB_FRAGS constant. 
*/ #define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2) #define RX_COPY_THRESHOLD 256 #define net_ratelimit() 0 struct netfront_info; struct netfront_rx_info; static void xn_txeof(struct netfront_info *); static void xn_rxeof(struct netfront_info *); static void network_alloc_rx_buffers(struct netfront_info *); static void xn_tick_locked(struct netfront_info *); static void xn_tick(void *); static void xn_intr(void *); static inline int xn_count_frags(struct mbuf *m); static int xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head); static void xn_start_locked(struct ifnet *); static void xn_start(struct ifnet *); static int xn_ioctl(struct ifnet *, u_long, caddr_t); static void xn_ifinit_locked(struct netfront_info *); static void xn_ifinit(void *); static void xn_stop(struct netfront_info *); static void xn_query_features(struct netfront_info *np); static int xn_configure_features(struct netfront_info *np); #ifdef notyet static void xn_watchdog(struct ifnet *); #endif #ifdef notyet static void netfront_closing(device_t dev); #endif static void netif_free(struct netfront_info *info); static int netfront_detach(device_t dev); static int talk_to_backend(device_t dev, struct netfront_info *info); static int create_netdev(device_t dev); static void netif_disconnect_backend(struct netfront_info *info); static int setup_device(device_t dev, struct netfront_info *info); static void free_ring(int *ref, void *ring_ptr_ref); static int xn_ifmedia_upd(struct ifnet *ifp); static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); /* Xenolinux helper functions */ int network_connect(struct netfront_info *); static void xn_free_rx_ring(struct netfront_info *); static void xn_free_tx_ring(struct netfront_info *); static int xennet_get_responses(struct netfront_info *np, struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons, struct mbuf **list, int *pages_flipped_p); #define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT) #define INVALID_P2M_ENTRY (~0UL) /* * Mbuf pointers. We need these to keep track of the virtual addresses * of our mbuf chains since we can only convert from virtual to physical, * not the other way around. The size must track the free index arrays. 
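For the common 4 KB page size, MAX_TX_REQ_FRAGS above works out to 65536 / 4096 + 2 = 18 fragments per transmit request. A compile-time restatement of that arithmetic (illustrative only, and only meaningful for 4 KB pages):

#if PAGE_SIZE == 4096
CTASSERT(MAX_TX_REQ_FRAGS == 18);
#endif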
*/ struct xn_chain_data { struct mbuf *xn_tx_chain[NET_TX_RING_SIZE+1]; int xn_tx_chain_cnt; struct mbuf *xn_rx_chain[NET_RX_RING_SIZE+1]; }; struct net_device_stats { u_long rx_packets; /* total packets received */ u_long tx_packets; /* total packets transmitted */ u_long rx_bytes; /* total bytes received */ u_long tx_bytes; /* total bytes transmitted */ u_long rx_errors; /* bad packets received */ u_long tx_errors; /* packet transmit problems */ u_long rx_dropped; /* no space in linux buffers */ u_long tx_dropped; /* no space available in linux */ u_long multicast; /* multicast packets received */ u_long collisions; /* detailed rx_errors: */ u_long rx_length_errors; u_long rx_over_errors; /* receiver ring buff overflow */ u_long rx_crc_errors; /* recved pkt with crc error */ u_long rx_frame_errors; /* recv'd frame alignment error */ u_long rx_fifo_errors; /* recv'r fifo overrun */ u_long rx_missed_errors; /* receiver missed packet */ /* detailed tx_errors */ u_long tx_aborted_errors; u_long tx_carrier_errors; u_long tx_fifo_errors; u_long tx_heartbeat_errors; u_long tx_window_errors; /* for cslip etc */ u_long rx_compressed; u_long tx_compressed; }; struct netfront_info { struct ifnet *xn_ifp; #if __FreeBSD_version >= 700000 struct lro_ctrl xn_lro; #endif struct net_device_stats stats; u_int tx_full; netif_tx_front_ring_t tx; netif_rx_front_ring_t rx; struct mtx tx_lock; struct mtx rx_lock; struct mtx sc_lock; xen_intr_handle_t xen_intr_handle; u_int copying_receiver; u_int carrier; u_int maxfrags; /* Receive-ring batched refills. */ #define RX_MIN_TARGET 32 #define RX_MAX_TARGET NET_RX_RING_SIZE int rx_min_target; int rx_max_target; int rx_target; grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; grant_ref_t gref_rx_head; grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1]; device_t xbdev; int tx_ring_ref; int rx_ring_ref; uint8_t mac[ETHER_ADDR_LEN]; struct xn_chain_data xn_cdata; /* mbufs */ - struct mbuf_head xn_rx_batch; /* head of the batch queue */ + struct mbufq xn_rx_batch; /* batch queue */ int xn_if_flags; struct callout xn_stat_ch; u_long rx_pfn_array[NET_RX_RING_SIZE]; multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; mmu_update_t rx_mmu[NET_RX_RING_SIZE]; struct ifmedia sc_media; }; #define rx_mbufs xn_cdata.xn_rx_chain #define tx_mbufs xn_cdata.xn_tx_chain #define XN_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->tx_lock, #_name"_tx", "network transmit lock", MTX_DEF); \ mtx_init(&(_sc)->rx_lock, #_name"_rx", "network receive lock", MTX_DEF); \ mtx_init(&(_sc)->sc_lock, #_name"_sc", "netfront softc lock", MTX_DEF) #define XN_RX_LOCK(_sc) mtx_lock(&(_sc)->rx_lock) #define XN_RX_UNLOCK(_sc) mtx_unlock(&(_sc)->rx_lock) #define XN_TX_LOCK(_sc) mtx_lock(&(_sc)->tx_lock) #define XN_TX_UNLOCK(_sc) mtx_unlock(&(_sc)->tx_lock) #define XN_LOCK(_sc) mtx_lock(&(_sc)->sc_lock); #define XN_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_lock); #define XN_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_lock, MA_OWNED); #define XN_RX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->rx_lock, MA_OWNED); #define XN_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_lock, MA_OWNED); #define XN_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->rx_lock); \ mtx_destroy(&(_sc)->tx_lock); \ mtx_destroy(&(_sc)->sc_lock); struct netfront_rx_info { struct netif_rx_response rx; struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; }; #define netfront_carrier_on(netif) ((netif)->carrier = 1) #define netfront_carrier_off(netif) ((netif)->carrier = 0) #define netfront_carrier_ok(netif) ((netif)->carrier) /* Access macros for acquiring freeing 
slots in xn_free_{tx,rx}_idxs[]. */ static inline void add_id_to_freelist(struct mbuf **list, uintptr_t id) { KASSERT(id != 0, ("%s: the head item (0) must always be free.", __func__)); list[id] = list[0]; list[0] = (struct mbuf *)id; } static inline unsigned short get_id_from_freelist(struct mbuf **list) { uintptr_t id; id = (uintptr_t)list[0]; KASSERT(id != 0, ("%s: the head item (0) must always remain free.", __func__)); list[0] = list[id]; return (id); } static inline int xennet_rxidx(RING_IDX idx) { return idx & (NET_RX_RING_SIZE - 1); } static inline struct mbuf * xennet_get_rx_mbuf(struct netfront_info *np, RING_IDX ri) { int i = xennet_rxidx(ri); struct mbuf *m; m = np->rx_mbufs[i]; np->rx_mbufs[i] = NULL; return (m); } static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri) { int i = xennet_rxidx(ri); grant_ref_t ref = np->grant_rx_ref[i]; KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n")); np->grant_rx_ref[i] = GRANT_REF_INVALID; return ref; } #define IPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #ifdef INVARIANTS #define WPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #else #define WPRINTK(fmt, args...) #endif #ifdef DEBUG #define DPRINTK(fmt, args...) \ printf("[XEN] %s: " fmt, __func__, ##args) #else #define DPRINTK(fmt, args...) #endif /** * Read the 'mac' node at the given device's node in the store, and parse that * as colon-separated octets, placing result the given mac array. mac must be * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h). * Return 0 on success, or errno on error. */ static int xen_net_read_mac(device_t dev, uint8_t mac[]) { int error, i; char *s, *e, *macstr; const char *path; path = xenbus_get_node(dev); error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr); if (error == ENOENT) { /* * Deal with missing mac XenStore nodes on devices with * HVM emulation (the 'ioemu' configuration attribute) * enabled. * * The HVM emulator may execute in a stub device model * domain which lacks the permission, only given to Dom0, * to update the guest's XenStore tree. For this reason, * the HVM emulator doesn't even attempt to write the * front-side mac node, even when operating in Dom0. * However, there should always be a mac listed in the * backend tree. Fallback to this version if our query * of the front side XenStore location doesn't find * anything. */ path = xenbus_get_otherend_path(dev); error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr); } if (error != 0) { xenbus_dev_fatal(dev, error, "parsing %s/mac", path); return (error); } s = macstr; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac[i] = strtoul(s, &e, 16); if (s == e || (e[0] != ':' && e[0] != 0)) { free(macstr, M_XENBUS); return (ENOENT); } s = &e[1]; } free(macstr, M_XENBUS); return (0); } /** * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffers for communication with the backend, and * inform the backend of the appropriate details for those. Switch to * Connected state. 
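xen_net_read_mac() above parses the XenStore "mac" node as colon-separated hex octets with strtoul(). A standalone sketch of just that parsing loop, with a hard-coded string in place of xs_read() and a hypothetical parse_mac() name, for illustration only:

#include <stdio.h>
#include <stdlib.h>

#define ETHER_ADDR_LEN  6

/* Parse "aa:bb:cc:dd:ee:ff" into mac[]; 0 on success, -1 on malformed input. */
static int
parse_mac(const char *macstr, unsigned char mac[ETHER_ADDR_LEN])
{
        const char *s = macstr;
        char *e;
        int i;

        for (i = 0; i < ETHER_ADDR_LEN; i++) {
                mac[i] = strtoul(s, &e, 16);
                if (s == e || (e[0] != ':' && e[0] != '\0'))
                        return (-1);
                s = e + 1;
        }
        return (0);
}

int
main(void)
{
        unsigned char mac[ETHER_ADDR_LEN];

        if (parse_mac("00:16:3e:5e:6c:00", mac) == 0)
                printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
                    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        return (0);
}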
*/ static int netfront_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vif")) { device_set_desc(dev, "Virtual Network Interface"); return (0); } return (ENXIO); } static int netfront_attach(device_t dev) { int err; err = create_netdev(dev); if (err) { xenbus_dev_fatal(dev, err, "creating netdev"); return (err); } #if __FreeBSD_version >= 700000 SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_lro", CTLFLAG_RW, &xn_enable_lro, 0, "Large Receive Offload"); #endif return (0); } static int netfront_suspend(device_t dev) { struct netfront_info *info = device_get_softc(dev); XN_RX_LOCK(info); XN_TX_LOCK(info); netfront_carrier_off(info); XN_TX_UNLOCK(info); XN_RX_UNLOCK(info); return (0); } /** * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our netif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the * rest of the kernel. */ static int netfront_resume(device_t dev) { struct netfront_info *info = device_get_softc(dev); netif_disconnect_backend(info); return (0); } /* Common code used when first setting up, and when resuming. */ static int talk_to_backend(device_t dev, struct netfront_info *info) { const char *message; struct xs_transaction xst; const char *node = xenbus_get_node(dev); int err; err = xen_net_read_mac(dev, info->mac); if (err) { xenbus_dev_fatal(dev, err, "parsing %s/mac", node); goto out; } /* Create shared ring, alloc event channel. */ err = setup_device(dev, info); if (err) goto out; again: err = xs_transaction_start(&xst); if (err) { xenbus_dev_fatal(dev, err, "starting transaction"); goto destroy_ring; } err = xs_printf(xst, node, "tx-ring-ref","%u", info->tx_ring_ref); if (err) { message = "writing tx ring-ref"; goto abort_transaction; } err = xs_printf(xst, node, "rx-ring-ref","%u", info->rx_ring_ref); if (err) { message = "writing rx ring-ref"; goto abort_transaction; } err = xs_printf(xst, node, "event-channel", "%u", xen_intr_port(info->xen_intr_handle)); if (err) { message = "writing event-channel"; goto abort_transaction; } err = xs_printf(xst, node, "request-rx-copy", "%u", info->copying_receiver); if (err) { message = "writing request-rx-copy"; goto abort_transaction; } err = xs_printf(xst, node, "feature-rx-notify", "%d", 1); if (err) { message = "writing feature-rx-notify"; goto abort_transaction; } err = xs_printf(xst, node, "feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } #if __FreeBSD_version >= 700000 err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; goto abort_transaction; } #endif err = xs_transaction_end(xst, 0); if (err) { if (err == EAGAIN) goto again; xenbus_dev_fatal(dev, err, "completing transaction"); goto destroy_ring; } return 0; abort_transaction: xs_transaction_end(xst, 1); xenbus_dev_fatal(dev, err, "%s", message); destroy_ring: netif_free(info); out: return err; } static int setup_device(device_t dev, struct netfront_info *info) { netif_tx_sring_t *txs; netif_rx_sring_t *rxs; int error; struct ifnet *ifp; ifp = info->xn_ifp; info->tx_ring_ref = GRANT_REF_INVALID; info->rx_ring_ref = GRANT_REF_INVALID; info->rx.sring = NULL; info->tx.sring = NULL; txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); if (!txs) { error = ENOMEM; xenbus_dev_fatal(dev, error, "allocating tx ring page"); goto fail; } SHARED_RING_INIT(txs); FRONT_RING_INIT(&info->tx, 
txs, PAGE_SIZE); error = xenbus_grant_ring(dev, virt_to_mfn(txs), &info->tx_ring_ref); if (error) goto fail; rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); if (!rxs) { error = ENOMEM; xenbus_dev_fatal(dev, error, "allocating rx ring page"); goto fail; } SHARED_RING_INIT(rxs); FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); error = xenbus_grant_ring(dev, virt_to_mfn(rxs), &info->rx_ring_ref); if (error) goto fail; error = xen_intr_alloc_and_bind_local_port(dev, xenbus_get_otherend_id(dev), /*filter*/NULL, xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE | INTR_ENTROPY, &info->xen_intr_handle); if (error) { xenbus_dev_fatal(dev, error, "xen_intr_alloc_and_bind_local_port failed"); goto fail; } return (0); fail: netif_free(info); return (error); } #ifdef INET /** * If this interface has an ipv4 address, send an arp for it. This * helps to get the network going again after migrating hosts. */ static void netfront_send_fake_arp(device_t dev, struct netfront_info *info) { struct ifnet *ifp; struct ifaddr *ifa; ifp = info->xn_ifp; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { arp_ifinit(ifp, ifa); } } } #endif /** * Callback received when the backend's state changes. */ static void netfront_backend_changed(device_t dev, XenbusState newstate) { struct netfront_info *sc = device_get_softc(dev); DPRINTK("newstate=%d\n", newstate); switch (newstate) { case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateConnected: case XenbusStateUnknown: case XenbusStateClosed: case XenbusStateReconfigured: case XenbusStateReconfiguring: break; case XenbusStateInitWait: if (xenbus_get_state(dev) != XenbusStateInitialising) break; if (network_connect(sc) != 0) break; xenbus_set_state(dev, XenbusStateConnected); #ifdef INET netfront_send_fake_arp(dev, sc); #endif break; case XenbusStateClosing: xenbus_set_state(dev, XenbusStateClosed); break; } } static void xn_free_rx_ring(struct netfront_info *sc) { #if 0 int i; for (i = 0; i < NET_RX_RING_SIZE; i++) { if (sc->xn_cdata.rx_mbufs[i] != NULL) { m_freem(sc->rx_mbufs[i]); sc->rx_mbufs[i] = NULL; } } sc->rx.rsp_cons = 0; sc->xn_rx_if->req_prod = 0; sc->xn_rx_if->event = sc->rx.rsp_cons ; #endif } static void xn_free_tx_ring(struct netfront_info *sc) { #if 0 int i; for (i = 0; i < NET_TX_RING_SIZE; i++) { if (sc->tx_mbufs[i] != NULL) { m_freem(sc->tx_mbufs[i]); sc->xn_cdata.xn_tx_chain[i] = NULL; } } return; #endif } /** * \brief Verify that there is sufficient space in the Tx ring * buffer for a maximally sized request to be enqueued. * * A transmit request requires a transmit descriptor for each packet * fragment, plus up to 2 entries for "options" (e.g. TSO). */ static inline int xn_tx_slot_available(struct netfront_info *np) { return (RING_FREE_REQUESTS(&np->tx) > (MAX_TX_REQ_FRAGS + 2)); } static void netif_release_tx_bufs(struct netfront_info *np) { int i; for (i = 1; i <= NET_TX_RING_SIZE; i++) { struct mbuf *m; m = np->tx_mbufs[i]; /* * We assume that no kernel addresses are * less than NET_TX_RING_SIZE. Any entry * in the table that is below this number * must be an index from free-list tracking. 
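netif_release_tx_bufs() relies on tx_mbufs[] doubling as a free list: a slot whose value is no larger than NET_TX_RING_SIZE holds an encoded "next free index", anything larger is a real mbuf pointer. A userland sketch of that encoding with illustrative names and a tiny ring size (put_id/get_id mirror add_id_to_freelist/get_id_from_freelist above):

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8     /* illustrative; the driver uses NET_TX_RING_SIZE */

/* Slot 0 is the list head; free slots store the next free index cast to a pointer. */
static void *slots[RING_SIZE + 1];

static void
put_id(uintptr_t id)
{
        slots[id] = slots[0];
        slots[0] = (void *)id;
}

static uintptr_t
get_id(void)
{
        uintptr_t id = (uintptr_t)slots[0];

        slots[0] = slots[id];
        return (id);
}

int
main(void)
{
        uintptr_t a, b, i;

        /* Chain every slot onto the free list, as create_netdev() does. */
        for (i = 0; i <= RING_SIZE; i++)
                slots[i] = (void *)(i + 1);
        slots[RING_SIZE] = NULL;

        a = get_id();                   /* 1 */
        b = get_id();                   /* 2 */
        slots[a] = &slots[a];           /* a busy slot holds a real pointer */
        put_id(b);                      /* return slot 2 to the free list */
        printf("allocated %zu and %zu; next free is %zu\n",
            (size_t)a, (size_t)b, (size_t)(uintptr_t)slots[0]);
        return (0);
}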
*/ if (((uintptr_t)m) <= NET_TX_RING_SIZE) continue; gnttab_end_foreign_access_ref(np->grant_tx_ref[i]); gnttab_release_grant_reference(&np->gref_tx_head, np->grant_tx_ref[i]); np->grant_tx_ref[i] = GRANT_REF_INVALID; add_id_to_freelist(np->tx_mbufs, i); np->xn_cdata.xn_tx_chain_cnt--; if (np->xn_cdata.xn_tx_chain_cnt < 0) { panic("%s: tx_chain_cnt must be >= 0", __func__); } m_free(m); } } static void network_alloc_rx_buffers(struct netfront_info *sc) { int otherend_id = xenbus_get_otherend_id(sc->xbdev); unsigned short id; struct mbuf *m_new; int i, batch_target, notify; RING_IDX req_prod; struct xen_memory_reservation reservation; grant_ref_t ref; int nr_flips; netif_rx_request_t *req; vm_offset_t vaddr; u_long pfn; req_prod = sc->rx.req_prod_pvt; if (__predict_false(sc->carrier == 0)) return; /* * Allocate mbufs greedily, even though we batch updates to the * receive ring. This creates a less bursty demand on the memory * allocator, and so should reduce the chance of failed allocation * requests both for ourself and for other kernel subsystems. * * Here we attempt to maintain rx_target buffers in flight, counting * buffers that we have yet to process in the receive ring. */ batch_target = sc->rx_target - (req_prod - sc->rx.rsp_cons); for (i = mbufq_len(&sc->xn_rx_batch); i < batch_target; i++) { MGETHDR(m_new, M_NOWAIT, MT_DATA); if (m_new == NULL) { printf("%s: MGETHDR failed\n", __func__); goto no_mbuf; } if (m_cljget(m_new, M_NOWAIT, MJUMPAGESIZE) == NULL) { printf("%s: m_cljget failed\n", __func__); m_freem(m_new); no_mbuf: if (i != 0) goto refill; /* * XXX set timer */ break; } m_new->m_len = m_new->m_pkthdr.len = MJUMPAGESIZE; /* queue the mbufs allocated */ - mbufq_tail(&sc->xn_rx_batch, m_new); + (void )mbufq_enqueue(&sc->xn_rx_batch, m_new); } /* * If we've allocated at least half of our target number of entries, * submit them to the backend - we have enough to make the overhead * of submission worthwhile. Otherwise wait for more mbufs and * request entries to become available. */ if (i < (sc->rx_target/2)) { if (req_prod >sc->rx.sring->req_prod) goto push; return; } /* * Double floating fill target if we risked having the backend * run out of empty buffers for receive traffic. We define "running * low" as having less than a fourth of our target buffers free * at the time we refilled the queue. */ if ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) { sc->rx_target *= 2; if (sc->rx_target > sc->rx_max_target) sc->rx_target = sc->rx_max_target; } refill: for (nr_flips = i = 0; ; i++) { if ((m_new = mbufq_dequeue(&sc->xn_rx_batch)) == NULL) break; m_new->m_ext.ext_arg1 = (vm_paddr_t *)(uintptr_t)( vtophys(m_new->m_ext.ext_buf) >> PAGE_SHIFT); id = xennet_rxidx(req_prod + i); KASSERT(sc->rx_mbufs[id] == NULL, ("non-NULL xm_rx_chain")); sc->rx_mbufs[id] = m_new; ref = gnttab_claim_grant_reference(&sc->gref_rx_head); KASSERT(ref != GNTTAB_LIST_END, ("reserved grant references exhuasted")); sc->grant_rx_ref[id] = ref; vaddr = mtod(m_new, vm_offset_t); pfn = vtophys(vaddr) >> PAGE_SHIFT; req = RING_GET_REQUEST(&sc->rx, req_prod + i); if (sc->copying_receiver == 0) { gnttab_grant_foreign_transfer_ref(ref, otherend_id, pfn); sc->rx_pfn_array[nr_flips] = PFNTOMFN(pfn); if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Remove this page before passing * back to Xen. 
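The '-'/'+' hunks here (and in xn_rxeof() further down) are the point of this change: the driver-private mbuf_head queue becomes the generic mbufq. A minimal sketch of how the calls correspond; the prototypes are inferred from their use in this diff (mbufq_init() now takes a queue limit and mbufq_enqueue() can fail), so treat it as illustration rather than driver code:

/*
 * Illustrative fragment only.  Assumes the mbufq API used elsewhere in
 * this commit: mbufq_init(q, maxlen), mbufq_enqueue(q, m) -> 0 or errno,
 * mbufq_dequeue(q), mbufq_drain(q).
 */
static void
example_mbufq_usage(struct mbuf *m)
{
        struct mbufq q;
        struct mbuf *n;

        mbufq_init(&q, INT_MAX);                /* old mbufq_init() took no limit */

        if (mbufq_enqueue(&q, m) != 0)          /* old mbufq_tail() could not fail */
                m_freem(m);                     /* queue full: caller disposes of m */

        n = mbufq_dequeue(&q);                  /* unchanged: NULL when empty */
        if (n != NULL)
                m_freem(n);

        mbufq_drain(&q);                        /* replaces the dequeue+m_freem loop */
}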
*/ set_phys_to_machine(pfn, INVALID_P2M_ENTRY); MULTI_update_va_mapping(&sc->rx_mcl[i], vaddr, 0, 0); } nr_flips++; } else { gnttab_grant_foreign_access_ref(ref, otherend_id, PFNTOMFN(pfn), 0); } req->id = id; req->gref = ref; sc->rx_pfn_array[i] = vtomach(mtod(m_new,vm_offset_t)) >> PAGE_SHIFT; } KASSERT(i, ("no mbufs processed")); /* should have returned earlier */ KASSERT(mbufq_len(&sc->xn_rx_batch) == 0, ("not all mbufs processed")); /* * We may have allocated buffers which have entries outstanding * in the page * update queue -- make sure we flush those first! */ PT_UPDATES_FLUSH(); if (nr_flips != 0) { #ifdef notyet /* Tell the ballon driver what is going on. */ balloon_update_driver_allowance(i); #endif set_xen_guest_handle(reservation.extent_start, sc->rx_pfn_array); reservation.nr_extents = i; reservation.extent_order = 0; reservation.address_bits = 0; reservation.domid = DOMID_SELF; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* After all PTEs have been zapped, flush the TLB. */ sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; /* Give away a batch of pages. */ sc->rx_mcl[i].op = __HYPERVISOR_memory_op; sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation; sc->rx_mcl[i].args[1] = (u_long)&reservation; /* Zap PTEs and give away pages in one big multicall. */ (void)HYPERVISOR_multicall(sc->rx_mcl, i+1); if (__predict_false(sc->rx_mcl[i].result != i || HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != i)) panic("%s: unable to reduce memory " "reservation\n", __func__); } } else { wmb(); } /* Above is a suitable barrier to ensure backend will see requests. */ sc->rx.req_prod_pvt = req_prod + i; push: RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->rx, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); } static void xn_rxeof(struct netfront_info *np) { struct ifnet *ifp; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) struct lro_ctrl *lro = &np->xn_lro; struct lro_entry *queued; #endif struct netfront_rx_info rinfo; struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; RING_IDX i, rp; multicall_entry_t *mcl; struct mbuf *m; - struct mbuf_head rxq, errq; + struct mbufq rxq, errq; int err, pages_flipped = 0, work_to_do; do { XN_RX_LOCK_ASSERT(np); if (!netfront_carrier_ok(np)) return; - mbufq_init(&errq); - mbufq_init(&rxq); + /* XXX: there should be some sane limit. */ + mbufq_init(&errq, INT_MAX); + mbufq_init(&rxq, INT_MAX); ifp = np->xn_ifp; rp = np->rx.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ i = np->rx.rsp_cons; while ((i != rp)) { memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx)); memset(extras, 0, sizeof(rinfo.extras)); m = NULL; err = xennet_get_responses(np, &rinfo, rp, &i, &m, &pages_flipped); if (__predict_false(err)) { if (m) - mbufq_tail(&errq, m); + (void )mbufq_enqueue(&errq, m); np->stats.rx_errors++; continue; } m->m_pkthdr.rcvif = ifp; if ( rx->flags & NETRXF_data_validated ) { /* Tell the stack the checksums are okay */ /* * XXX this isn't necessarily the case - need to add * check */ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } np->stats.rx_packets++; np->stats.rx_bytes += m->m_pkthdr.len; - mbufq_tail(&rxq, m); + (void )mbufq_enqueue(&rxq, m); np->rx.rsp_cons = i; } if (pages_flipped) { /* Some pages are no longer absent... 
*/ #ifdef notyet balloon_update_driver_allowance(-pages_flipped); #endif /* Do all the remapping work, and M->P updates, in one big * hypercall. */ if (!!xen_feature(XENFEAT_auto_translated_physmap)) { mcl = np->rx_mcl + pages_flipped; mcl->op = __HYPERVISOR_mmu_update; mcl->args[0] = (u_long)np->rx_mmu; mcl->args[1] = pages_flipped; mcl->args[2] = 0; mcl->args[3] = DOMID_SELF; (void)HYPERVISOR_multicall(np->rx_mcl, pages_flipped + 1); } } - while ((m = mbufq_dequeue(&errq))) - m_freem(m); + mbufq_drain(&errq); /* * Process all the mbufs after the remapping is complete. * Break the mbuf chain first though. */ while ((m = mbufq_dequeue(&rxq)) != NULL) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); /* * Do we really need to drop the rx lock? */ XN_RX_UNLOCK(np); #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) /* Use LRO if possible */ if ((ifp->if_capenable & IFCAP_LRO) == 0 || lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) { /* * If LRO fails, pass up to the stack * directly. */ (*ifp->if_input)(ifp, m); } #else (*ifp->if_input)(ifp, m); #endif XN_RX_LOCK(np); } np->rx.rsp_cons = i; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) /* * Flush any outstanding LRO work */ while (!SLIST_EMPTY(&lro->lro_active)) { queued = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } #endif #if 0 /* If we get a callback with very few responses, reduce fill target. */ /* NB. Note exponential increase, linear decrease. */ if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > ((3*np->rx_target) / 4)) && (--np->rx_target < np->rx_min_target)) np->rx_target = np->rx_min_target; #endif network_alloc_rx_buffers(np); RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, work_to_do); } while (work_to_do); } static void xn_txeof(struct netfront_info *np) { RING_IDX i, prod; unsigned short id; struct ifnet *ifp; netif_tx_response_t *txr; struct mbuf *m; XN_TX_LOCK_ASSERT(np); if (!netfront_carrier_ok(np)) return; ifp = np->xn_ifp; do { prod = np->tx.sring->rsp_prod; rmb(); /* Ensure we see responses up to 'rp'. */ for (i = np->tx.rsp_cons; i != prod; i++) { txr = RING_GET_RESPONSE(&np->tx, i); if (txr->status == NETIF_RSP_NULL) continue; if (txr->status != NETIF_RSP_OKAY) { printf("%s: WARNING: response is %d!\n", __func__, txr->status); } id = txr->id; m = np->tx_mbufs[id]; KASSERT(m != NULL, ("mbuf not found in xn_tx_chain")); KASSERT((uintptr_t)m > NET_TX_RING_SIZE, ("mbuf already on the free list, but we're " "trying to free it again!")); M_ASSERTVALID(m); /* * Increment packet count if this is the last * mbuf of the chain. */ if (!m->m_next) if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (__predict_false(gnttab_query_foreign_access( np->grant_tx_ref[id]) != 0)) { panic("%s: grant id %u still in use by the " "backend", __func__, id); } gnttab_end_foreign_access_ref( np->grant_tx_ref[id]); gnttab_release_grant_reference( &np->gref_tx_head, np->grant_tx_ref[id]); np->grant_tx_ref[id] = GRANT_REF_INVALID; np->tx_mbufs[id] = NULL; add_id_to_freelist(np->tx_mbufs, id); np->xn_cdata.xn_tx_chain_cnt--; m_free(m); /* Only mark the queue active if we've freed up at least one slot to try */ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } np->tx.rsp_cons = prod; /* * Set a new event, then check for race with update of * tx_cons. Note that it is essential to schedule a * callback, no matter how few buffers are pending. 
Even if * there is space in the transmit ring, higher layers may * be blocked because too much data is outstanding: in such * cases notification from Xen is likely to be the only kick * that we'll get. */ np->tx.sring->rsp_event = prod + ((np->tx.sring->req_prod - prod) >> 1) + 1; mb(); } while (prod != np->tx.sring->rsp_prod); if (np->tx_full && ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) { np->tx_full = 0; #if 0 if (np->user_state == UST_OPEN) netif_wake_queue(dev); #endif } } static void xn_intr(void *xsc) { struct netfront_info *np = xsc; struct ifnet *ifp = np->xn_ifp; #if 0 if (!(np->rx.rsp_cons != np->rx.sring->rsp_prod && likely(netfront_carrier_ok(np)) && ifp->if_drv_flags & IFF_DRV_RUNNING)) return; #endif if (RING_HAS_UNCONSUMED_RESPONSES(&np->tx)) { XN_TX_LOCK(np); xn_txeof(np); XN_TX_UNLOCK(np); } XN_RX_LOCK(np); xn_rxeof(np); XN_RX_UNLOCK(np); if (ifp->if_drv_flags & IFF_DRV_RUNNING && !IFQ_DRV_IS_EMPTY(&ifp->if_snd)) xn_start(ifp); } static void xennet_move_rx_slot(struct netfront_info *np, struct mbuf *m, grant_ref_t ref) { int new = xennet_rxidx(np->rx.req_prod_pvt); KASSERT(np->rx_mbufs[new] == NULL, ("rx_mbufs != NULL")); np->rx_mbufs[new] = m; np->grant_rx_ref[new] = ref; RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new; RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref; np->rx.req_prod_pvt++; } static int xennet_get_extras(struct netfront_info *np, struct netif_extra_info *extras, RING_IDX rp, RING_IDX *cons) { struct netif_extra_info *extra; int err = 0; do { struct mbuf *m; grant_ref_t ref; if (__predict_false(*cons + 1 == rp)) { #if 0 if (net_ratelimit()) WPRINTK("Missing extra info\n"); #endif err = EINVAL; break; } extra = (struct netif_extra_info *) RING_GET_RESPONSE(&np->rx, ++(*cons)); if (__predict_false(!extra->type || extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { #if 0 if (net_ratelimit()) WPRINTK("Invalid extra type: %d\n", extra->type); #endif err = EINVAL; } else { memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); } m = xennet_get_rx_mbuf(np, *cons); ref = xennet_get_rx_ref(np, *cons); xennet_move_rx_slot(np, m, ref); } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); return err; } static int xennet_get_responses(struct netfront_info *np, struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons, struct mbuf **list, int *pages_flipped_p) { int pages_flipped = *pages_flipped_p; struct mmu_update *mmu; struct multicall_entry *mcl; struct netif_rx_response *rx = &rinfo->rx; struct netif_extra_info *extras = rinfo->extras; struct mbuf *m, *m0, *m_prev; grant_ref_t ref = xennet_get_rx_ref(np, *cons); RING_IDX ref_cons = *cons; int frags = 1; int err = 0; u_long ret; m0 = m = m_prev = xennet_get_rx_mbuf(np, *cons); if (rx->flags & NETRXF_extra_info) { err = xennet_get_extras(np, extras, rp, cons); } if (m0 != NULL) { m0->m_pkthdr.len = 0; m0->m_next = NULL; } for (;;) { u_long mfn; #if 0 DPRINTK("rx->status=%hd rx->offset=%hu frags=%u\n", rx->status, rx->offset, frags); #endif if (__predict_false(rx->status < 0 || rx->offset + rx->status > PAGE_SIZE)) { #if 0 if (net_ratelimit()) WPRINTK("rx->offset: %x, size: %u\n", rx->offset, rx->status); #endif xennet_move_rx_slot(np, m, ref); if (m0 == m) m0 = NULL; m = NULL; err = EINVAL; goto next_skip_queue; } /* * This definitely indicates a bug, either in this driver or in * the backend driver. In future this should flag the bad * situation to the system controller to reboot the backed. 
*/ if (ref == GRANT_REF_INVALID) { #if 0 if (net_ratelimit()) WPRINTK("Bad rx response id %d.\n", rx->id); #endif printf("%s: Bad rx response id %d.\n", __func__,rx->id); err = EINVAL; goto next; } if (!np->copying_receiver) { /* Memory pressure, insufficient buffer * headroom, ... */ if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) { WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n", rx->id, rx->status); xennet_move_rx_slot(np, m, ref); err = ENOMEM; goto next; } if (!xen_feature( XENFEAT_auto_translated_physmap)) { /* Remap the page. */ void *vaddr = mtod(m, void *); uint32_t pfn; mcl = np->rx_mcl + pages_flipped; mmu = np->rx_mmu + pages_flipped; MULTI_update_va_mapping(mcl, (u_long)vaddr, (((vm_paddr_t)mfn) << PAGE_SHIFT) | PG_RW | PG_V | PG_M | PG_A, 0); pfn = (uintptr_t)m->m_ext.ext_arg1; mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; mmu->val = pfn; set_phys_to_machine(pfn, mfn); } pages_flipped++; } else { ret = gnttab_end_foreign_access_ref(ref); KASSERT(ret, ("ret != 0")); } gnttab_release_grant_reference(&np->gref_rx_head, ref); next: if (m == NULL) break; m->m_len = rx->status; m->m_data += rx->offset; m0->m_pkthdr.len += rx->status; next_skip_queue: if (!(rx->flags & NETRXF_more_data)) break; if (*cons + frags == rp) { if (net_ratelimit()) WPRINTK("Need more frags\n"); err = ENOENT; printf("%s: cons %u frags %u rp %u, not enough frags\n", __func__, *cons, frags, rp); break; } /* * Note that m can be NULL, if rx->status < 0 or if * rx->offset + rx->status > PAGE_SIZE above. */ m_prev = m; rx = RING_GET_RESPONSE(&np->rx, *cons + frags); m = xennet_get_rx_mbuf(np, *cons + frags); /* * m_prev == NULL can happen if rx->status < 0 or if * rx->offset + * rx->status > PAGE_SIZE above. */ if (m_prev != NULL) m_prev->m_next = m; /* * m0 can be NULL if rx->status < 0 or if * rx->offset + * rx->status > PAGE_SIZE above. */ if (m0 == NULL) m0 = m; m->m_next = NULL; ref = xennet_get_rx_ref(np, *cons + frags); ref_cons = *cons + frags; frags++; } *list = m0; *cons += frags; *pages_flipped_p = pages_flipped; return (err); } static void xn_tick_locked(struct netfront_info *sc) { XN_RX_LOCK_ASSERT(sc); callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); /* XXX placeholder for printing debug information */ } static void xn_tick(void *xsc) { struct netfront_info *sc; sc = xsc; XN_RX_LOCK(sc); xn_tick_locked(sc); XN_RX_UNLOCK(sc); } /** * \brief Count the number of fragments in an mbuf chain. * * Surprisingly, there isn't an M* macro for this. */ static inline int xn_count_frags(struct mbuf *m) { int nfrags; for (nfrags = 0; m != NULL; m = m->m_next) nfrags++; return (nfrags); } /** * Given an mbuf chain, make sure we have enough room and then push * it onto the transmit ring. */ static int xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head) { struct ifnet *ifp; struct mbuf *m; u_int nfrags; netif_extra_info_t *extra; int otherend_id; ifp = sc->xn_ifp; /** * Defragment the mbuf if necessary. */ nfrags = xn_count_frags(m_head); /* * Check to see whether this request is longer than netback * can handle, and try to defrag it. */ /** * It is a bit lame, but the netback driver in Linux can't * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of * the Linux network stack. */ if (nfrags > sc->maxfrags) { m = m_defrag(m_head, M_NOWAIT); if (!m) { /* * Defrag failed, so free the mbuf and * therefore drop the packet. 
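xn_assemble_tx_request() has to keep each chain within the backend's per-packet fragment limit, so it counts mbufs and falls back to m_defrag() before giving up. A hedged sketch of that count-then-defrag pattern in isolation (example_fit_chain and max_frags are illustrative names):

/* Illustrative only; mirrors the count-then-defrag pattern in this driver. */
static int
example_fit_chain(struct mbuf **m_headp, u_int max_frags)
{
        struct mbuf *m;
        u_int nfrags = 0;

        for (m = *m_headp; m != NULL; m = m->m_next)
                nfrags++;

        if (nfrags > max_frags) {
                m = m_defrag(*m_headp, M_NOWAIT);       /* may still be too long */
                if (m == NULL) {
                        m_freem(*m_headp);              /* drop the packet */
                        return (EMSGSIZE);
                }
                *m_headp = m;
        }
        return (0);
}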
*/ m_freem(m_head); return (EMSGSIZE); } m_head = m; } /* Determine how many fragments now exist */ nfrags = xn_count_frags(m_head); /* * Check to see whether the defragmented packet has too many * segments for the Linux netback driver. */ /** * The FreeBSD TCP stack, with TSO enabled, can produce a chain * of mbufs longer than Linux can handle. Make sure we don't * pass a too-long chain over to the other side by dropping the * packet. It doesn't look like there is currently a way to * tell the TCP stack to generate a shorter chain of packets. */ if (nfrags > MAX_TX_REQ_FRAGS) { #ifdef DEBUG printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback " "won't be able to handle it, dropping\n", __func__, nfrags, MAX_TX_REQ_FRAGS); #endif m_freem(m_head); return (EMSGSIZE); } /* * This check should be redundant. We've already verified that we * have enough slots in the ring to handle a packet of maximum * size, and that our packet is less than the maximum size. Keep * it in here as an assert for now just to make certain that * xn_tx_chain_cnt is accurate. */ KASSERT((sc->xn_cdata.xn_tx_chain_cnt + nfrags) <= NET_TX_RING_SIZE, ("%s: xn_tx_chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE " "(%d)!", __func__, (int) sc->xn_cdata.xn_tx_chain_cnt, (int) nfrags, (int) NET_TX_RING_SIZE)); /* * Start packing the mbufs in this chain into * the fragment pointers. Stop when we run out * of fragments or hit the end of the mbuf chain. */ m = m_head; extra = NULL; otherend_id = xenbus_get_otherend_id(sc->xbdev); for (m = m_head; m; m = m->m_next) { netif_tx_request_t *tx; uintptr_t id; grant_ref_t ref; u_long mfn; /* XXX Wrong type? */ tx = RING_GET_REQUEST(&sc->tx, sc->tx.req_prod_pvt); id = get_id_from_freelist(sc->tx_mbufs); if (id == 0) panic("%s: was allocated the freelist head!\n", __func__); sc->xn_cdata.xn_tx_chain_cnt++; if (sc->xn_cdata.xn_tx_chain_cnt > NET_TX_RING_SIZE) panic("%s: tx_chain_cnt must be <= NET_TX_RING_SIZE\n", __func__); sc->tx_mbufs[id] = m; tx->id = id; ref = gnttab_claim_grant_reference(&sc->gref_tx_head); KASSERT((short)ref >= 0, ("Negative ref")); mfn = virt_to_mfn(mtod(m, vm_offset_t)); gnttab_grant_foreign_access_ref(ref, otherend_id, mfn, GNTMAP_readonly); tx->gref = sc->grant_tx_ref[id] = ref; tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); tx->flags = 0; if (m == m_head) { /* * The first fragment has the entire packet * size, subsequent fragments have just the * fragment size. The backend works out the * true size of the first fragment by * subtracting the sizes of the other * fragments. */ tx->size = m->m_pkthdr.len; /* * The first fragment contains the checksum flags * and is optionally followed by extra data for * TSO etc. */ /** * CSUM_TSO requires checksum offloading. * Some versions of FreeBSD fail to * set CSUM_TCP in the CSUM_TSO case, * so we have to test for CSUM_TSO * explicitly. 
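That comment is why the transmit path below tests CSUM_TSO in addition to CSUM_DELAY_DATA: a TSO packet still needs the checksum bits on the Xen request even when the stack set only CSUM_TSO. A one-line helper showing the test (the helper name is hypothetical):

/* Illustrative helper: does this packet need NETTXF_csum_blank set? */
static inline int
example_needs_csum_offload(const struct mbuf *m)
{
        /*
         * CSUM_DELAY_DATA is CSUM_TCP|CSUM_UDP; some stacks set only
         * CSUM_TSO for TSO packets, so test it explicitly as well.
         */
        return ((m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) != 0);
}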
*/ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) { tx->flags |= (NETTXF_csum_blank | NETTXF_data_validated); } #if __FreeBSD_version >= 700000 if (m->m_pkthdr.csum_flags & CSUM_TSO) { struct netif_extra_info *gso = (struct netif_extra_info *) RING_GET_REQUEST(&sc->tx, ++sc->tx.req_prod_pvt); tx->flags |= NETTXF_extra_info; gso->u.gso.size = m->m_pkthdr.tso_segsz; gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; gso->u.gso.pad = 0; gso->u.gso.features = 0; gso->type = XEN_NETIF_EXTRA_TYPE_GSO; gso->flags = 0; } #endif } else { tx->size = m->m_len; } if (m->m_next) tx->flags |= NETTXF_more_data; sc->tx.req_prod_pvt++; } BPF_MTAP(ifp, m_head); sc->stats.tx_bytes += m_head->m_pkthdr.len; sc->stats.tx_packets++; return (0); } static void xn_start_locked(struct ifnet *ifp) { struct netfront_info *sc; struct mbuf *m_head; int notify; sc = ifp->if_softc; if (!netfront_carrier_ok(sc)) return; /* * While we have enough transmit slots available for at least one * maximum-sized packet, pull mbufs off the queue and put them on * the transmit ring. */ while (xn_tx_slot_available(sc)) { IF_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (xn_assemble_tx_request(sc, m_head) != 0) break; } RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->tx, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); if (RING_FULL(&sc->tx)) { sc->tx_full = 1; #if 0 netif_stop_queue(dev); #endif } } static void xn_start(struct ifnet *ifp) { struct netfront_info *sc; sc = ifp->if_softc; XN_TX_LOCK(sc); xn_start_locked(ifp); XN_TX_UNLOCK(sc); } /* equivalent of network_open() in Linux */ static void xn_ifinit_locked(struct netfront_info *sc) { struct ifnet *ifp; XN_LOCK_ASSERT(sc); ifp = sc->xn_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; xn_stop(sc); network_alloc_rx_buffers(sc); sc->rx.sring->rsp_event = sc->rx.rsp_cons + 1; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); } static void xn_ifinit(void *xsc) { struct netfront_info *sc = xsc; XN_LOCK(sc); xn_ifinit_locked(sc); XN_UNLOCK(sc); } static int xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct netfront_info *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; #ifdef INET struct ifaddr *ifa = (struct ifaddr *)data; #endif int mask, error = 0; switch(cmd) { case SIOCSIFADDR: #ifdef INET XN_LOCK(sc); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) xn_ifinit_locked(sc); arp_ifinit(ifp, ifa); XN_UNLOCK(sc); } else { XN_UNLOCK(sc); #endif error = ether_ioctl(ifp, cmd, data); #ifdef INET } #endif break; case SIOCSIFMTU: /* XXX can we alter the MTU on a VN ?*/ #ifdef notyet if (ifr->ifr_mtu > XN_JUMBO_MTU) error = EINVAL; else #endif { ifp->if_mtu = ifr->ifr_mtu; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; xn_ifinit(sc); } break; case SIOCSIFFLAGS: XN_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. 
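The driver currently answers any SIOCSIFFLAGS change with a full re-init; the "#ifdef notyet" block that follows sketches a promisc-only fast path. For illustration, detecting that nothing but IFF_PROMISC changed could look like the hypothetical helper below, using the saved xn_if_flags value:

/* Illustrative only: true when exactly the PROMISC bit differs. */
static int
example_only_promisc_changed(struct ifnet *ifp, int saved_if_flags)
{
        return (((ifp->if_flags ^ saved_if_flags) & ~IFF_PROMISC) == 0 &&
            (ifp->if_flags ^ saved_if_flags) != 0);
}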
*/ #ifdef notyet /* No promiscuous mode with Xen */ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->xn_if_flags & IFF_PROMISC)) { XN_SETBIT(sc, XN_RX_MODE, XN_RXMODE_RX_PROMISC); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->xn_if_flags & IFF_PROMISC) { XN_CLRBIT(sc, XN_RX_MODE, XN_RXMODE_RX_PROMISC); } else #endif xn_ifinit_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { xn_stop(sc); } } sc->xn_if_flags = ifp->if_flags; XN_UNLOCK(sc); error = 0; break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO); } else { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); } } if (mask & IFCAP_RXCSUM) { ifp->if_capenable ^= IFCAP_RXCSUM; } #if __FreeBSD_version >= 700000 if (mask & IFCAP_TSO4) { if (IFCAP_TSO4 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO4; ifp->if_hwassist &= ~CSUM_TSO; } else if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } else { IPRINTK("Xen requires tx checksum offload" " be enabled to use TSO\n"); error = EINVAL; } } if (mask & IFCAP_LRO) { ifp->if_capenable ^= IFCAP_LRO; } #endif error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet if (ifp->if_drv_flags & IFF_DRV_RUNNING) { XN_LOCK(sc); xn_setmulti(sc); XN_UNLOCK(sc); error = 0; } #endif /* FALLTHROUGH */ case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); } return (error); } static void xn_stop(struct netfront_info *sc) { struct ifnet *ifp; XN_LOCK_ASSERT(sc); ifp = sc->xn_ifp; callout_stop(&sc->xn_stat_ch); xn_free_rx_ring(sc); xn_free_tx_ring(sc); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); } /* START of Xenolinux helper functions adapted to FreeBSD */ int network_connect(struct netfront_info *np) { int i, requeue_idx, error; grant_ref_t ref; netif_rx_request_t *req; u_int feature_rx_copy, feature_rx_flip; error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-copy", NULL, "%u", &feature_rx_copy); if (error) feature_rx_copy = 0; error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-flip", NULL, "%u", &feature_rx_flip); if (error) feature_rx_flip = 1; /* * Copy packets on receive path if: * (a) This was requested by user, and the backend supports it; or * (b) Flipping was requested, but this is unsupported by the backend. */ np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || (MODPARM_rx_flip && !feature_rx_flip)); /* Recovery procedure: */ error = talk_to_backend(np->xbdev, np); if (error) return (error); /* Step 1: Reinitialise variables. */ xn_query_features(np); xn_configure_features(np); netif_release_tx_bufs(np); /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. 
*/ for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { struct mbuf *m; u_long pfn; if (np->rx_mbufs[i] == NULL) continue; m = np->rx_mbufs[requeue_idx] = xennet_get_rx_mbuf(np, i); ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); req = RING_GET_REQUEST(&np->rx, requeue_idx); pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT; if (!np->copying_receiver) { gnttab_grant_foreign_transfer_ref(ref, xenbus_get_otherend_id(np->xbdev), pfn); } else { gnttab_grant_foreign_access_ref(ref, xenbus_get_otherend_id(np->xbdev), PFNTOMFN(pfn), 0); } req->gref = ref; req->id = requeue_idx; requeue_idx++; } np->rx.req_prod_pvt = requeue_idx; /* Step 3: All public and private state should now be sane. Get * ready to start sending and receiving packets and give the driver * domain a kick because we've probably just requeued some * packets. */ netfront_carrier_on(np); xen_intr_signal(np->xen_intr_handle); XN_TX_LOCK(np); xn_txeof(np); XN_TX_UNLOCK(np); network_alloc_rx_buffers(np); return (0); } static void xn_query_features(struct netfront_info *np) { int val; device_printf(np->xbdev, "backend features:"); if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-sg", NULL, "%d", &val) < 0) val = 0; np->maxfrags = 1; if (val) { np->maxfrags = MAX_TX_REQ_FRAGS; printf(" feature-sg"); } if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-gso-tcpv4", NULL, "%d", &val) < 0) val = 0; np->xn_ifp->if_capabilities &= ~(IFCAP_TSO4|IFCAP_LRO); if (val) { np->xn_ifp->if_capabilities |= IFCAP_TSO4|IFCAP_LRO; printf(" feature-gso-tcp4"); } printf("\n"); } static int xn_configure_features(struct netfront_info *np) { int err; err = 0; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) if ((np->xn_ifp->if_capenable & IFCAP_LRO) != 0) tcp_lro_free(&np->xn_lro); #endif np->xn_ifp->if_capenable = np->xn_ifp->if_capabilities & ~(IFCAP_LRO|IFCAP_TSO4); np->xn_ifp->if_hwassist &= ~CSUM_TSO; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) if (xn_enable_lro && (np->xn_ifp->if_capabilities & IFCAP_LRO) != 0) { err = tcp_lro_init(&np->xn_lro); if (err) { device_printf(np->xbdev, "LRO initialization failed\n"); } else { np->xn_lro.ifp = np->xn_ifp; np->xn_ifp->if_capenable |= IFCAP_LRO; } } if ((np->xn_ifp->if_capabilities & IFCAP_TSO4) != 0) { np->xn_ifp->if_capenable |= IFCAP_TSO4; np->xn_ifp->if_hwassist |= CSUM_TSO; } #endif return (err); } /** * Create a network device. * @param dev Newbus device representing this virtual NIC. */ int create_netdev(device_t dev) { int i; struct netfront_info *np; int err; struct ifnet *ifp; np = device_get_softc(dev); np->xbdev = dev; XN_LOCK_INIT(np, xennetif); ifmedia_init(&np->sc_media, 0, xn_ifmedia_upd, xn_ifmedia_sts); ifmedia_add(&np->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); ifmedia_set(&np->sc_media, IFM_ETHER|IFM_MANUAL); np->rx_target = RX_MIN_TARGET; np->rx_min_target = RX_MIN_TARGET; np->rx_max_target = RX_MAX_TARGET; /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
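create_netdev() just below pre-allocates one grant reference per ring slot, and the data paths then claim, grant, revoke, and release them. A condensed sketch of that lifecycle built only from the gnttab calls already used in this file (the function name, domid and mfn arguments are placeholders, and error handling is trimmed):

/* Illustrative lifecycle of one grant reference, per the calls used in this driver. */
static void
example_grant_cycle(domid_t otherend, unsigned long mfn)
{
        grant_ref_t head, ref;

        if (gnttab_alloc_grant_references(1, &head) != 0)    /* pool for the ring */
                return;

        ref = gnttab_claim_grant_reference(&head);           /* take one from the pool */
        gnttab_grant_foreign_access_ref(ref, otherend, mfn, GNTMAP_readonly);

        /* ... backend accesses the page via the grant ... */

        gnttab_end_foreign_access_ref(ref);                  /* revoke access */
        gnttab_release_grant_reference(&head, ref);          /* back to the pool */
        gnttab_free_grant_references(head);                  /* tear down the pool */
}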
*/ for (i = 0; i <= NET_TX_RING_SIZE; i++) { np->tx_mbufs[i] = (void *) ((u_long) i+1); np->grant_tx_ref[i] = GRANT_REF_INVALID; } np->tx_mbufs[NET_TX_RING_SIZE] = (void *)0; for (i = 0; i <= NET_RX_RING_SIZE; i++) { np->rx_mbufs[i] = NULL; np->grant_rx_ref[i] = GRANT_REF_INVALID; } /* A grant for every tx ring slot */ if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, &np->gref_tx_head) != 0) { IPRINTK("#### netfront can't alloc tx grant refs\n"); err = ENOMEM; goto exit; } /* A grant for every rx ring slot */ if (gnttab_alloc_grant_references(RX_MAX_TARGET, &np->gref_rx_head) != 0) { WPRINTK("#### netfront can't alloc rx grant refs\n"); gnttab_free_grant_references(np->gref_tx_head); err = ENOMEM; goto exit; } err = xen_net_read_mac(dev, np->mac); if (err) goto out; /* Set up ifnet structure */ ifp = np->xn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = np; if_initname(ifp, "xn", device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = xn_ioctl; ifp->if_output = ether_output; ifp->if_start = xn_start; #ifdef notyet ifp->if_watchdog = xn_watchdog; #endif ifp->if_init = xn_ifinit; ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; ifp->if_hwassist = XN_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = MAX_TX_REQ_FRAGS; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; ether_ifattach(ifp, np->mac); callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE); netfront_carrier_off(np); return (0); exit: gnttab_free_grant_references(np->gref_tx_head); out: return (err); } /** * Handle the change of state of the backend to Closing. We must delete our * device-layer structures now, to ensure that writes are flushed through to * the backend. Once is this done, we can switch to Closed in * acknowledgement. */ #if 0 static void netfront_closing(device_t dev) { #if 0 struct netfront_info *info = dev->dev_driver_data; DPRINTK("netfront_closing: %s removed\n", dev->nodename); close_netdev(info); #endif xenbus_switch_state(dev, XenbusStateClosed); } #endif static int netfront_detach(device_t dev) { struct netfront_info *info = device_get_softc(dev); DPRINTK("%s\n", xenbus_get_node(dev)); netif_free(info); return 0; } static void netif_free(struct netfront_info *info) { XN_LOCK(info); xn_stop(info); XN_UNLOCK(info); callout_drain(&info->xn_stat_ch); netif_disconnect_backend(info); if (info->xn_ifp != NULL) { ether_ifdetach(info->xn_ifp); if_free(info->xn_ifp); info->xn_ifp = NULL; } ifmedia_removeall(&info->sc_media); } static void netif_disconnect_backend(struct netfront_info *info) { XN_RX_LOCK(info); XN_TX_LOCK(info); netfront_carrier_off(info); XN_TX_UNLOCK(info); XN_RX_UNLOCK(info); free_ring(&info->tx_ring_ref, &info->tx.sring); free_ring(&info->rx_ring_ref, &info->rx.sring); xen_intr_unbind(&info->xen_intr_handle); } static void free_ring(int *ref, void *ring_ptr_ref) { void **ring_ptr_ptr = ring_ptr_ref; if (*ref != GRANT_REF_INVALID) { /* This API frees the associated storage. 
*/ gnttab_end_foreign_access(*ref, *ring_ptr_ptr); *ref = GRANT_REF_INVALID; } *ring_ptr_ptr = NULL; } static int xn_ifmedia_upd(struct ifnet *ifp) { return (0); } static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER|IFM_MANUAL; } /* ** Driver registration ** */ static device_method_t netfront_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netfront_probe), DEVMETHOD(device_attach, netfront_attach), DEVMETHOD(device_detach, netfront_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, netfront_suspend), DEVMETHOD(device_resume, netfront_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed), DEVMETHOD_END }; static driver_t netfront_driver = { "xn", netfront_methods, sizeof(struct netfront_info), }; devclass_t netfront_devclass; DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, NULL, NULL); Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 278976) +++ head/sys/sys/mbuf.h (revision 278977) @@ -1,1203 +1,1299 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include #include #ifdef WITNESS #include #endif #endif /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. 
Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. */ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint64_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t cosqos; /* class/quality of service */ uint8_t rsstype; /* hash type */ uint8_t l2hlen; /* layer 2 header length */ uint8_t l3hlen; /* layer 3 header length */ uint8_t l4hlen; /* layer 4 header length */ uint8_t l5hlen; /* layer 5 header length */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] #define tso_segsz PH_per.sixteen[1] #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] #define pkt_tcphdr PH_loc.ptr /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct m_ext { volatile u_int *ext_cnt; /* pointer to ref count info */ caddr_t ext_buf; /* start of buffer */ uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ void (*ext_free) /* free routine if not the usual */ (struct mbuf *, void *, void *); void *ext_arg1; /* optional argument pointer */ void *ext_arg2; /* optional argument pointer */ }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. 
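mtod() and mtodo(), defined above, are the standard typed views into an mbuf's data region. A small sketch (example_hdr and example_parse are illustrative, and the data is assumed contiguous; real code would m_pullup() first):

/* Illustrative only: typed access into an mbuf's data region. */
struct example_hdr {
        uint8_t type;
        uint8_t len;
};

static void
example_parse(struct mbuf *m)
{
        struct example_hdr *h1, *h2;

        /* Assumes both headers are contiguous in this mbuf. */
        h1 = mtod(m, struct example_hdr *);             /* at the start of the data */
        h2 = mtodo(m, sizeof(struct example_hdr));      /* same, at a byte offset */
        (void)h1;
        (void)h2;
}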
*/ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. */ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { struct pkthdr m_pkthdr; /* M_PKTHDR set */ union { struct m_ext m_ext; /* M_EXT set */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-12] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_UNUSED_8 0x00000100 /* --available-- */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */ #define M_PROTO3 0x00004000 /* protocol-specific */ #define M_PROTO4 0x00008000 /* protocol-specific */ #define M_PROTO5 0x00010000 /* protocol-specific */ #define M_PROTO6 0x00020000 /* protocol-specific */ #define M_PROTO7 0x00040000 /* protocol-specific */ #define M_PROTO8 0x00080000 /* protocol-specific */ #define M_PROTO9 0x00100000 /* protocol-specific */ #define M_PROTO10 0x00200000 /* protocol-specific */ #define M_PROTO11 0x00400000 /* protocol-specific */ #define M_PROTO12 0x00800000 /* protocol-specific */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG| \ M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ "\27M_PROTO11\30M_PROTO12" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. 
These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE can be set by non-RSS cards or configurations * that provide an opaque flow identifier, allowing for ordering and * distribution without explicit affinity. */ /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 1 /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 2 /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 3 /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 4 /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX 5 /* IPv6 2-tuple + ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX 6 /* TCPv6 4-tiple + ext hdrs */ /* Non-standard RSS hash types */ #define M_HASHTYPE_RSS_UDP_IPV4 7 /* IPv4 UDP 4-tuple */ #define M_HASHTYPE_RSS_UDP_IPV4_EX 8 /* IPv4 UDP 4-tuple + ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV6 9 /* IPv6 UDP 4-tuple */ #define M_HASHTYPE_RSS_UDP_IPV6_EX 10 /* IPv6 UDP 4-tuple + ext hdrs */ #define M_HASHTYPE_OPAQUE 255 /* ordering, not affinity */ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) /* * COS/QOS class and quality of service tags. * It uses DSCP code points as base. */ #define QOS_DSCP_CS0 0x00 #define QOS_DSCP_DEF QOS_DSCP_CS0 #define QOS_DSCP_CS1 0x20 #define QOS_DSCP_AF11 0x28 #define QOS_DSCP_AF12 0x30 #define QOS_DSCP_AF13 0x38 #define QOS_DSCP_CS2 0x40 #define QOS_DSCP_AF21 0x48 #define QOS_DSCP_AF22 0x50 #define QOS_DSCP_AF23 0x58 #define QOS_DSCP_CS3 0x60 #define QOS_DSCP_AF31 0x68 #define QOS_DSCP_AF32 0x70 #define QOS_DSCP_AF33 0x78 #define QOS_DSCP_CS4 0x80 #define QOS_DSCP_AF41 0x88 #define QOS_DSCP_AF42 0x90 #define QOS_DSCP_AF43 0x98 #define QOS_DSCP_CS5 0xa0 #define QOS_DSCP_EF 0xb8 #define QOS_DSCP_CS6 0xc0 #define QOS_DSCP_CS7 0xe0 /* * External mbuf storage buffer types. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */ #define EXT_JUMBOP 3 /* jumbo cluster 4096 bytes */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference (M_IOVEC) */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. 
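A driver whose hardware supplies an RSS hash records it in m_pkthdr.flowid and with M_HASHTYPE_SET(), as described above, so the stack can keep per-flow ordering. A hedged sketch of tagging an inbound packet (hw_hash and hw_is_tcpv4 stand in for device-specific state):

/* Illustrative only: record a hardware RSS hash on a received packet. */
static void
example_tag_flow(struct mbuf *m, uint32_t hw_hash, int hw_is_tcpv4)
{
        m->m_pkthdr.flowid = hw_hash;
        if (hw_is_tcpv4)
                M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV4);
        else
                M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);   /* ordering only, no affinity */
}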
*/ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_cnt, notyet */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR2 0x020000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR3 0x040000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR4 0x080000 /* for vendor-internal use */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. */ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * External reference/free functions. */ void sf_ext_ref(void *, void *); void sf_ext_free(void *, void *); /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ /* Inbound checksum support where the checksum was verified by hardware. */ #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESED 0x40000000 /* contains merged segments */ /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI" \ "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESED" /* CSUM flags compatibility mappings. 
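On receive, a driver that has verified checksums in hardware sets the CSUM_L*_CALC/VALID pairs above; netfront does the same through the compatibility names defined next. A short illustrative helper (the function name is hypothetical):

/* Illustrative only: mark an inbound packet's checksums as verified in hardware. */
static void
example_rx_csum_ok(struct mbuf *m)
{
        m->m_pkthdr.csum_flags |= CSUM_L3_CALC | CSUM_L3_VALID |
            CSUM_L4_CALC | CSUM_L4_VALID;
        m->m_pkthdr.csum_data = 0xffff;         /* full pseudo-header checksum */
}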
*/ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). */ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; extern uma_zone_t zone_ext_refcnt; void mb_free_ext(struct mbuf *); int m_pkthdr_init(struct mbuf *, int); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associated an external reference counted buffer with an mbuf. 
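 *
 * A hedged usage sketch (the buffer, refcount and free routine names are
 * hypothetical, not part of this header): a driver handing out slices of a
 * DMA region that it reference-counts itself could attach one with
 *
 *	m_extaddref(m, (caddr_t)slice, slice_len, &region->refcount,
 *	    my_region_free, region, NULL);
 *
 * where my_region_free(m, arg1, arg2) drops the reference once the last
 * mbuf using the region is freed.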
*/ static __inline void m_extaddref(struct mbuf *m, caddr_t buf, u_int size, u_int *ref_cnt, void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. */ static __inline int m_init(struct mbuf *m, uma_zone_t zone, int size, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) { if ((error = m_pkthdr_init(m, how)) != 0) return (error); } return (0); } static __inline struct mbuf * m_get(int how, short type) { struct mb_args args; args.flags = 0; args.type = type; return (uma_zalloc_arg(zone_mbuf, &args, how)); } /* * XXX This should be deprecated, very little use. */ static __inline struct mbuf * m_getclr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m != NULL) bzero(m->m_data, MLEN); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mb_args args; args.flags = M_PKTHDR; args.type = type; return (uma_zalloc_arg(zone_mbuf, &args, how)); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mb_args args; args.flags = flags; args.type = type; return (uma_zalloc_arg(zone_pack, &args, how)); } static __inline int m_clget(struct mbuf *m, int how) { if (m->m_flags & M_EXT) printf("%s: %p mbuf already has external storage\n", __func__, m); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 
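 *
 * For instance (a sketch only, not part of this header), a receive ring
 * that pre-allocates bare 9k clusters and attaches them to mbufs later
 * could do:
 *
 *	void *cl = m_cljget(NULL, M_NOWAIT, MJUM9BYTES);
 *	...
 *	m_cljset(m, cl, EXT_JUMBO9);
 *
 * whereas passing a non-NULL mbuf attaches the freshly allocated cluster
 * immediately and the return value may be ignored.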
*/ static __inline void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; if (m && m->m_flags & M_EXT) printf("%s: %p mbuf already has external storage\n", __func__, m); if (m != NULL) m->m_ext.ext_buf = NULL; zone = m_getzone(size); return (uma_zalloc_arg(zone, m, how)); } static __inline void m_cljset(struct mbuf *m, void *cl, int type) { uma_zone_t zone; int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; zone = zone_jumbop; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; zone = zone_jumbo9; break; case EXT_JUMBO16: size = MJUM16BYTES; zone = zone_jumbo16; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = 0; m->m_ext.ext_cnt = uma_find_refcnt(zone, cl); m->m_flags |= M_EXT; } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ (void )m_extadd((m), (caddr_t)(buf), (size), (free), (arg1), (arg2),\ (flags), (type), M_NOWAIT) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ (!(((m)->m_flags & M_EXT)) || \ (*((m)->m_ext.ext_cnt) == 1)) ) \ /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. 
Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { #ifdef INVARIANTS const char *msg = "%s: not a virgin mbuf"; #endif int adjust; KASSERT(m->m_data == M_START(m), (msg, __func__)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * Compute the amount of space available after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) \ (M_WRITABLE(m) ? \ ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 /* Compatibility with 4.3. 
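 *
 * (As an aside on the M_PREPEND() macro above, shown here only as a hedged
 *  sketch: output paths typically invoke it as
 *
 *	M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 *  and must re-test m, because a failed prepend frees the original chain
 *  and sets m to NULL.)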
*/ #define m_copy(m, o, l) m_copym((m), (o), (l), M_NOWAIT) extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ struct uio; void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_extadd(struct mbuf *, caddr_t, u_int, void (*)(struct mbuf *, void *, void *), void *, void *, int, int, int); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, struct mbuf *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * (cookie, type) pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, the value * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find * compatibility shim functions and several tag types are defined below. * Users that do not require compatibility should use a private cookie value * so that packet tag-related definitions can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default memory * alignment implemented by malloc.
To reference private data one can use a * construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members of * struct foo. Otherwise it is necessary to embed struct m_tag within the * private data structure to insure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF (21 | MTAG_PERSISTENT) /* PF/ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #define PACKET_TAG_IPSEC_NAT_T_PORTS 29 /* two uint16_t */ #define PACKET_TAG_ND_OUTGOING 30 /* ND outgoing */ /* Specific cookies and tags. */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. 
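 *
 * A hedged sketch of the embedded-tag pattern described above (the malloc
 * type, cookie, tag type and free routine are hypothetical; error handling
 * is omitted):
 *
 *	struct foo *p = malloc(sizeof(*p), M_MYMODULE, M_NOWAIT);
 *	m_tag_setup(&p->tag, MYMODULE_COOKIE, MYMODULE_TAG_TYPE,
 *	    sizeof(*p) - sizeof(struct m_tag));
 *	p->tag.m_tag_free = my_tag_free;
 *	m_tag_prepend(m, &p->tag);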
*/ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static int inline rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf.")); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf.")); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) #endif /* _KERNEL */ #ifdef MBUF_PROFILING void m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif +struct mbufq { + STAILQ_HEAD(, mbuf) mq_head; + int mq_len; + int mq_maxlen; +}; +static inline void +mbufq_init(struct mbufq *mq, int maxlen) +{ + + STAILQ_INIT(&mq->mq_head); + mq->mq_maxlen = maxlen; + mq->mq_len = 0; +} + +static inline struct mbuf * +mbufq_flush(struct mbufq *mq) +{ + struct mbuf *m; + + m = STAILQ_FIRST(&mq->mq_head); + STAILQ_INIT(&mq->mq_head); + mq->mq_len = 0; + return (m); +} + +static inline void +mbufq_drain(struct mbufq *mq) +{ + struct mbuf *m, *n; + + n = mbufq_flush(mq); + while ((m = n) != NULL) { + n = STAILQ_NEXT(m, m_stailqpkt); + m_freem(m); + } +} + +static inline struct mbuf * +mbufq_first(const struct mbufq *mq) +{ + + return (STAILQ_FIRST(&mq->mq_head)); +} + +static inline struct mbuf * +mbufq_last(const struct mbufq *mq) +{ + + return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); +} + +static inline int +mbufq_full(const struct mbufq *mq) +{ + + return (mq->mq_len >= mq->mq_maxlen); +} + +static inline int +mbufq_len(const struct mbufq *mq) +{ + + return (mq->mq_len); +} + +static inline int +mbufq_enqueue(struct mbufq *mq, struct mbuf *m) +{ + + if (mbufq_full(mq)) + return (ENOBUFS); + STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); + mq->mq_len++; + return (0); +} + +static inline struct mbuf * +mbufq_dequeue(struct mbufq *mq) +{ + struct 
mbuf *m; + + m = STAILQ_FIRST(&mq->mq_head); + if (m) { + STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); + mq->mq_len--; + } + return (m); +} + +static inline void +mbufq_prepend(struct mbufq *mq, struct mbuf *m) +{ + + STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); + mq->mq_len++; +} #endif /* !_SYS_MBUF_H_ */
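/*
 * Usage sketch for the mbufq API above (illustration only; the queue name
 * and hypothetical_transmit() are made up and not part of this header):
 *
 *	struct mbufq txq;
 *	struct mbuf *m;
 *
 *	mbufq_init(&txq, 512);
 *	if (mbufq_enqueue(&txq, m) != 0)
 *		m_freem(m);
 *	while ((m = mbufq_dequeue(&txq)) != NULL)
 *		hypothetical_transmit(m);
 *	mbufq_drain(&txq);
 *
 * mbufq_init() bounds the queue at 512 packets here; mbufq_enqueue() fails
 * with ENOBUFS once mq_len reaches mq_maxlen, so the caller must dispose of
 * the packet itself; mbufq_drain() frees anything still queued.  The queue
 * provides no locking of its own.
 */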