Index: head/sys/dev/cxgb/cxgb_sge.c
===================================================================
--- head/sys/dev/cxgb/cxgb_sge.c	(revision 212369)
+++ head/sys/dev/cxgb/cxgb_sge.c	(revision 212370)
@@ -1,3860 +1,3846 @@
 /**************************************************************************
 
 Copyright (c) 2007-2009, Chelsio Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
  1. Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
 
  2. Neither the name of the Chelsio Corporation nor the names of its
     contributors may be used to endorse or promote products derived from
     this software without specific prior written permission.
  
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 ***************************************************************************/
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/bus_dma.h>
 #include <sys/rman.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 #include <sys/syslog.h>
 #include <sys/socket.h>
 
 #include <net/bpf.h>	
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/tcp.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <cxgb_include.h>
 #include <sys/mvec.h>
 
 int	txq_fills = 0;
 int	multiq_tx_enable = 1;
 
 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
 SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
     "size of per-queue mbuf ring");
 
 static int cxgb_tx_coalesce_force = 0;
 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
     &cxgb_tx_coalesce_force, 0,
     "coalesce small packets into a single work request regardless of ring state");
 
 /*
  * Thresholds, in Tx descriptors, expressed as fractions of the Ethernet Tx
  * queue size.  Every replacement list is fully parenthesized so the macros
  * can be used safely inside larger expressions.
  */
 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE >> 1)
 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE - (TX_ETH_Q_SIZE >> 3))
 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE >> 2)
 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE >> 5)
 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE >> 5)
 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE >> 2)
 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE >> 6)
 
 
 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
     &cxgb_tx_coalesce_enable_start);
 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
     &cxgb_tx_coalesce_enable_start, 0,
     "coalesce enable threshold");
 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
     &cxgb_tx_coalesce_enable_stop, 0,
     "coalesce disable threshold");
 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
     &cxgb_tx_reclaim_threshold, 0,
     "tx cleaning minimum threshold");
 
 /*
  * XXX don't re-enable this until TOE stops assuming
  * we have an m_ext
  */
 static int recycle_enable = 0;
 
 extern int cxgb_use_16k_clusters;
 extern int nmbjumbop;
 extern int nmbjumbo9;
 extern int nmbjumbo16;
 
 #define USE_GTS 0
 
 #define SGE_RX_SM_BUF_SIZE	1536
 #define SGE_RX_DROP_THRES	16
 #define SGE_RX_COPY_THRES	128
 
 /*
  * Period of the Tx buffer reclaim timer.  This timer does not need to run
  * frequently as Tx buffers are usually reclaimed by new Tx packets.
  */
 #define TX_RECLAIM_PERIOD       (hz >> 1)
 
 /* 
  * Values for sge_txq.flags
  */
 enum {
 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
 };
 
 /* A Tx descriptor: a packed array of TX_DESC_FLITS 64-bit words ("flits"). */
 struct tx_desc {
 	uint64_t	flit[TX_DESC_FLITS];
 } __packed;
 
 /*
  * A free-list (Rx buffer) descriptor.  All fields are stored big-endian
  * (they are written with htobe32() by refill_fl() and recycle_rx_buf()).
  */
 struct rx_desc {
 	uint32_t	addr_lo;	/* low 32 bits of the buffer bus address */
 	uint32_t	len_gen;	/* written with V_FLD_GEN1(gen) */
 	uint32_t	gen2;		/* written with V_FLD_GEN2(gen) */
 	uint32_t	addr_hi;	/* high 32 bits of the buffer bus address */
 } __packed;
 
 struct rsp_desc {               /* response queue descriptor */
 	struct rss_header	rss_hdr;
 	uint32_t		flags;
 	uint32_t		len_cq;
 	uint8_t			imm_data[47];
 	uint8_t			intr_gen;
 } __packed;
 
 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
 #define RX_SW_DESC_INUSE        (1 << 3)
 #define TX_SW_DESC_MAPPED       (1 << 4)
 
 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
 
 struct tx_sw_desc {                /* SW state per Tx descriptor */
 	struct mbuf	*m;
 	bus_dmamap_t	map;
 	int		flags;
 };
 
 struct rx_sw_desc {                /* SW state per Rx descriptor */
 	caddr_t		rxsd_cl;
 	struct mbuf	*m;
 	bus_dmamap_t	map;
 	int		flags;
 };
 
 /* Snapshot of Tx queue state produced by txq_prod() for one work request. */
 struct txq_state {
 	unsigned int	compl;	/* bit 5 of the unacked count, shifted to S_WR_COMPL */
 	unsigned int	gen;	/* queue generation before the producer index moved */
 	unsigned int	pidx;	/* producer index before it was advanced */
 };
 
 /* Result of a bus_dmamap_load() request, filled in by refill_fl_cb(). */
 struct refill_fl_cb_arg {
 	int               error;	/* error code passed to the callback */
 	bus_dma_segment_t seg;		/* first (only) segment of the mapping */
 	int               nseg;		/* number of segments reported */
 };
 
 
 /*
  * Maps a number of flits to the number of Tx descriptors that can hold them.
  * The formula is
  *
  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
  *
  * HW allows up to 4 descriptors to be combined into a WR.
  */
 static uint8_t flit_desc_map[] = {
 	0,
 #if SGE_NUM_GENBITS == 1
 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
 #elif SGE_NUM_GENBITS == 2
 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 #else
 # error "SGE_NUM_GENBITS must be 1 or 2"
 #endif
 };
 
 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)	
 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)	
 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)	
 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
 #define	TXQ_RING_DEQUEUE(qs) \
 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
 
 int cxgb_debug = 0;
 
 static void sge_timer_cb(void *arg);
 static void sge_timer_reclaim(void *arg, int ncount);
 static void sge_txq_reclaim_handler(void *arg, int ncount);
 static void cxgb_start_locked(struct sge_qset *qs);
 
 /*
  * Update the per-queue-set "fill" hysteresis flag and report whether
  * packets should be coalesced into a single work request.
  *
  * Returns 1 unconditionally when hw.cxgb.tx_coalesce_force is set,
  * otherwise the adapter's tunq_coalesce value.
  *
  * XXX need to cope with bursty scheduling by looking at a wider
  * window than we are now for determining the need for coalescing
  */
 static __inline uint64_t
 check_pkt_coalesce(struct sge_qset *qs)
 {
 	struct adapter *sc;
 	struct sge_txq *txq;
 	uint8_t *fill;
 
 	if (__predict_false(cxgb_tx_coalesce_force))
 		return (1);
 	txq = &qs->txq[TXQ_ETH];
 	sc = qs->port->adapter;
 	fill = &sc->tunq_fill[qs->idx];
 
 	/*
 	 * The thresholds are run-time writable sysctls; clamp them to
 	 * sane bounds before use.  Each clamp must write back to the
 	 * variable it tested.
 	 */
 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
 	/*
 	 * Once the hardware transmit queue holds at least
 	 * cxgb_tx_coalesce_enable_start descriptors we mark it as
 	 * coalescing - we drop back from coalescing when usage falls to
 	 * cxgb_tx_coalesce_enable_stop or below, the software ring is
 	 * empty and no coalescing is in progress.  The gap between the
 	 * two thresholds provides us with some degree of hysteresis.
 	 */
 	if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
 		*fill = 0;
 	else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
 		*fill = 1;
 
 	return (sc->tunq_coalesce);
 }
 
 /*
  * set_wr_hdr - store the two 32-bit halves of a work request header.
  * @wrp: the work request header to fill in
  * @wr_hi: value for the "hi" header word
  * @wr_lo: value for the "lo" header word
  *
  * On LP64 platforms the two halves are merged into one 64-bit value and
  * written with a single assignment; the byte-order test places wr_hi in
  * the lower-addressed four bytes on both little- and big-endian machines.
  * Elsewhere the halves are written separately, wrh_hi first, with a write
  * barrier between the two stores.
  */
 #ifdef __LP64__
 static void
 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
 {
 	uint64_t wr_hilo;
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 	wr_hilo = wr_hi;
 	wr_hilo |= (((uint64_t)wr_lo)<<32);
 #else
 	wr_hilo = wr_lo;
 	wr_hilo |= (((uint64_t)wr_hi)<<32);
 #endif	
 	wrp->wrh_hilo = wr_hilo;
 }
 #else
 static void
 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
 {
 
 	wrp->wrh_hi = wr_hi;
 	/* NOTE(review): presumably wrh_lo must not become visible before wrh_hi */
 	wmb();
 	wrp->wrh_lo = wr_lo;
 }
 #endif
 
 /* Running totals for the packets accepted into one coalesced batch. */
 struct coalesce_info {
 	int count;
 	int nbytes;
 };
 
 /*
  * Dequeue predicate: decide whether mbuf @m may join the batch described
  * by @arg (a struct coalesce_info).  The first packet is always accepted;
  * later ones must be a single mbuf, keep the batch within 10500 bytes and
  * find fewer than 7 packets already accepted.  Returns 1 and updates the
  * totals on acceptance, 0 otherwise.
  */
 static int
 coalesce_check(struct mbuf *m, void *arg)
 {
 	struct coalesce_info *info = arg;
 
 	if (info->nbytes != 0) {
 		if (info->nbytes + m->m_len > 10500)
 			return (0);
 		if (info->count >= 7)
 			return (0);
 		if (m->m_next != NULL)
 			return (0);
 	}
 
 	info->count += 1;
 	info->nbytes += m->m_len;
 	return (1);
 }
 
 /*
  * Pull the next packet(s) off the queue set's software ring.  Without
  * coalescing a single packet is returned; with coalescing, packets are
  * dequeued for as long as coalesce_check() accepts them and returned as
  * a chain linked through m_nextpkt.
  */
 static struct mbuf *
 cxgb_dequeue(struct sge_qset *qs)
 {
 	struct coalesce_info ci;
 	struct mbuf *first, *last, *pkt;
 
 	if (check_pkt_coalesce(qs) == 0)
 		return TXQ_RING_DEQUEUE(qs);
 
 	ci.count = 0;
 	ci.nbytes = 0;
 	first = NULL;
 	last = NULL;
 	for (;;) {
 		pkt = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
 		if (pkt == NULL)
 			break;
 		if (first == NULL)
 			first = pkt;
 		else
 			last->m_nextpkt = pkt;
 		last = pkt;
 	}
 	if (ci.count > 7)
 		panic("trying to coalesce %d packets in to one WR", ci.count);
 	return (first);
 }
 	
 /**
  *	reclaim_completed_tx - reclaims completed Tx descriptors
  *	@qs: the queue set that owns the Tx queue
  *	@reclaim_min: do nothing unless at least this many descriptors are
  *	    reclaimable
  *	@queue: index into @qs->txq of the Tx queue to reclaim from
  *
  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
  *	and frees the associated buffers if possible.  The queue set lock is
  *	asserted only once the function has decided to do work, i.e. after
  *	the @reclaim_min early return.
  *
  *	Returns the number of descriptors reclaimed, or 0 if fewer than
  *	@reclaim_min were reclaimable.
  */
 static __inline int
 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
 {
 	struct sge_txq *q = &qs->txq[queue];
 	int reclaim = desc_reclaimable(q);
 
 	/* The threshold is a writable sysctl: reset it if it is out of range. */
 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
 
 	if (reclaim < reclaim_min)
 		return (0);
 
 	mtx_assert(&qs->lock, MA_OWNED);
 	if (reclaim > 0) {
 		t3_free_tx_desc(qs, reclaim, queue);
 		q->cleaned += reclaim;
 		q->in_use -= reclaim;
 	}
 	/*
 	 * NOTE(review): the TXQ_ETH "stopped" bit is cleared whichever
 	 * queue index was passed in - confirm this is intended.
 	 */
 	if (isset(&qs->txq_stopped, TXQ_ETH))
                 clrbit(&qs->txq_stopped, TXQ_ETH);
 
 	return (reclaim);
 }
 
 /**
  *	should_restart_tx - are there enough resources to restart a Tx queue?
  *	@q: the Tx queue
  *
  *	Returns non-zero when the descriptors still in use, not counting
  *	those already processed but not yet cleaned, amount to less than
  *	half of the queue size.
  */
 static __inline int
 should_restart_tx(const struct sge_txq *q)
 {
 	const unsigned int pending_clean = q->processed - q->cleaned;
 
 	return ((q->in_use - pending_clean) < (q->size >> 1));
 }
 
 /**
  *	t3_sge_init - initialize SGE
  *	@adap: the adapter
  *	@p: the SGE parameters (not referenced by this function)
  *
  *	Performs SGE initialization needed every time after a chip reset.
  *	We do not initialize any of the queue sets here, instead the driver
  *	top-level must request those individually.  We also do not enable DMA
  *	here, that should be done after the queues have been set up.
  */
 void
 t3_sge_init(adapter_t *adap, struct sge_params *p)
 {
 	u_int ctrl, ups;
 
 	/* User-space doorbell size is hard-wired to 0 in this driver. */
 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
 
 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
 #if SGE_NUM_GENBITS == 1
 	ctrl |= F_EGRGENCTRL;
 #endif
 	/* On rev > 0 parts using legacy (non-MSI/MSI-X) interrupts. */
 	if (adap->params.rev > 0) {
 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
 	}
 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
 		     V_LORCQDRBTHRSH(512));
 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
 	/* Pre-rev-C parts get a higher high-priority doorbell threshold. */
 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
 		     adap->params.rev < T3_REV_C ? 1000 : 500);
 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
 }
 
 
 /**
  *	sgl_len - calculates the size of an SGL of the given capacity
  *	@n: the number of SGL entries
  *
  *	Returns the number of flits needed for a scatter/gather list with
  *	@n entries: three flits per pair of entries, plus two flits for a
  *	trailing unpaired entry.
  */
 static __inline unsigned int
 sgl_len(unsigned int n)
 {
 	unsigned int flits = (3 * n) >> 1;
 
 	if (n & 1)
 		flits++;
 	return (flits);
 }
 
 /**
  *	get_imm_packet - copy the immediate data of a response into an mbuf
  *	@sc: the adapter (unused)
  *	@resp: the response descriptor containing the packet data
  *	@m: the mbuf that receives the data
  *
  *	Copies IMMED_PKT_SIZE bytes of immediate data from @resp into @m,
  *	sets the mbuf lengths accordingly and clears its external-buffer
  *	fields.  Always returns 0.
  */
 static int
 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
 {
 
 	m->m_pkthdr.len = IMMED_PKT_SIZE;
 	m->m_len = m->m_pkthdr.len;
 	m->m_ext.ext_type = 0;
 	m->m_ext.ext_buf = NULL;
 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
 	return (0);
 }
 
 /*
  * Look up how many Tx descriptors are needed to hold @n flits.  @n is
  * used directly as an index into flit_desc_map and is not range checked.
  */
 static __inline u_int
 flits_to_desc(u_int n)
 {
 	const uint8_t ndesc = flit_desc_map[n];
 
 	return (ndesc);
 }
 
 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
 		    F_HIRCQPARITYERROR)
 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
 		      F_RSPQDISABLED)
 
 /**
  *	t3_sge_err_intr_handler - SGE async event interrupt handler
  *	@adapter: the adapter
  *
  *	Reads the SGE interrupt cause register, logs an alert for each
  *	class of error reported, writes the cause bits back and escalates
  *	to t3_fatal_err() if any fatal condition was flagged.
  */
 void
 t3_sge_err_intr_handler(adapter_t *adapter)
 {
 	unsigned int cause, parity, framing, fl_status;
 
 	cause = t3_read_reg(adapter, A_SG_INT_CAUSE);
 	parity = cause & SGE_PARERR;
 	framing = cause & SGE_FRAMINGERR;
 
 	if (parity)
 		CH_ALERT(adapter, "SGE parity error (0x%x)\n", parity);
 	if (framing)
 		CH_ALERT(adapter, "SGE framing error (0x%x)\n", framing);
 	if (cause & F_RSPQCREDITOVERFOW)
 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
 
 	if (cause & F_RSPQDISABLED) {
 		fl_status = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
 
 		CH_ALERT(adapter,
 			 "packet delivered to disabled response queue (0x%x)\n",
 			 (fl_status >> S_RSPQ0DISABLED) & 0xff);
 	}
 
 	t3_write_reg(adapter, A_SG_INT_CAUSE, cause);
 	if (cause & SGE_FATALERR)
 		t3_fatal_err(adapter);
 }
 
 /*
  * t3_sge_prep - compute default SGE queue set parameters.
  * @adap: the adapter
  * @p: the SGE parameters to fill in
  *
  * Sizes the free lists from the system's cluster limits, rounds the sizes
  * down to a power of two, and stores the defaults in every entry of
  * p->qset.
  */
 void
 t3_sge_prep(adapter_t *adap, struct sge_params *p)
 {
 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
 
 	/* Per-port qsets are capped by the CPU count. */
 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
 	nqsets *= adap->params.nports;
 
 	/* Let all free lists together use at most a third of the clusters. */
 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
 
 	/* Round down to a power of two. */
 	while (!powerof2(fl_q_size))
 		fl_q_size--;
 
 	/* -1 means "auto": use 16k clusters only on offload-capable adapters. */
 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
 	    is_offload(adap);
 
 #if __FreeBSD_version >= 700111
 	if (use_16k) {
 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
 		jumbo_buf_size = MJUM16BYTES;
 	} else {
 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
 		jumbo_buf_size = MJUM9BYTES;
 	}
 #else
 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
 	jumbo_buf_size = MJUMPAGESIZE;
 #endif
 	while (!powerof2(jumbo_q_size))
 		jumbo_q_size--;
 
 	/* Warn only; the reduced sizes are still used. */
 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
 		device_printf(adap->dev,
 		    "Insufficient clusters and/or jumbo buffers.\n");
 
 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
 
 	for (i = 0; i < SGE_QSETS; ++i) {
 		struct qset_params *q = p->qset + i;
 
 		/* Adapters with more than two ports get a longer holdoff. */
 		if (adap->params.nports > 2) {
 			q->coalesce_usecs = 50;
 		} else {
 #ifdef INVARIANTS			
 			q->coalesce_usecs = 10;
 #else
 			q->coalesce_usecs = 5;
 #endif			
 		}
 		q->polling = 0;
 		q->rspq_size = RSPQ_Q_SIZE;
 		q->fl_size = fl_q_size;
 		q->jumbo_size = jumbo_q_size;
 		q->jumbo_buf_size = jumbo_buf_size;
 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
 		/* Without offload support the offload queue is kept minimal. */
 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
 		q->cong_thres = 0;
 	}
 }
 
 /*
  * t3_sge_alloc - create the adapter-wide bus_dma tags.
  * @sc: the adapter
  *
  * Creates the parent tag and, as its children, the tags for normal Rx
  * buffers, jumbo Rx buffers and Tx frames.  Returns 0 on success or
  * ENOMEM if any tag cannot be created.  Tags created before a failure
  * are not destroyed here.
  * NOTE(review): presumably the caller cleans up with t3_sge_free(), which
  * skips NULL tags - confirm.
  */
 int
 t3_sge_alloc(adapter_t *sc)
 {
 
 	/* The parent tag. */
 	if (bus_dma_tag_create( NULL,			/* parent */
 				1, 0,			/* algnmnt, boundary */
 				BUS_SPACE_MAXADDR,	/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
 				BUS_SPACE_UNRESTRICTED, /* nsegments */
 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
 				0,			/* flags */
 				NULL, NULL,		/* lock, lockarg */
 				&sc->parent_dmat)) {
 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * DMA tag for normal sized RX frames: one MCLBYTES-aligned segment
 	 * of MCLBYTES.
 	 */
 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
 		return (ENOMEM);
 	}
 
 	/* 
 	 * DMA tag for jumbo sized RX frames: one MJUM16BYTES-aligned segment
 	 * of MJUM16BYTES.
 	 */
 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
 		return (ENOMEM);
 	}
 
 	/* 
 	 * DMA tag for TX frames: up to TX_MAX_SEGS segments, TX_MAX_SIZE
 	 * bytes in total.
 	 */
 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
 		NULL, NULL, &sc->tx_dmat)) {
 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 int
 t3_sge_free(struct adapter * sc)
 {
 
 	if (sc->tx_dmat != NULL)
 		bus_dma_tag_destroy(sc->tx_dmat);
 
 	if (sc->rx_jumbo_dmat != NULL)
 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
 
 	if (sc->rx_dmat != NULL)
 		bus_dma_tag_destroy(sc->rx_dmat);
 
 	if (sc->parent_dmat != NULL)
 		bus_dma_tag_destroy(sc->parent_dmat);
 
 	return (0);
 }
 
 void
 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
 {
 
 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
 	qs->rspq.polling = 0 /* p->polling */;
 }
 
 #if !defined(__i386__) && !defined(__amd64__)
 /*
  * bus_dmamap_load() callback used by refill_fl(): records the error code,
  * the segment count and the first segment in the struct refill_fl_cb_arg
  * passed as @arg.
  */
 static void
 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	struct refill_fl_cb_arg *result = arg;
 
 	result->nseg = nseg;
 	result->seg = segs[0];
 	result->error = error;
 }
 #endif
 /**
  *	refill_fl - refill an SGE free-buffer list
  *	@sc: the controller softc
  *	@q: the free-list to refill
  *	@n: the number of new buffers to allocate
  *
  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
  *	The caller must assure that @n does not exceed the queue's capacity.
  *	Refilling stops early if an allocation or DMA mapping fails.  The
  *	doorbell is rung only once at least 32 buffers are pending.
  */
 static void
 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
 {
 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
 	struct rx_desc *d = &q->desc[q->pidx];
 	struct refill_fl_cb_arg cb_arg;
 	struct mbuf *m;
 	caddr_t cl;
 	int err;
 
 	cb_arg.error = 0;
 	while (n--) {
 		/*
 		 * Allocate both the cluster and the (uninitialized) mbuf
 		 * header up front.  For zone_pack the cluster is the
 		 * mbuf's own external buffer; otherwise the two are
 		 * separate allocations.
 		 */
 		if (q->zone == zone_pack) {
 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
 				break;
 			cl = m->m_ext.ext_buf;
 		} else {
 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
 				break;
 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
 				uma_zfree(q->zone, cl);
 				break;
 			}
 		}
 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
 				/*
 				 * Release both allocations.  A standalone
 				 * cluster is freed on its own; a zone_pack
 				 * cluster belongs to the mbuf and goes
 				 * away with it.
 				 */
 				if (q->zone != zone_pack)
 					uma_zfree(q->zone, cl);
 				m_free(m);
 				goto done;
 			}
 			sd->flags |= RX_SW_DESC_MAP_CREATED;
 		}
 #if !defined(__i386__) && !defined(__amd64__)
 		err = bus_dmamap_load(q->entry_tag, sd->map,
 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
 
 		if (err != 0 || cb_arg.error) {
 			/* Same ownership rule as above. */
 			if (q->zone != zone_pack)
 				uma_zfree(q->zone, cl);
 			m_free(m);
 			goto done;
 		}
 #else
 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
 #endif
 		sd->flags |= RX_SW_DESC_INUSE;
 		sd->rxsd_cl = cl;
 		sd->m = m;
 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
 
 		d++;
 		sd++;
 
 		/* Wrap the producer index and flip the generation bit. */
 		if (++q->pidx == q->size) {
 			q->pidx = 0;
 			q->gen ^= 1;
 			sd = q->sdesc;
 			d = q->desc;
 		}
 		q->credits++;
 		q->db_pending++;
 	}
 
 done:
 	if (q->db_pending >= 32) {
 		q->db_pending = 0;
 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
 	}
 }
 
 
 /**
  *	free_rx_bufs - free the Rx buffers on an SGE free list
  *	@sc: the controller softc
  *	@q: the SGE free list to clean up
  *
  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
  *	this queue should be stopped before calling this function.
  *
  *	Walks q->credits entries starting at q->cidx.  On return q->credits
  *	has been decremented past zero by the loop condition, so the list
  *	must be reinitialized before it is used again.
  */
 static void
 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
 {
 	u_int cidx = q->cidx;
 
 	while (q->credits--) {
 		struct rx_sw_desc *d = &q->sdesc[cidx];
 
 		if (d->flags & RX_SW_DESC_INUSE) {
 			bus_dmamap_unload(q->entry_tag, d->map);
 			bus_dmamap_destroy(q->entry_tag, d->map);
 			/*
 			 * The mbufs were allocated MT_NOINIT, so they are
 			 * initialized before being handed back to their
 			 * zone.
 			 */
 			if (q->zone == zone_pack) {
 				m_init(d->m, zone_pack, MCLBYTES,
 				    M_NOWAIT, MT_DATA, M_EXT);
 				uma_zfree(zone_pack, d->m);
 			} else {
 				m_init(d->m, zone_mbuf, MLEN,
 				    M_NOWAIT, MT_DATA, 0);
 				uma_zfree(zone_mbuf, d->m);
 				uma_zfree(q->zone, d->rxsd_cl);
 			}			
 		}
 		
 		d->rxsd_cl = NULL;
 		d->m = NULL;
 		if (++cidx == q->size)
 			cidx = 0;
 	}
 }
 
 /* Top up a free list with at most 16 buffers, never beyond its capacity. */
 static __inline void
 __refill_fl(adapter_t *adap, struct sge_fl *fl)
 {
 	const u_int room = fl->size - fl->credits;
 
 	refill_fl(adap, fl, min(16U, room));
 }
 
 /*
  * Top up a free list with at most @max buffers, never beyond its
  * capacity; does nothing when the list is already full.
  */
 static __inline void
 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
 {
 	uint32_t room = fl->size - fl->credits;
 
 	if (room == 0)
 		return;
 	refill_fl(adap, fl, min(max, room));
 }
 
 /**
  *	recycle_rx_buf - recycle a receive buffer
  *	@adap: the adapter
  *	@q: the SGE free list
  *	@idx: index of buffer to recycle
  *
  *	Recycles the specified buffer on the given free list by adding it at
  *	the next available slot on the list.  Both the software descriptor
  *	and the buffer address are copied from slot @idx to the producer
  *	slot, after which the doorbell is rung.
  */
 static void
 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
 {
 	struct rx_desc *from = &q->desc[idx];
 	struct rx_desc *to   = &q->desc[q->pidx];
 
 	q->sdesc[q->pidx] = q->sdesc[idx];
 	to->addr_lo = from->addr_lo;        // already big endian
 	to->addr_hi = from->addr_hi;        // likewise
 	/* Address words are written before the generation words below. */
 	wmb();	/* necessary ? */
 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
 	q->credits++;
 
 	/* Wrap the producer index and flip the generation bit. */
 	if (++q->pidx == q->size) {
 		q->pidx = 0;
 		q->gen ^= 1;
 	}
 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
 }
 
 /*
  * bus_dmamap_load() callback used by alloc_ring(): stores the bus address
  * of the first segment in the bus_addr_t that @arg points to.
  *
  * alloc_ring() passes its "bus_addr_t *phys" argument as @arg, so the
  * result must be stored with the full width of bus_addr_t; a narrower
  * store would leave part of the caller's variable unwritten on platforms
  * where bus_addr_t is 64 bits wide.  Nothing is stored when the load
  * reports an error.
  */
 static void
 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 {
 	bus_addr_t *addr = arg;
 
 	if (error != 0)
 		return;
 	*addr = segs[0].ds_addr;
 }
 
 /*
  * alloc_ring - allocate a descriptor ring and its software state.
  * @sc: the adapter
  * @nelem: number of ring entries
  * @elem_size: size of one hardware descriptor
  * @sw_size: size of one software descriptor, or 0 for none
  * @phys: receives the bus address of the ring (via alloc_ring_cb())
  * @desc: address of the pointer that receives the ring's virtual address
  * @sdesc: address of the pointer that receives the software descriptor
  *     array (only written when @sw_size is non-zero)
  * @tag: receives the DMA tag created for the ring
  * @map: receives the DMA map of the ring memory
  * @parent_entry_tag: parent for the per-entry tag, or NULL to skip it
  * @entry_tag: receives the per-entry DMA tag
  *
  * Returns 0 on success or ENOMEM on failure.  Resources obtained before
  * a failure are not released here.
  * NOTE(review): presumably the caller tears down the queue set on error -
  * confirm.
  */
 static int
 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
 {
 	size_t len = nelem * elem_size;
 	void *s = NULL;
 	void *p = NULL;
 	int err;
 
 	/* Page-aligned, single-segment tag covering the whole ring. */
 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
 				      BUS_SPACE_MAXADDR_32BIT,
 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
 				      len, 0, NULL, NULL, tag)) != 0) {
 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
 		return (ENOMEM);
 	}
 
 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
 				    map)) != 0) {
 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
 		return (ENOMEM);
 	}
 
 	/* NOTE(review): the return value of the load is not checked. */
 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
 	bzero(p, len);
 	*(void **)desc = p;
 
 	if (sw_size) {
 		len = nelem * sw_size;
 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
 		*(void **)sdesc = s;
 	}
 	if (parent_entry_tag == NULL)
 		return (0);
 	    
 	/* Per-entry tag, created with the same limits as the adapter Tx tag. */
 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
 		                      NULL, NULL, entry_tag)) != 0) {
 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
 		return (ENOMEM);
 	}
 	return (0);
 }
 
 /*
  * Task handler for slow-path interrupts: runs the common slow interrupt
  * handler, then rewrites the interrupt enable register with the saved
  * mask and reads it back.
  */
 static void
 sge_slow_intr_handler(void *arg, int ncount)
 {
 	adapter_t *adap = arg;
 
 	t3_slow_intr_handler(adap);
 	t3_write_reg(adap, A_PL_INT_ENABLE0, adap->slow_intr_mask);
 	(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
 }
 
 /**
  *	sge_timer_cb - perform periodic maintenance of the SGE queue sets
  *	@arg: the adapter whose queue sets are to be maintained
  *
  *	Runs periodically from a callout.  The work described below is not
  *	done here directly; when it is needed the port's timer_reclaim_task
  *	is queued (see sge_timer_reclaim()):
  *
  *	a) Cleans up any completed Tx descriptors that may still be pending.
  *	Normal descriptor cleanup happens when new packets are added to a Tx
  *	queue so this timer is relatively infrequent and does any cleanup only
  *	if the Tx queue has not seen any new packets in a while.  We make a
  *	best effort attempt to reclaim descriptors, in that we don't wait
  *	around if we cannot get a queue's lock (which most likely is because
  *	someone else is queueing new packets and so will also handle the clean
  *	up).  Since control queues use immediate data exclusively we don't
  *	bother cleaning them up here.
  *
  *	b) Replenishes Rx queues that have run out due to memory shortage.
  *	Normally new Rx buffers are added when existing ones are consumed but
  *	when out of memory a queue can become empty.  We try to add only a few
  *	buffers here, the queue will be replenished fully as these new buffers
  *	are used up if memory shortage has subsided.
  *	
  *	c) Return coalesced response queue credits in case a response queue is
  *	starved.
  *
  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell 
  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
  *	This is done directly in this function when the adapter has more
  *	than two ports.
  */
 static void
 sge_timer_cb(void *arg)
 {
 	adapter_t *sc = arg;
 	/* The reclaim task is only used when MSI-X is not in use. */
 	if ((sc->flags & USING_MSIX) == 0) {
 		
 		struct port_info *pi;
 		struct sge_qset *qs;
 		struct sge_txq  *txq;
 		int i, j;
 		int reclaim_ofl, refill_rx;
 
 		/* No open ports: return without rearming the callout. */
 		if (sc->open_device_map == 0) 
 			return;
 
 		for (i = 0; i < sc->params.nports; i++) {
 			pi = &sc->port[i];
 			for (j = 0; j < pi->nqsets; j++) {
 				qs = &sc->sge.qs[pi->first_qset + j];
 				txq = &qs->txq[0];
 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) || 
 				    (qs->fl[1].credits < qs->fl[1].size));
 				/* One enqueue per port is enough. */
 				if (reclaim_ofl || refill_rx) {
 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
 					break;
 				}
 			}
 		}
 	}
 	
 	if (sc->params.nports > 2) {
 		int i;
 
 		for_each_port(sc, i) {
 			struct port_info *pi = &sc->port[i];
 
 			t3_write_reg(sc, A_SG_KDOORBELL, 
 				     F_SELEGRCNTX | 
 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
 		}
 	}	
 	/* Rearm only while there is work for this timer and a port is open. */
 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
 	    sc->open_device_map != 0)
 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
 }
 
 /*
  * This is meant to be a catch-all function to keep sge state private
  * to sge.c
  *
  * Initializes and arms the periodic SGE timer callout and sets up the
  * slow interrupt task.  Always returns 0.
  */
 int
 t3_sge_init_adapter(adapter_t *sc)
 {
 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
 	return (0);
 }
 
 /*
  * Re-arm the periodic SGE timer callout (sge_timer_cb() does not
  * reschedule itself when no ports are open).  Always returns 0.
  */
 int
 t3_sge_reset_adapter(adapter_t *sc)
 {
 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
 	return (0);
 }
 
 /*
  * Set up the per-port task that sge_timer_cb() queues to run
  * sge_timer_reclaim().  Always returns 0.
  */
 int
 t3_sge_init_port(struct port_info *pi)
 {
 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
 	return (0);
 }
 
 /**
  *	refill_rspq - replenish an SGE response queue
  *	@sc: the adapter
  *	@q: the response queue to replenish
  *	@credits: how many new responses to make available
  *
  *	Returns @credits response entries of queue @q to the hardware by
  *	writing the credit-return register.
  */
 static __inline void
 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
 {
 	uint32_t val;
 
 	/* mbufs are allocated on demand when a rspq entry is processed. */
 	val = V_RSPQ(q->cntxt_id) | V_CREDITS(credits);
 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN, val);
 }
 
 /*
  * Task handler: reclaim completed descriptors on Tx queues 0 through 2
  * of the queue set passed as @arg, provided at least 16 are reclaimable.
  */
 static void
 sge_txq_reclaim_handler(void *arg, int ncount)
 {
 	struct sge_qset *qs = arg;
 	int queue = 0;
 
 	while (queue < 3) {
 		reclaim_completed_tx(qs, 16, queue);
 		queue++;
 	}
 }
 
 /*
  * sge_timer_reclaim - deferred maintenance for one port's queue sets.
  * @arg: the port (struct port_info)
  * @ncount: unused
  *
  * Queued by sge_timer_cb().  For every queue set of the port it reclaims
  * completed offload Tx descriptors and then, if the response queue lock
  * can be taken without blocking, tops up both free lists and returns one
  * response queue credit when the hardware flags the queue in
  * A_SG_RSPQ_FL_STATUS.
  */
 static void
 sge_timer_reclaim(void *arg, int ncount)
 {
 	struct port_info *pi = arg;
 	int i, nqsets = pi->nqsets;
 	adapter_t *sc = pi->adapter;
 	struct sge_qset *qs;
 	struct mtx *lock;
 	
 	KASSERT((sc->flags & USING_MSIX) == 0,
 	    ("can't call timer reclaim for msi-x"));
 
 	for (i = 0; i < nqsets; i++) {
 		qs = &sc->sge.qs[pi->first_qset + i];
 
 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
 		/*
 		 * Given the assertion above, this always selects the
 		 * response queue lock of queue set 0.
 		 */
 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
 			    &sc->sge.qs[0].rspq.lock;
 
 		/* Best effort: skip this queue set if the lock is busy. */
 		if (mtx_trylock(lock)) {
 			/* XXX currently assume that we are *NOT* polling */
 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
 
 			if (qs->fl[0].credits < qs->fl[0].size - 16)
 				__refill_fl(sc, &qs->fl[0]);
 			if (qs->fl[1].credits < qs->fl[1].size - 16)
 				__refill_fl(sc, &qs->fl[1]);
 			
 			if (status & (1 << qs->rspq.cntxt_id)) {
 				if (qs->rspq.credits) {
 					refill_rspq(sc, &qs->rspq, 1);
 					qs->rspq.credits--;
 					/* Write the queue's status bit back. */
 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS, 
 					    1 << qs->rspq.cntxt_id);
 				}
 			}
 			mtx_unlock(lock);
 		}
 	}
 }
 
 /**
  *	init_qset_cntxt - initialize an SGE queue set context info
  *	@qs: the queue set
  *	@id: the queue set id
  *
  *	Derives the context ids and tokens of the response queue, the two
  *	free lists and the three Tx queues from @id, and initializes each
  *	Tx queue's pending-send mbuf queue.
  */
 static void
 init_qset_cntxt(struct sge_qset *qs, u_int id)
 {
 	struct sge_txq *eth = &qs->txq[TXQ_ETH];
 	struct sge_txq *ofld = &qs->txq[TXQ_OFLD];
 	struct sge_txq *ctrl = &qs->txq[TXQ_CTRL];
 
 	/* Response queue and free lists. */
 	qs->rspq.cntxt_id = id;
 	qs->fl[0].cntxt_id = 2 * id;
 	qs->fl[1].cntxt_id = 2 * id + 1;
 
 	/* Ethernet (tunnel) Tx queue. */
 	eth->cntxt_id = FW_TUNNEL_SGEEC_START + id;
 	eth->token = FW_TUNNEL_TID_START + id;
 	mbufq_init(&eth->sendq);
 
 	/* Offload Tx queue: no token is assigned. */
 	ofld->cntxt_id = FW_OFLD_SGEEC_START + id;
 	mbufq_init(&ofld->sendq);
 
 	/* Control Tx queue. */
 	ctrl->cntxt_id = FW_CTRL_SGEEC_START + id;
 	ctrl->token = FW_CTRL_TID_START + id;
 	mbufq_init(&ctrl->sendq);
 }
 
 
 /*
  * Account for @ndesc newly produced descriptors on @txq.  The generation and
  * producer index that were current before the advance, together with the
  * completion bits for this work request, are returned through @txqs.
  */
 static void
 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
 {
 	txq->in_use += ndesc;
 	/*
 	 * XXX we don't handle stopping of queue
 	 * presumably start handles this when we bump against the end
 	 */
 	txqs->gen = txq->gen;
 	txq->unacked += ndesc;
 	/*
 	 * Bit 5 of the running unacked count (set once the count reaches 32)
 	 * is moved to bit position S_WR_COMPL in txqs->compl; the count is
 	 * then reduced to its low five bits.
 	 */
 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
 	txq->unacked &= 31;
 	txqs->pidx = txq->pidx;
 	txq->pidx += ndesc;
 #ifdef INVARIANTS
 	/*
 	 * Sanity check of the old producer index (txqs->pidx), the new,
 	 * not yet wrapped producer index (txq->pidx) and the consumer index;
 	 * panic on any of the listed combinations.
 	 */
 	if (((txqs->pidx > txq->cidx) &&
 		(txq->pidx < txqs->pidx) &&
 		(txq->pidx >= txq->cidx)) ||
 	    ((txqs->pidx < txq->cidx) &&
 		(txq->pidx >= txq-> cidx)) ||
 	    ((txqs->pidx < txq->cidx) &&
 		(txq->cidx < txqs->pidx)))
 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
 		    txqs->pidx, txq->pidx, txq->cidx);
 #endif
 	/* Wrap the producer index and flip the generation at ring end. */
 	if (txq->pidx >= txq->size) {
 		txq->pidx -= txq->size;
 		txq->gen ^= 1;
 	}
 
 }
 
 /**
  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
  *	@m: the packet mbufs
  *	@nsegs: the number of DMA segments
  *
  *	Returns the number of Tx descriptors needed for the given Ethernet
  *	packet.  A packet no longer than PIO_LEN always takes one descriptor;
  *	otherwise the count is derived from the SGL length plus two header
  *	flits, with one more flit when CSUM_TSO is set.
  */
 static __inline unsigned int
 calc_tx_descs(const struct mbuf *m, int nsegs)
 {
 	unsigned int nflits;
 
 	if (m->m_pkthdr.len <= PIO_LEN)
 		return (1);
 
 	nflits = sgl_len(nsegs) + 2;
 	nflits += (m->m_pkthdr.csum_flags & CSUM_TSO) ? 1 : 0;
 
 	return (flits_to_desc(nflits));
 }
 
 /*
  * Load the mbuf chain *@m into the DMA map of software descriptor @txsd,
  * filling @segs and *@nsegs.
  *
  * Returns 0 on success, after marking @txsd as TX_SW_DESC_MAPPED.
  * If the load fails with EFBIG the chain is defragmented once and the load
  * is retried; if defragmentation fails the chain is freed, *@m is set to
  * NULL and ENOBUFS is returned.  ENOMEM is returned as-is without freeing
  * the chain.  For any other error the chain is freed, *@m is set to NULL
  * and the error is returned.
  */
 static unsigned int
 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
 {
 	struct mbuf *m0;
 	int err, pktlen, pass = 0;
 	bus_dma_tag_t tag = txq->entry_tag;
 
 retry:
 	err = 0;
 	m0 = *m;
 	pktlen = m0->m_pkthdr.len;
 #if defined(__i386__) || defined(__amd64__)
 	/*
 	 * On x86 try busdma_map_sg_collapse() first and fall back to the
 	 * generic loader only if it fails.
 	 * NOTE(review): the helper receives @m and may presumably replace
 	 * *m, while the fallback below still uses the earlier m0 - confirm.
 	 */
 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
 		goto done;
 	} else
 #endif
 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
 
 	if (err == 0) {
 		goto done;
 	}
 	if (err == EFBIG && pass == 0) {
 		/* Only one defragmentation attempt is made. */
 		pass = 1;
 		/* Too many segments, try to defrag */
 		m0 = m_defrag(m0, M_DONTWAIT);
 		if (m0 == NULL) {
 			m_freem(*m);
 			*m = NULL;
 			return (ENOBUFS);
 		}
 		*m = m0;
 		goto retry;
 	} else if (err == ENOMEM) {
 		/* The chain is left intact for the caller. */
 		return (err);
 	} if (err) {
 		if (cxgb_debug)
 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
 		m_freem(m0);
 		*m = NULL;
 		return (err);
 	}
 done:
 #if !defined(__i386__) && !defined(__amd64__)
 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
 #endif	
 	txsd->flags |= TX_SW_DESC_MAPPED;
 
 	return (0);
 }
 
 /**
  *	make_sgl - populate a scatter/gather list for a packet
  *	@sgp: the SGL to populate
  *	@segs: the packet dma segments
  *	@nsegs: the number of segments
  *
  *	Generates a scatter/gather list for the buffers that make up a packet.
  *	Zero-length segments are skipped.  Each sg_ent holds two length/address
  *	pairs; if an odd number of pairs is written, the unused second slot of
  *	the last sg_ent is zeroed.  Nothing is returned; the caller must size
  *	the SGL appropriately.
  */
 static __inline void
 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
 {
 	int i, idx, nwritten;
 	
 	for (idx = 0, nwritten = 0, i = 0; i < nsegs; i++) {
 		/*
 		 * firmware doesn't like empty segments
 		 */
 		if (segs[i].ds_len == 0)
 			continue;
 		/*
 		 * Step to the next sg_ent only after the current one has
 		 * actually been filled.  Keying this on the number of pairs
 		 * written (rather than on the source index) keeps the first
 		 * sg_ent from being skipped when leading segments are empty.
 		 */
 		if (nwritten != 0 && idx == 0)
 			++sgp;
 		
 		sgp->len[idx] = htobe32(segs[i].ds_len);
 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
 		idx ^= 1;
 		nwritten++;
 	}
 	
 	/* Odd number of pairs: clear the unused second slot. */
 	if (idx) {
 		sgp->len[idx] = 0;
 		sgp->addr[idx] = 0;
 	}
 }
 	
 /**
  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
  *	@adap: the adapter
  *	@q: the Tx queue
  *	@mustring: when non-zero (and GTS is disabled) ring the doorbell now
  *
  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
  *	where the HW is going to sleep just after we checked, however,
  *	then the interrupt handler will detect the outstanding TX packet
  *	and ring the doorbell for us.
  *
  *	When GTS is disabled the doorbell is rung if @mustring is set or once
  *	32 calls have accumulated in q->db_pending since the last ring;
  *	otherwise the call only bumps that counter.
  */
 static __inline void
 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
 {
 #if USE_GTS
 	/* @mustring is not consulted in this configuration. */
 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
 #ifdef T3_TRACE
 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
 			  q->cntxt_id);
 #endif
 		t3_write_reg(adap, A_SG_KDOORBELL,
 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 	}
 #else
 	if (mustring || ++q->db_pending >= 32) {
 		wmb();            /* write descriptors before telling HW */
 		t3_write_reg(adap, A_SG_KDOORBELL,
 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 		q->db_pending = 0;
 	}
 #endif
 }
 
 /*
  * When the driver is built with two generation bits (SGE_NUM_GENBITS == 2),
  * store @gen, big-endian, in the last flit of descriptor @d.  Otherwise this
  * is a no-op.
  */
 static __inline void
 wr_gen2(struct tx_desc *d, unsigned int gen)
 {
 #if SGE_NUM_GENBITS == 2
 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
 #endif
 }
 
 /**
  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
  *	@ndesc: number of Tx descriptors spanned by the SGL
  *	@txd: first Tx descriptor to be written
  *	@txqs: txq state (generation and producer index)
  *	@txq: the SGE Tx queue
  *	@sgl: the SGL
  *	@flits: number of flits to the start of the SGL in the first descriptor
  *	@sgl_flits: the SGL size in flits
  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
  *
  *	Write a work request header and an associated SGL.  If the SGL is
  *	small enough to fit into one Tx descriptor it has already been written
  *	and we just need to write the WR header.  Otherwise we distribute the
  *	SGL across the number of descriptors it spans.
  *
  *	In the multi-descriptor case @txqs->pidx and @txqs->gen are advanced
  *	as descriptors are consumed, and the low header word of the first
  *	descriptor is written last, after a write barrier.
  */
 static void
 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
 {
 
 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
 	/*
 	 * NOTE(review): txsd is kept in step with txd below but is never
 	 * dereferenced in this function.
 	 */
 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
 	
 	if (__predict_true(ndesc == 1)) {
 		/* Single descriptor: the SGL is already in place. */
 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
 			V_WR_SGLSFLT(flits)) | wr_hi,
 		    htonl(V_WR_LEN(flits + sgl_flits) |
 			V_WR_GEN(txqs->gen)) | wr_lo);
 		/* XXX gen? */
 		wr_gen2(txd, txqs->gen);
 		
 	} else {
 		/* Remember the first descriptor and its generation. */
 		unsigned int ogen = txqs->gen;
 		const uint64_t *fp = (const uint64_t *)sgl;
 		struct work_request_hdr *wp = wrp;
 		
 		/* Only the high word of the first header is written for now. */
 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
 		    V_WR_SGLSFLT(flits)) | wr_hi;
 		
 		while (sgl_flits) {
 			/* Copy as much of the SGL as fits in this descriptor. */
 			unsigned int avail = WR_FLITS - flits;
 
 			if (avail > sgl_flits)
 				avail = sgl_flits;
 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
 			sgl_flits -= avail;
 			ndesc--;
 			if (!sgl_flits)
 				break;
 			
 			fp += avail;
 			txd++;
 			txsd++;
 			/* Wrap to the start of the ring and flip generation. */
 			if (++txqs->pidx == txq->size) {
 				txqs->pidx = 0;
 				txqs->gen ^= 1;
 				txd = txq->desc;
 				txsd = txq->sdesc;
 			}
 
 			/*
 			 * when the head of the mbuf chain
 			 * is freed all clusters will be freed
 			 * with it
 			 */
 			/* Continuation descriptor: one header flit, then SGL. */
 			wrp = (struct work_request_hdr *)txd;
 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
 			    V_WR_SGLSFLT(1)) | wr_hi;
 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
 				    sgl_flits + 1)) |
 			    V_WR_GEN(txqs->gen)) | wr_lo;
 			wr_gen2(txd, txqs->gen);
 			flits = 1;
 		}
 		/* Mark the last descriptor written as end-of-packet. */
 		wrp->wrh_hi |= htonl(F_WR_EOP);
 		/*
 		 * Barrier, then complete the first descriptor's header with
 		 * its original generation.
 		 */
 		wmb();
 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
 		wr_gen2((struct tx_desc *)wp, ogen);
 	}
 }
 
 /*
  * Minimum length of a TCP packet's headers with no IP or TCP options:
  * sizeof(*eh) + sizeof(*ip) + sizeof(*tcp)
  */
 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
 
 /*
  * If mbuf @m carries a VLAN tag (M_VLANTAG), OR the VLAN-valid flag and the
  * tag from the packet header into the lvalue @cntrl.  @cntrl is expanded
  * unparenthesized, so pass a plain variable.
  */
 #define GET_VTAG(cntrl, m) \
 do { \
 	if ((m)->m_flags & M_VLANTAG)					            \
 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
 } while (0)
 
 /*
  * Build the Ethernet Tx work request for the packet(s) in *@m on queue set
  * @qs and hand it to the hardware.  Must be called with the queue set lock
  * held (asserted below).
  *
  * Three cases are handled:
  *  - a list of packets linked through m_nextpkt is sent as one batched
  *    work request;
  *  - a packet with CSUM_TSO set gets a CPL_TX_PKT_LSO header;
  *  - anything else gets a plain CPL_TX_PKT header.
  * In the latter two cases a packet no longer than PIO_LEN is copied into
  * the descriptor and the mbuf is freed before returning; otherwise an SGL
  * is generated and the mbuf stays recorded in the software descriptor.
  *
  * Returns 0 on success or the error from busdma_map_sg_collapse().
  * NOTE(review): *@m is never written back in this function.
  */
 static int
 t3_encap(struct sge_qset *qs, struct mbuf **m)
 {
 	adapter_t *sc;
 	struct mbuf *m0;
 	struct sge_txq *txq;
 	struct txq_state txqs;
 	struct port_info *pi;
 	unsigned int ndesc, flits, cntrl, mlen;
 	int err, nsegs, tso_info = 0;
 
 	struct work_request_hdr *wrp;
 	struct tx_sw_desc *txsd;
 	struct sg_ent *sgp, *sgl;
 	uint32_t wr_hi, wr_lo, sgl_flits; 
 	bus_dma_segment_t segs[TX_MAX_SEGS];
 
 	struct tx_desc *txd;
 		
 	pi = qs->port;
 	sc = pi->adapter;
 	txq = &qs->txq[TXQ_ETH];
 	txd = &txq->desc[txq->pidx];
 	txsd = &txq->sdesc[txq->pidx];
 	sgl = txq->txq_sgl;
 
 	prefetch(txd);
 	m0 = *m;
 
 	mtx_assert(&qs->lock, MA_OWNED);
 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
 	
 	/* TSO applies only to a single, multi-mbuf packet with CSUM_TSO. */
 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
 
 	if (m0->m_nextpkt != NULL) {
 		/* Packet list: the whole batch goes in one descriptor. */
 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
 		ndesc = 1;
 		mlen = 0;
 	} else {
 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
 		    &m0, segs, &nsegs))) {
 			if (cxgb_debug)
 				printf("failed ... err=%d\n", err);
 			return (err);
 		}
 		mlen = m0->m_pkthdr.len;
 		ndesc = calc_tx_descs(m0, nsegs);
 	}
 	/* Reserve the descriptors; txqs holds the pre-advance state. */
 	txq_prod(txq, ndesc, &txqs);
 
 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
 	txsd->m = m0;
 
 	if (m0->m_nextpkt != NULL) {
 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
 		int i, fidx;
 
 		if (nsegs > 7)
 			panic("trying to coalesce %d packets in to one WR", nsegs);
 		txq->txq_coalesced += nsegs;
 		wrp = (struct work_request_hdr *)txd;
 		/* One header flit plus two flits per batch entry. */
 		flits = nsegs*2 + 1;
 
 		/*
 		 * NOTE(review): m0 is not advanced inside this loop, so every
 		 * entry takes its checksum flags and VLAN tag from the head
 		 * packet - confirm this is intended.  Also, the value OR-ed
 		 * into flit is not read afterwards; only hflit[0] and
 		 * hflit[1] are consumed.
 		 */
 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
 			struct cpl_tx_pkt_batch_entry *cbe;
 			uint64_t flit;
 			uint32_t *hflit = (uint32_t *)&flit;
 			int cflags = m0->m_pkthdr.csum_flags;
 
 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
 			GET_VTAG(cntrl, m0);
 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
 			if (__predict_false(!(cflags & CSUM_IP)))
 				cntrl |= F_TXPKT_IPCSUM_DIS;
 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
 				cntrl |= F_TXPKT_L4CSUM_DIS;
 
 			hflit[0] = htonl(cntrl);
 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
 			flit |= htobe64(1 << 24);
 			cbe = &cpl_batch->pkt_entry[i];
 			cbe->cntrl = hflit[0];
 			cbe->len = hflit[1];
 			cbe->addr = htobe64(segs[i].ds_addr);
 		}
 
 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
 		    V_WR_SGLSFLT(flits)) |
 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
 		wr_lo = htonl(V_WR_LEN(flits) |
 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
 		set_wr_hdr(wrp, wr_hi, wr_lo);
 		wmb();
 		ETHER_BPF_MTAP(pi->ifp, m0);
 		wr_gen2(txd, txqs.gen);
 		check_ring_tx_db(sc, txq, 0);
 		return (0);		
 	} else if (tso_info) {
 		int eth_type;
 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
 		struct ether_header *eh;
 		struct ip *ip;
 		struct tcphdr *tcp;
 
 		txd->flit[2] = 0;
 		GET_VTAG(cntrl, m0);
 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
 		hdr->cntrl = htonl(cntrl);
 		hdr->len = htonl(mlen | 0x80000000);
 
 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
 			    m0, mlen, m0->m_pkthdr.tso_segsz,
 			    m0->m_pkthdr.csum_flags, m0->m_flags);
 			panic("tx tso packet too small");
 		}
 
 		/* Make sure that ether, ip, tcp headers are all in m0 */
 		/*
 		 * NOTE(review): this runs after the chain was DMA-mapped and
 		 * recorded in txsd->m above; m_pullup() may presumably return
 		 * a different head mbuf - confirm the mapping and txsd->m
 		 * stay valid.
 		 */
 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
 			if (__predict_false(m0 == NULL)) {
 				/* XXX panic probably an overreaction */
 				panic("couldn't fit header into mbuf");
 			}
 		}
 
 		/* Locate the IP header, allowing for an in-line VLAN header. */
 		eh = mtod(m0, struct ether_header *);
 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
 			eth_type = CPL_ETH_II_VLAN;
 			ip = (struct ip *)((struct ether_vlan_header *)eh + 1);
 		} else {
 			eth_type = CPL_ETH_II;
 			ip = (struct ip *)(eh + 1);
 		}
 		/* The TCP header is taken to follow a struct ip directly. */
 		tcp = (struct tcphdr *)(ip + 1);
 
 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
 		hdr->lso_info = htonl(tso_info);
 
 		if (__predict_false(mlen <= PIO_LEN)) {
 			/*
 			 * pkt not undersized but fits in PIO_LEN
 			 * Indicates a TSO bug at the higher levels.
 			 */
 			/* Data is copied into the descriptor; mbuf is freed. */
 			txsd->m = NULL;
 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
 			flits = (mlen + 7) / 8 + 3;
 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
 					  F_WR_SOP | F_WR_EOP | txqs.compl);
 			wr_lo = htonl(V_WR_LEN(flits) |
 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
 			wmb();
 			ETHER_BPF_MTAP(pi->ifp, m0);
 			wr_gen2(txd, txqs.gen);
 			check_ring_tx_db(sc, txq, 0);
 			m_freem(m0);
 			return (0);
 		}
 		/* The LSO header occupies three flits ahead of the SGL. */
 		flits = 3;	
 	} else {
 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
 		
 		GET_VTAG(cntrl, m0);
 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
 		/* Disable checksum insertion the stack did not request. */
 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
 			cntrl |= F_TXPKT_IPCSUM_DIS;
 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
 			cntrl |= F_TXPKT_L4CSUM_DIS;
 		cpl->cntrl = htonl(cntrl);
 		cpl->len = htonl(mlen | 0x80000000);
 
 		if (mlen <= PIO_LEN) {
 			/* Data is copied into the descriptor; mbuf is freed. */
 			txsd->m = NULL;
 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
 			flits = (mlen + 7) / 8 + 2;
 			
 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
 					  F_WR_SOP | F_WR_EOP | txqs.compl);
 			wr_lo = htonl(V_WR_LEN(flits) |
 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
 			wmb();
 			ETHER_BPF_MTAP(pi->ifp, m0);
 			wr_gen2(txd, txqs.gen);
 			check_ring_tx_db(sc, txq, 0);
 			m_freem(m0);
 			return (0);
 		}
 		/* The plain packet header occupies two flits. */
 		flits = 2;
 	}
 	/*
 	 * SGL path: with a single descriptor the SGL is written in place
 	 * after the header flits, otherwise it is staged in the queue's
 	 * scratch SGL and distributed by write_wr_hdr_sgl().
 	 */
 	wrp = (struct work_request_hdr *)txd;
 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
 	make_sgl(sgp, segs, nsegs);
 
 	sgl_flits = sgl_len(nsegs);
 
 	ETHER_BPF_MTAP(pi->ifp, m0);
 
 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
 	wr_lo = htonl(V_WR_TID(txq->token));
 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
 	    sgl_flits, wr_hi, wr_lo);
 	check_ring_tx_db(sc, txq, 0);
 
 	return (0);
 }
 
 /*
  * Periodic callout for the Ethernet Tx queue of the queue set in @arg.
  * Updates the coalescing state from the current ring occupancy, tries to
  * push queued packets out if the queue lock is free, and re-arms itself
  * every hz/4 ticks while the interface is running.
  */
 void
 cxgb_tx_watchdog(void *arg)
 {
 	struct sge_qset *qs = arg;
 	struct sge_txq *ethq = &qs->txq[TXQ_ETH];
 
 	if (qs->coalescing != 0) {
 		/* Leave coalescing once the ring drains far enough. */
 		if (ethq->in_use <= cxgb_tx_coalesce_enable_stop &&
 		    TXQ_RING_EMPTY(qs))
 			qs->coalescing = 0;
 	} else if (ethq->in_use >= cxgb_tx_coalesce_enable_start) {
 		qs->coalescing = 1;
 	}
 
 	if (TXQ_TRYLOCK(qs)) {
 		qs->qs_flags |= QS_FLUSHING;
 		cxgb_start_locked(qs);
 		qs->qs_flags &= ~QS_FLUSHING;
 		TXQ_UNLOCK(qs);
 	}
 
 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		callout_reset_on(&ethq->txq_watchdog, hz/4, cxgb_tx_watchdog,
 		    qs, ethq->txq_watchdog.c_cpu);
 	}
 }
 
 /*
  * One-shot Tx timer callout for the queue set in @arg.  Switches coalescing
  * on once at least one eighth of the Ethernet Tx ring is in use, then runs
  * the transmit path with QS_TIMEOUT set if the queue lock can be taken
  * without blocking.
  */
 static void
 cxgb_tx_timeout(void *arg)
 {
 	struct sge_qset *qs = arg;
 	struct sge_txq *ethq = &qs->txq[TXQ_ETH];
 
 	if (qs->coalescing == 0) {
 		if (ethq->in_use >= (ethq->size >> 3))
 			qs->coalescing = 1;
 	}
 
 	if (!TXQ_TRYLOCK(qs))
 		return;
 
 	qs->qs_flags |= QS_TIMEOUT;
 	cxgb_start_locked(qs);
 	qs->qs_flags &= ~QS_TIMEOUT;
 	TXQ_UNLOCK(qs);
 }
 
 /*
  * Drain the software Tx ring of queue set @qs into the Ethernet hardware
  * Tx queue.  Expects the queue set lock to be held (asserted below, after
  * the early link check).
  *
  * If the link is down the software ring is flushed and nothing is sent.
  * Otherwise packets are dequeued and encapsulated until the ring is empty,
  * the interface stops running, the link drops, too few hardware descriptors
  * remain, or encapsulation fails.  Any doorbell left pending is rung, and
  * the Tx timer is armed if packets remain queued.
  */
 static void
 cxgb_start_locked(struct sge_qset *qs)
 {
 	struct mbuf *m_head = NULL;
 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
 	struct port_info *pi = qs->port;
 	struct ifnet *ifp = pi->ifp;
 
 	/* From the watchdog or timer path: reclaim with a zero threshold. */
 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
 		reclaim_completed_tx(qs, 0, TXQ_ETH);
 
 	if (!pi->link_config.link_ok) {
 		TXQ_RING_FLUSH(qs);
 		return;
 	}
 	TXQ_LOCK_ASSERT(qs);
 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
 	    pi->link_config.link_ok) {
 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
 
 		/* Stop when no more than TX_MAX_DESC descriptors are free. */
 		if (txq->size - txq->in_use <= TX_MAX_DESC)
 			break;
 
 		if ((m_head = cxgb_dequeue(qs)) == NULL)
 			break;
 		/*
 		 *  Encapsulation can modify our pointer, and or make it
 		 *  NULL on failure.  In that event, we can't requeue.
 		 */
 		if (t3_encap(qs, &m_head) || m_head == NULL)
 			break;
 
 		/* Sent successfully: nothing to free at the end. */
 		m_head = NULL;
 	}
 
 	/* Flush any doorbell that was deferred by check_ring_tx_db(). */
 	if (txq->db_pending)
 		check_ring_tx_db(pi->adapter, txq, 1);
 
 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
 	    pi->link_config.link_ok)
 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
 		    qs, txq->txq_timer.c_cpu);
 	/* A packet whose encapsulation failed is dropped here. */
 	if (m_head != NULL)
 		m_freem(m_head);
 }
 
 /*
  * Transmit or enqueue packet @m on queue set @qs of interface @ifp.  Must be
  * called with the queue set lock held.
  *
  * The packet is handed straight to the hardware when possible; otherwise it
  * is placed on the software buf ring.  Afterwards completed descriptors are
  * reclaimed and either the ring is drained or the Tx timer is armed.
  *
  * Returns 0 on success or the error from drbr_enqueue().
  */
 static int
 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
 {
 	struct port_info *pi = qs->port;
 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
 	struct buf_ring *br = txq->txq_mr;
 	int error, avail, pktlen;
 
 	avail = txq->size - txq->in_use;
 	TXQ_LOCK_ASSERT(qs);
 
 	/*
 	 * We can only do a direct transmit if the following are true:
 	 * - we aren't coalescing (ring < 3/4 full)
 	 * - the link is up -- checked in caller
 	 * - there are no packets enqueued already
 	 * - there is space in hardware transmit queue 
 	 */
 	if (check_pkt_coalesce(qs) == 0 &&
 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
 		/*
 		 * Record the length now: on success t3_encap() may already
 		 * have freed the mbuf (small packets are copied into the
 		 * descriptor), so it must not be touched afterwards.
 		 */
 		pktlen = m->m_pkthdr.len;
 		if (t3_encap(qs, &m)) {
 			/* Encapsulation failed: fall back to the buf ring. */
 			if (m != NULL &&
 			    (error = drbr_enqueue(ifp, br, m)) != 0) 
 				return (error);
 		} else {
 			if (txq->db_pending)
 				check_ring_tx_db(pi->adapter, txq, 1);
 
 			/*
 			 * We've bypassed the buf ring so we need to update
 			 * the stats directly
 			 */
 			txq->txq_direct_packets++;
 			txq->txq_direct_bytes += pktlen;
 		}
 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
 		return (error);
 
 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
 	/*
 	 * Drain the ring now unless we are coalescing with fewer than seven
 	 * packets queued; in that case let the Tx timer pick them up.
 	 */
 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
 		cxgb_start_locked(qs);
 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
 		    qs, txq->txq_timer.c_cpu);
 	return (0);
 }
 
 /*
  * Transmit entry point for packet @m on interface @ifp.
  *
  * The packet is freed and 0 is returned when the interface is not running
  * or the link is down.  Otherwise a queue set is chosen (by flow id when
  * the mbuf carries one, else the port's first queue set) and the packet is
  * either transmitted under the queue lock or, if the lock is busy, placed
  * on that queue set's buf ring.
  */
 int
 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct port_info *pi = ifp->if_softc;
 	struct sge_qset *qs;
 	int qidx, error;
 
 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING) ||
 	    !pi->link_config.link_ok) {
 		m_freem(m);
 		return (0);
 	}
 
 	qidx = pi->first_qset;
 	if (m->m_flags & M_FLOWID)
 		qidx += m->m_pkthdr.flowid % pi->nqsets;
 	qs = &pi->adapter->sge.qs[qidx];
 
 	/* Lock is busy: just queue the packet. */
 	if (!TXQ_TRYLOCK(qs))
 		return (drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m));
 
 	/* XXX running */
 	error = cxgb_transmit_locked(ifp, qs, m);
 	TXQ_UNLOCK(qs);
 	return (error);
 }
 /*
  * Start entry point for interface @ifp: when the link is up, run the
  * locked transmit path on the port's first queue set.
  */
 void
 cxgb_start(struct ifnet *ifp)
 {
 	struct port_info *pi = ifp->if_softc;
 	struct sge_qset *qs;
 
 	if (pi->link_config.link_ok) {
 		qs = &pi->adapter->sge.qs[pi->first_qset];
 
 		TXQ_LOCK(qs);
 		cxgb_start_locked(qs);
 		TXQ_UNLOCK(qs);
 	}
 }
 
 /*
  * Queue flush entry point.  Intended to flush mbufs held in the buf_rings
  * and in the transmit queues; currently does nothing.
  */
 void
 cxgb_qflush(struct ifnet *ifp)
 {
 	/* no-op for now */
 }
 
 /**
  *	write_imm - write a packet into a Tx descriptor as immediate data
  *	@d: the Tx descriptor to write
  *	@m: the packet
  *	@len: the length of packet data to write as immediate data
  *	@gen: the generation bit value to write
  *
  *	Writes a packet as immediate data into a Tx descriptor.  The packet
  *	contains a work request at its beginning.  We must write the packet
  *	carefully so the SGE doesn't read accidentally before it's written in
  *	its entirety.
  *
  *	Panics if @len exceeds WR_LEN or is smaller than a work request
  *	header.  The mbuf is freed before returning unless its type is
  *	MT_DONTFREE.
  */
 static __inline void
 write_imm(struct tx_desc *d, struct mbuf *m,
 	  unsigned int len, unsigned int gen)
 {
 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
 	struct work_request_hdr *to = (struct work_request_hdr *)d;
 	uint32_t wr_hi, wr_lo;
 
 	if (len > WR_LEN)
 		panic("len too big %d\n", len);
 	if (len < sizeof(*from))
 		panic("len too small %d", len);
 	
 	/* Copy the payload first; the header is written afterwards. */
 	memcpy(&to[1], &from[1], len - sizeof(*from));
 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
 					V_WR_BCNTLFLT(len & 7));
 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
 					V_WR_LEN((len + 7) / 8));
 	set_wr_hdr(to, wr_hi, wr_lo);
 	wmb();
 	wr_gen2(d, gen);
 
 	/*
 	 * This check is a hack we should really fix the logic so
 	 * that this can't happen
 	 */
 	if (m->m_type != MT_DONTFREE)
 		m_freem(m);
 	
 }
 
 /**
  *	check_desc_avail - check descriptor availability on a send queue
  *	@adap: the adapter
  *	@q: the TX queue
  *	@m: the packet needing the descriptors
  *	@ndesc: the number of Tx descriptors needed
  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
  *
  *	Checks if the requested number of Tx descriptors is available on an
  *	SGE send queue.  If the queue is already suspended or not enough
  *	descriptors are available the packet is queued for later transmission.
  *	Must be called with the Tx queue locked.
  *
  *	Returns 0 if enough descriptors are available, 1 if there aren't
  *	enough descriptors and the packet has been queued, and 2 if the caller
  *	needs to retry because there weren't enough descriptors at the
  *	beginning of the call but some freed up in the mean time.
  */
 static __inline int
 check_desc_avail(adapter_t *adap, struct sge_txq *q,
 		 struct mbuf *m, unsigned int ndesc,
 		 unsigned int qid)
 {
 	/* 
 	 * XXX We currently only use this for checking the control queue
 	 * the control queue is only used for binding qsets which happens
 	 * at init time so we are guaranteed enough descriptors
 	 */
 	/* Packets are already waiting: queue behind them. */
 	if (__predict_false(!mbufq_empty(&q->sendq))) {
 addq_exit:	mbufq_tail(&q->sendq, m);
 		return 1;
 	}
 	if (__predict_false(q->size - q->in_use < ndesc)) {
 
 		struct sge_qset *qs = txq_to_qset(q, qid);
 
 		/*
 		 * Mark the queue stopped, then re-check: if a restart is
 		 * already warranted and we are the one to clear the stopped
 		 * bit, tell the caller to retry.
 		 */
 		setbit(&qs->txq_stopped, qid);
 		if (should_restart_tx(q) &&
 		    test_and_clear_bit(qid, &qs->txq_stopped))
 			return 2;
 
 		q->stops++;
 		goto addq_exit;
 	}
 	return 0;
 }
 
 
 /**
  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
  *	@q: the SGE control Tx queue
  *
  *	Variant of reclaim_completed_tx() for Tx queues that send only
  *	immediate data (presently just the control queues) and therefore have
  *	no mbufs to free: only the queue counters are updated.
  */
 static __inline void
 reclaim_completed_tx_imm(struct sge_txq *q)
 {
 	unsigned int done = q->processed - q->cleaned;
 
 	q->cleaned += done;
 	q->in_use -= done;
 }
 
 /*
  * Returns non-zero when both the first mbuf's length and the total packet
  * length of @m are no larger than WR_LEN.
  */
 static __inline int
 immediate(const struct mbuf *m)
 {
 	if (m->m_len > WR_LEN)
 		return (0);
 	return (m->m_pkthdr.len <= WR_LEN);
 }
 
 /**
  *	ctrl_xmit - send a packet through an SGE control Tx queue
  *	@adap: the adapter
  *	@qs: the queue set whose control queue is used
  *	@m: the packet
  *
  *	Send a packet through an SGE control Tx queue.  Packets sent through
  *	a control queue must fit entirely as immediate data in a single Tx
  *	descriptor and have no page fragments.
  *
  *	A packet that does not fit as immediate data is freed and 0 is
  *	returned.  Returns ENOSPC when no descriptor is available and the
  *	packet has been queued for later, 0 otherwise.
  */
 static int
 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
 {
 	int ret;
 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
 	
 	if (__predict_false(!immediate(m))) {
 		m_freem(m);
 		return 0;
 	}
 	
 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
 
 	TXQ_LOCK(qs);
 again:	reclaim_completed_tx_imm(q);
 
 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
 	if (__predict_false(ret)) {
 		/* 1: packet was queued on q->sendq; 2: retry. */
 		if (ret == 1) {
 			TXQ_UNLOCK(qs);
 			return (ENOSPC);
 		}
 		goto again;
 	}
 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
 	
 	q->in_use++;
 	/* Wrap the producer index and flip the generation at ring end. */
 	if (++q->pidx >= q->size) {
 		q->pidx = 0;
 		q->gen ^= 1;
 	}
 	TXQ_UNLOCK(qs);
 	/* Make the descriptor visible before ringing the doorbell. */
 	wmb();
 	t3_write_reg(adap, A_SG_KDOORBELL,
 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 	return (0);
 }
 
 
 /**
  *	restart_ctrlq - restart a suspended control queue
  *	@data: the queue set containing the control queue
  *	@npending: not used
  *
  *	Resumes transmission on a suspended Tx control queue: packets waiting
  *	on the queue's send list are written as immediate data while
  *	descriptors are free, then the doorbell is rung.
  */
 static void
 restart_ctrlq(void *data, int npending)
 {
 	struct mbuf *m;
 	struct sge_qset *qs = (struct sge_qset *)data;
 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
 	adapter_t *adap = qs->port->adapter;
 
 	TXQ_LOCK(qs);
 again:	reclaim_completed_tx_imm(q);
 
 	while (q->in_use < q->size &&
 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
 
 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
 
 		/* Wrap the producer index and flip the generation. */
 		if (++q->pidx >= q->size) {
 			q->pidx = 0;
 			q->gen ^= 1;
 		}
 		q->in_use++;
 	}
 	if (!mbufq_empty(&q->sendq)) {
 		/*
 		 * Ran out of descriptors with packets still waiting: mark
 		 * the queue stopped, but go around again if a restart is
 		 * already warranted and we clear the stopped bit ourselves.
 		 */
 		setbit(&qs->txq_stopped, TXQ_CTRL);
 
 		if (should_restart_tx(q) &&
 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
 			goto again;
 		q->stops++;
 	}
 	TXQ_UNLOCK(qs);
 	t3_write_reg(adap, A_SG_KDOORBELL,
 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 }
 
 
 /*
  * Send management message @m through the control queue of queue set 0 and
  * return ctrl_xmit()'s result.
  */
 int
 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
 {
 	struct sge_qset *qs0 = &adap->sge.qs[0];
 
 	return (ctrl_xmit(adap, qs0, m));
 }
 
 /**
  *	t3_free_qset - free the resources of an SGE queue set
  *	@sc: the controller owning the queue set
  *	@q: the queue set
  *
  *	Release the HW and SW resources associated with an SGE queue set, such
  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
  *	queue set must be quiesced prior to calling this.
  *
  *	The queue set lock is unlocked and destroyed in here, so it must be
  *	held on entry.  The queue set structure is zeroed on return.
  */
 static void
 t3_free_qset(adapter_t *sc, struct sge_qset *q)
 {
 	int i;
 	
 	/* Reclaim whatever the Ethernet Tx queue has completed. */
 	reclaim_completed_tx(q, 0, TXQ_ETH);
 	if (q->txq[TXQ_ETH].txq_mr != NULL) 
 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
 	}
 
 	/* Free lists: disable the HW context, then release DMA resources. */
 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
 		if (q->fl[i].desc) {
 			mtx_lock_spin(&sc->sge.reg_lock);
 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
 			mtx_unlock_spin(&sc->sge.reg_lock);
 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
 					q->fl[i].desc_map);
 			bus_dma_tag_destroy(q->fl[i].desc_tag);
 			bus_dma_tag_destroy(q->fl[i].entry_tag);
 		}
 		if (q->fl[i].sdesc) {
 			free_rx_bufs(sc, &q->fl[i]);
 			free(q->fl[i].sdesc, M_DEVBUF);
 		}
 	}
 
 	/* Drop and destroy the queue set lock taken by the caller. */
 	mtx_unlock(&q->lock);
 	MTX_DESTROY(&q->lock);
 	/* Tx queues: disable the egress context, then release DMA resources. */
 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
 		if (q->txq[i].desc) {
 			mtx_lock_spin(&sc->sge.reg_lock);
 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
 			mtx_unlock_spin(&sc->sge.reg_lock);
 			bus_dmamap_unload(q->txq[i].desc_tag,
 					q->txq[i].desc_map);
 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
 					q->txq[i].desc_map);
 			bus_dma_tag_destroy(q->txq[i].desc_tag);
 			bus_dma_tag_destroy(q->txq[i].entry_tag);
 		}
 		if (q->txq[i].sdesc) {
 			free(q->txq[i].sdesc, M_DEVBUF);
 		}
 	}
 
 	/* Response queue. */
 	if (q->rspq.desc) {
 		mtx_lock_spin(&sc->sge.reg_lock);
 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
 		mtx_unlock_spin(&sc->sge.reg_lock);
 		
 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
 			        q->rspq.desc_map);
 		bus_dma_tag_destroy(q->rspq.desc_tag);
 		MTX_DESTROY(&q->rspq.lock);
 	}
 
 #ifdef INET
 	tcp_lro_free(&q->lro.ctrl);
 #endif
 
 	bzero(q, sizeof(*q));
 }
 
 /**
  *	t3_free_sge_resources - free SGE resources
  *	@sc: the adapter softc
  *
  *	Frees the resources of every SGE queue set in use, i.e. the sum of the
  *	queue sets of all ports.
  */
 void
 t3_free_sge_resources(adapter_t *sc)
 {
 	int port, qidx, total = 0;
 
 	for (port = 0; port < (sc)->params.nports; port++)
 		total += sc->port[port].nqsets;
 
 	for (qidx = 0; qidx < total; qidx++) {
 		struct sge_qset *qs = &sc->sge.qs[qidx];
 
 		/* t3_free_qset() unlocks and destroys the lock taken here. */
 		TXQ_LOCK(qs);
 		t3_free_qset(sc, qs);
 	}
 }
 
 /**
  *	t3_sge_start - enable SGE
  *	@sc: the controller softc
  *
  *	Enables the SGE for DMAs by setting F_GLOBALENABLE in A_SG_CONTROL.
  *	This is the last step in starting packet transfers.
  */
 void
 t3_sge_start(adapter_t *sc)
 {
 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
 }
 
 /**
  *	t3_sge_stop - disable SGE operation
  *	@sc: the adapter
  *
  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
  *	from error interrupts) or from normal process context.  In the latter
  *	case it also disables any pending queue restart tasklets.  Note that
  *	if it is called in interrupt context it cannot disable the restart
  *	tasklets as it cannot wait, however the tasklets will have no effect
  *	since the doorbells are disabled and the driver will call this again
  *	later from process context, at which time the tasklets will be stopped
  *	if they are still running.
  *
  *	NOTE: the draining of the restart tasks is currently compiled out
  *	(under "notyet"), so as written only the DMA engine is disabled.
  */
 void
 t3_sge_stop(adapter_t *sc)
 {
 	int i, nqsets;
 	
 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
 
 	if (sc->tq == NULL)
 		return;
 	
 	/* nqsets is only consumed by the disabled block below. */
 	for (nqsets = i = 0; i < (sc)->params.nports; i++) 
 		nqsets += sc->port[i].nqsets;
 #ifdef notyet
 	/*
 	 * 
 	 * XXX
 	 */
 	for (i = 0; i < nqsets; ++i) {
 		struct sge_qset *qs = &sc->sge.qs[i];
 		
 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
 	}
 #endif
 }
 
 /**
  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
  *	@qs: the queue set owning the Tx queue
  *	@reclaimable: the number of descriptors to reclaim
  *	@queue: index of the Tx queue within the queue set
  *
  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
  *	Tx buffers, starting at the queue's consumer index, which is advanced
  *	accordingly.  Descriptors without an mbuf are counted in txq_skipped.
  *	Called with the queue set lock held.  A non-positive @reclaimable
  *	reclaims nothing.
  */
 void
 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
 {
 	struct tx_sw_desc *txsd;
 	unsigned int cidx, mask;
 	struct sge_txq *q = &qs->txq[queue];
 
 	cidx = q->cidx;
 	mask = q->size - 1;
 	txsd = &q->sdesc[cidx];
 
 #ifdef T3_TRACE
 	/* Trace only after cidx holds the real consumer index. */
 	T3_TRACE2(qs->port->adapter->tb[q->cntxt_id & 7],
 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
 #endif
 
 	mtx_assert(&qs->lock, MA_OWNED);
 	while (reclaimable-- > 0) {
 		/* Warm the cache for the next two software descriptors. */
 		prefetch(q->sdesc[(cidx + 1) & mask].m);
 		prefetch(q->sdesc[(cidx + 2) & mask].m);
 
 		if (txsd->m != NULL) {
 			if (txsd->flags & TX_SW_DESC_MAPPED) {
 				bus_dmamap_unload(q->entry_tag, txsd->map);
 				txsd->flags &= ~TX_SW_DESC_MAPPED;
 			}
 			m_freem_list(txsd->m);
 			txsd->m = NULL;
 		} else
 			q->txq_skipped++;
 		
 		++txsd;
 		/* Wrap around at the end of the ring. */
 		if (++cidx == q->size) {
 			cidx = 0;
 			txsd = q->sdesc;
 		}
 	}
 	q->cidx = cidx;
 
 }
 
 /**
  *	is_new_response - check if a response is newly written
  *	@r: the response descriptor
  *	@q: the response queue
  *
  *	Returns 1 if the descriptor's generation bit matches the queue's
  *	current generation, i.e. the response has not been processed yet,
  *	and 0 otherwise.
  */
 static __inline int
 is_new_response(const struct rsp_desc *r,
     const struct sge_rspq *q)
 {
 	if ((r->intr_gen & F_RSPD_GEN2) != q->gen)
 		return (0);
 	return (1);
 }
 
 /* GTS flag bits of Tx queues 0 and 1 in a response descriptor. */
 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
 /* The GTS bits plus the full credit-return fields of Tx queues 0, 1 and 2. */
 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
 
 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
 #define NOMEM_INTR_DELAY 2500
 
 /**
  *	write_ofld_wr - write an offload work request
  *	@adap: the adapter
  *	@m: the packet to send
  *	@q: the Tx queue
  *	@pidx: index of the first Tx descriptor to write
  *	@gen: the generation value to use
  *	@ndesc: number of descriptors the packet will occupy
  *	@segs: the DMA segments to describe in the SGL
  *	@nsegs: the number of DMA segments
  *
  *	Write an offload work request to send the supplied packet.  The packet
  *	data already carry the work request with most fields populated.
  *	A packet that fits as immediate data and has no segments is written
  *	with write_imm(); otherwise the first mbuf's contents are copied into
  *	the descriptor as headers and followed by an SGL built from @segs.
  */
 static void
 write_ofld_wr(adapter_t *adap, struct mbuf *m,
     struct sge_txq *q, unsigned int pidx,
     unsigned int gen, unsigned int ndesc,
     bus_dma_segment_t *segs, unsigned int nsegs)
 {
 	unsigned int sgl_flits, flits;
 	struct work_request_hdr *from;
 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
 	struct tx_desc *d = &q->desc[pidx];
 	struct txq_state txqs;
 	
 	if (immediate(m) && nsegs == 0) {
 		write_imm(d, m, m->m_len, gen);
 		return;
 	}
 
 	/* Only TX_DATA builds SGLs */
 	/* Copy everything after the WR header; the header is written last. */
 	from = mtod(m, struct work_request_hdr *);
 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
 
 	flits = m->m_len / 8;
 	/*
 	 * With one descriptor the SGL is built in place after the headers;
 	 * otherwise it is staged in the on-stack sgl[] and distributed by
 	 * write_wr_hdr_sgl().
 	 */
 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
 
 	make_sgl(sgp, segs, nsegs);
 	sgl_flits = sgl_len(nsegs);
 
 	txqs.gen = gen;
 	txqs.pidx = pidx;
 	txqs.compl = 0;
 
 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
 	    from->wrh_hi, from->wrh_lo);
 }
 
 /**
  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
  *	@m: the packet
  *	@nsegs: the number of SGL segments attached to the packet
  *
  *	Returns the number of Tx descriptors needed for the given offload
  *	packet.  These packets are already fully constructed.  A packet with
  *	no segments whose first mbuf fits in WR_LEN takes one descriptor.
  */
 static __inline unsigned int
 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
 {
 	unsigned int hdr_flits;
 
 	/* packet fits as immediate data */
 	if (nsegs == 0 && m->m_len <= WR_LEN)
 		return (1);
 
 	/*
 	 * This needs to be re-visited for TOE
 	 */
 
 	/* headers, followed by the SGL */
 	hdr_flits = m->m_len / 8;
 
 	return (flits_to_desc(hdr_flits + sgl_len(nsegs)));
 }
 
 /**
  *	ofld_xmit - send a packet through an offload queue
  *	@adap: the adapter
  *	@qs: the queue set whose Tx offload queue is used
  *	@m: the packet
  *
  *	Send an offload packet through an SGE offload queue.  Descriptor
  *	space is reserved under the Tx queue lock; the descriptors themselves
  *	are written after the lock has been dropped.
  *
  *	Returns 0 once the work request has been written and the doorbell
  *	checked, or EINTR when check_desc_avail() reports 1 (no descriptors
  *	available; the descriptor count is stashed in the mbuf priority for
  *	the restart path).
  */
 static int
 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
 {
 	int ret, nsegs;
 	unsigned int ndesc;
 	unsigned int pidx, gen;
 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;

 	/* The scatter/gather list travels with the mbuf. */
 	nsegs = m_get_sgllen(m);
 	vsegs = m_get_sgl(m);
 	ndesc = calc_tx_descs_ofld(m, nsegs);
 	busdma_map_sgl(vsegs, segs, nsegs);

 	TXQ_LOCK(qs);
 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
 	if (__predict_false(ret)) {
 		if (ret == 1) {
 			printf("no ofld desc avail\n");
 			
 			m_set_priority(m, ndesc);     /* save for restart */
 			TXQ_UNLOCK(qs);
 			return (EINTR);
 		}
 		/* Any other non-zero result: reclaim and try again. */
 		goto again;
 	}

 	/* Claim ndesc descriptors, wrapping and flipping the generation. */
 	gen = q->gen;
 	q->in_use += ndesc;
 	pidx = q->pidx;
 	q->pidx += ndesc;
 	if (q->pidx >= q->size) {
 		q->pidx -= q->size;
 		q->gen ^= 1;
 	}
 #ifdef T3_TRACE
 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
 		  skb_shinfo(skb)->nr_frags);
 #endif
 	TXQ_UNLOCK(qs);

 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
 	check_ring_tx_db(adap, q, 1);
 	return (0);
 }
 
 /**
  *	restart_offloadq - restart a suspended offload queue
  *	@data: the queue set containing the offload queue (struct sge_qset *)
  *	@npending: not referenced by this function
  *
  *	Resumes transmission on a suspended Tx offload queue.  Installed as
  *	the handler of the offload queue's qresume_task.  Packets waiting on
  *	the queue's sendq are written to the ring until either the sendq is
  *	empty or the ring runs out of descriptors; the egress doorbell is
  *	rung on the way out in both cases.
  */
 static void
 restart_offloadq(void *data, int npending)
 {
 	struct mbuf *m;
 	struct sge_qset *qs = data;
 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
 	adapter_t *adap = qs->port->adapter;
 	bus_dma_segment_t segs[TX_MAX_SEGS];
 	/*
 	 * NOTE(review): stx is derived from q->pidx once, before the lock is
 	 * taken, and is not refreshed as q->pidx advances in the loop below.
 	 * Confirm that busdma_map_mbufs() tolerates this.
 	 */
 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
 	int nsegs, cleaned;
 		
 	TXQ_LOCK(qs);
 	/* cleaned is assigned here but not read anywhere in this function. */
 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
 
 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
 		unsigned int gen, pidx;
 		/* Descriptor count that ofld_xmit() saved in the priority. */
 		unsigned int ndesc = m_get_priority(m);
 
 		if (__predict_false(q->size - q->in_use < ndesc)) {
 			/*
 			 * Not enough room: mark the queue stopped, but retry
 			 * right away if it already qualifies for a restart
 			 * and we are the one who clears the stopped bit.
 			 */
 			setbit(&qs->txq_stopped, TXQ_OFLD);
 			if (should_restart_tx(q) &&
 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
 				goto again;
 			q->stops++;
 			break;
 		}
 
 		/* Claim ndesc descriptors, wrapping and flipping generation. */
 		gen = q->gen;
 		q->in_use += ndesc;
 		pidx = q->pidx;
 		q->pidx += ndesc;
 		if (q->pidx >= q->size) {
 			q->pidx -= q->size;
 			q->gen ^= 1;
 		}
 		
 		(void)mbufq_dequeue(&q->sendq);
 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
 		/* The descriptors are written with the queue lock dropped. */
 		TXQ_UNLOCK(qs);
 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
 		TXQ_LOCK(qs);
 	}
 #if USE_GTS
 	set_bit(TXQ_RUNNING, &q->flags);
 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
 #endif
 	TXQ_UNLOCK(qs);
 	/* Barrier before the doorbell write below. */
 	wmb();
 	t3_write_reg(adap, A_SG_KDOORBELL,
 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 }
 
 /**
  *	queue_set - return the queue set a packet should use
  *	@m: the packet
  *
  *	Maps a packet to the index of the SGE queue set it should use.  The
  *	index is the packet's priority with bit 0 (the control/offload
  *	selector) shifted out.
  */
 static __inline int
 queue_set(const struct mbuf *m)
 {
 	/* Drop bit 0; what remains selects the queue set. */
 	return (m_get_priority(m) >> 1);
 }
 
 /**
  *	is_ctrl_pkt - return whether an offload packet is a control packet
  *	@m: the packet
  *
  *	Tells whether an offload packet belongs on the CTRL Tx queue rather
  *	than the OFLD Tx queue.  The answer is bit 0 of the packet's
  *	priority: non-zero means control.
  */
 static __inline int
 is_ctrl_pkt(const struct mbuf *m)
 {
 	/* Bit 0 of the priority is the control-packet flag. */
 	return (m_get_priority(m) & 1);
 }
 
 /**
  *	t3_offload_tx - send an offload packet
  *	@tdev: the offload device to send to
  *	@m: the packet
  *
  *	Sends an offload packet.  The packet priority picks the Tx path:
  *	bit 0 chooses between the control queue and the regular offload
  *	queue, and the remaining bits (see queue_set()) choose the queue set.
  *	Returns whatever ctrl_xmit() or ofld_xmit() returns.
  */
 int
 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
 {
 	adapter_t *sc = tdev2adap(tdev);
 	struct sge_qset *qset;

 	qset = &sc->sge.qs[queue_set(m)];

 	/* Regular offload traffic is the expected case. */
 	if (__predict_true(!is_ctrl_pkt(m)))
 		return (ofld_xmit(sc, qset, m));

 	return (ctrl_xmit(sc, qset, m));
 }
 
 /**
  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
  *	@tdev: the offload device that will be receiving the packets
  *	@q: the SGE response queue that assembled the bundle
  *	@mbufs: the packets gathered so far
  *	@n: the number of packets in the bundle
  *
  *	Hands a possibly incomplete bundle of Rx offload packets to the
  *	offload device and counts it.  Does nothing when the bundle is empty.
  */
 static __inline void
 deliver_partial_bundle(struct t3cdev *tdev,
 			struct sge_rspq *q,
 			struct mbuf *mbufs[], int n)
 {
 	/* Nothing gathered, nothing to deliver. */
 	if (n == 0)
 		return;

 	q->offload_bundles++;
 	cxgb_ofld_recv(tdev, mbufs, n);
 }
 
 /*
  * Add one received offload packet to the gather array and deliver the
  * array to the offload device once it holds RX_BUNDLE_SIZE packets.
  * Returns the gather index the caller should pass in next time (0 right
  * after a full bundle was delivered).
  */
 static __inline int
 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
     struct mbuf *m, struct mbuf *rx_gather[],
     unsigned int gather_idx)
 {

 	rq->offload_pkts++;
 	m->m_pkthdr.header = mtod(m, void *);

 	rx_gather[gather_idx] = m;
 	gather_idx++;

 	/* Bundle not full yet: just report the new fill level. */
 	if (gather_idx != RX_BUNDLE_SIZE)
 		return (gather_idx);

 	cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
 	rq->offload_bundles++;
 	return (0);
 }
 
 static void
 restart_tx(struct sge_qset *qs)
 {
 	struct adapter *sc = qs->port->adapter;
 	
 	
 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
 		qs->txq[TXQ_OFLD].restarts++;
 		DPRINTF("restarting TXQ_OFLD\n");
 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
 	}
 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
 	    qs->txq[TXQ_CTRL].in_use);
 	
 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
 		qs->txq[TXQ_CTRL].restarts++;
 		DPRINTF("restarting TXQ_CTRL\n");
 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
 	}
 }
 
 /**
  *	t3_sge_alloc_qset - initialize an SGE queue set
  *	@sc: the controller softc
  *	@id: the queue set id
  *	@nports: how many Ethernet ports will be using this queue set
  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
  *	@p: configuration parameters for this queue set
  *	@ntxq: number of Tx queues for the queue set
  *	@pi: port info for queue set
  *
  *	Allocate resources and initialize an SGE queue set.  A queue set
  *	comprises a response queue, two Rx free-buffer queues, and up to 3
  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
  *	queue, offload queue, and control queue.
  *
  *	Returns 0 on success.  On any failure the partially built queue set
  *	is released with t3_free_qset() and a non-zero error is returned.
  */
 int
 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
 		  const struct qset_params *p, int ntxq, struct port_info *pi)
 {
 	struct sge_qset *q = &sc->sge.qs[id];
 	int i, ret = 0;
 
 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
 	q->port = pi;
 
 	/*
 	 * The two allocations below do not produce an error code of their
 	 * own, so set one explicitly; otherwise the error path would free
 	 * the queue set and still return 0.
 	 */
 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
 		ret = ENOMEM;
 		goto err;
 	}
 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
 	    M_NOWAIT | M_ZERO)) == NULL) {
 		device_printf(sc->dev, "failed to allocate ifq\n");
 		ret = ENOMEM;
 		goto err;
 	}
 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);	
 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
 	/* Spread the per-qset callouts across CPUs by queue set id. */
 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
 
 	init_qset_cntxt(q, id);
 	q->idx = id;
 
 	/* Descriptor rings: free list 0, free list 1 (jumbo), response queue. */
 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
 		    &q->fl[0].desc, &q->fl[0].sdesc,
 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
 		printf("error %d from alloc ring fl0\n", ret);
 		goto err;
 	}
 
 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
 		    &q->fl[1].desc, &q->fl[1].sdesc,
 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
 		printf("error %d from alloc ring fl1\n", ret);
 		goto err;
 	}
 
 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
 		    &q->rspq.desc_tag, &q->rspq.desc_map,
 		    NULL, NULL)) != 0) {
 		printf("error %d from alloc ring rspq\n", ret);
 		goto err;
 	}
 
 	/* Tx rings; the control queue is allocated without software descs. */
 	for (i = 0; i < ntxq; ++i) {
 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
 
 		if ((ret = alloc_ring(sc, p->txq_size[i],
 			    sizeof(struct tx_desc), sz,
 			    &q->txq[i].phys_addr, &q->txq[i].desc,
 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
 			    &q->txq[i].desc_map,
 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
 			printf("error %d from alloc ring tx %i\n", ret, i);
 			goto err;
 		}
 		mbufq_init(&q->txq[i].sendq);
 		q->txq[i].gen = 1;
 		q->txq[i].size = p->txq_size[i];
 	}
 	
 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
 
 	q->fl[0].gen = q->fl[1].gen = 1;
 	q->fl[0].size = p->fl_size;
 	q->fl[1].size = p->jumbo_size;
 
 	q->rspq.gen = 1;
 	q->rspq.cidx = 0;
 	q->rspq.size = p->rspq_size;
 
 	q->txq[TXQ_ETH].stop_thres = nports *
 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
 
 	/* Free list 0 uses cluster-sized packet-zone buffers. */
 	q->fl[0].buf_size = MCLBYTES;
 	q->fl[0].zone = zone_pack;
 	q->fl[0].type = EXT_PACKET;
 
 	/* Free list 1 uses the jumbo zone matching the configured size. */
 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
 		q->fl[1].zone = zone_jumbo16;
 		q->fl[1].type = EXT_JUMBO16;
 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
 		q->fl[1].zone = zone_jumbo9;
 		q->fl[1].type = EXT_JUMBO9;		
 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
 		q->fl[1].zone = zone_jumbop;
 		q->fl[1].type = EXT_JUMBOP;
 	} else {
 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
 		ret = EDOOFUS;
 		goto err;
 	}
 	q->fl[1].buf_size = p->jumbo_buf_size;
 
 	/* Allocate and setup the lro_ctrl structure */
 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
 #ifdef INET
 	ret = tcp_lro_init(&q->lro.ctrl);
 	if (ret) {
 		printf("error %d from tcp_lro_init\n", ret);
 		goto err;
 	}
 #endif
 	q->lro.ctrl.ifp = pi->ifp;
 
 	/*
 	 * Program the hardware contexts under the SGE register lock.  The
 	 * results of the t3_sge_init_* calls are negated before being
 	 * stored in ret.
 	 */
 	mtx_lock_spin(&sc->sge.reg_lock);
 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
 				   q->rspq.phys_addr, q->rspq.size,
 				   q->fl[0].buf_size, 1, 0);
 	if (ret) {
 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
 		goto err_unlock;
 	}
 
 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
 					  q->fl[i].phys_addr, q->fl[i].size,
 					  q->fl[i].buf_size, p->cong_thres, 1,
 					  0);
 		if (ret) {
 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
 			goto err_unlock;
 		}
 	}
 
 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
 				 1, 0);
 	if (ret) {
 		printf("error %d from t3_sge_init_ecntxt\n", ret);
 		goto err_unlock;
 	}
 
 	if (ntxq > 1) {
 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
 					 USE_GTS, SGE_CNTXT_OFLD, id,
 					 q->txq[TXQ_OFLD].phys_addr,
 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
 		if (ret) {
 			printf("error %d from t3_sge_init_ecntxt\n", ret);
 			goto err_unlock;
 		}
 	}
 
 	if (ntxq > 2) {
 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
 					 SGE_CNTXT_CTRL, id,
 					 q->txq[TXQ_CTRL].phys_addr,
 					 q->txq[TXQ_CTRL].size,
 					 q->txq[TXQ_CTRL].token, 1, 0);
 		if (ret) {
 			printf("error %d from t3_sge_init_ecntxt\n", ret);
 			goto err_unlock;
 		}
 	}
 	
 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
 	    device_get_unit(sc->dev), irq_vec_idx);
 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
 	
 	mtx_unlock_spin(&sc->sge.reg_lock);
 	t3_update_qset_coalesce(q, p);
 	q->port = pi;
 	
 	/* Fill both free lists and hand response entries to the hardware. */
 	refill_fl(sc, &q->fl[0], q->fl[0].size);
 	refill_fl(sc, &q->fl[1], q->fl[1].size);
 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
 
 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
 		     V_NEWTIMER(q->rspq.holdoff_tmr));
 
 	return (0);
 
 err_unlock:
 	mtx_unlock_spin(&sc->sge.reg_lock);
 err:	
 	/* t3_free_qset() is entered with the queue set lock held. */
 	TXQ_LOCK(q);
 	t3_free_qset(sc, q);
 
 	return (ret);
 }
 
 /*
  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
  * ethernet data.  Hardware assistance with various checksums and any vlan tag
  * will also be taken into account here.
  */
 void
 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
 {
 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
 	struct ifnet *ifp = pi->ifp;
 	
 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
 
 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
 	    cpl->csum_valid && cpl->csum == 0xffff) {
 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 		m->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_valid) {
 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
 		m->m_flags |= M_VLANTAG;
 	} 
 
 	m->m_pkthdr.rcvif = ifp;
 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
 	/*
 	 * adjust after conversion to mbuf chain
 	 */
 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
 	m->m_len -= (sizeof(*cpl) + ethpad);
 	m->m_data += (sizeof(*cpl) + ethpad);
 }
 
 /**
  *	get_packet - return the next ingress packet buffer from a free list
  *	@adap: the adapter that received the packet
  *	@drop_thres: # of remaining buffers before we start dropping packets
  *	             (not referenced by the current implementation)
  *	@qs: the qset that the SGE free list holding the packet belongs to
  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
  *      @r: response descriptor 
  *
  *	Get the next buffer from the free list selected by the response
  *	descriptor and attach it to the mbuf chain tracked by @mh.  When
  *	recycling is enabled and a complete packet is small enough, the data
  *	is copied into a fresh mbuf and the original buffer is recycled;
  *	otherwise the original buffer itself is used.  The free list's
  *	consumer index is advanced in every case.
  *
  *	Returns 1 when the descriptor carries the end of a packet, 0
  *	otherwise.
  */
 static int
 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
 {
 
 	unsigned int len_cq =  ntohl(r->len_cq);
 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
 	int mask, cidx = fl->cidx;
 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
 	uint32_t len = G_RSPD_LEN(len_cq);
 	uint32_t flags = M_EXT;
 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
 	caddr_t cl;
 	struct mbuf *m;
 	int ret = 0;
 
 	/* Warm the cache for the next two software descriptors. */
 	mask = fl->size - 1;
 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);	
 
 	fl->credits--;
 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
 	
 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
 	    sopeop == RSPQ_SOP_EOP) {
 		/* Small complete packet: copy it out and recycle the buffer. */
 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
 			goto skip_recycle;
 		cl = mtod(m, void *);
 		memcpy(cl, sd->rxsd_cl, len);
 		recycle_rx_buf(adap, fl, fl->cidx);
 		m->m_pkthdr.len = m->m_len = len;
 		m->m_flags = 0;
 		mh->mh_head = mh->mh_tail = m;
 		ret = 1;
 		goto done;
 	} else {
 	skip_recycle:
 		/* Use the free-list buffer itself. */
 		bus_dmamap_unload(fl->entry_tag, sd->map);
 		cl = sd->rxsd_cl;
 		m = sd->m;
 
 		/* Only the first buffer of a packet carries a packet header. */
 		if ((sopeop == RSPQ_SOP_EOP) ||
 		    (sopeop == RSPQ_SOP))
 			flags |= M_PKTHDR;
 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
 		if (fl->zone == zone_pack) {
 			/*
 			 * restore clobbered data pointer
 			 */
 			m->m_data = m->m_ext.ext_buf;
 		} else {
 			m_cljset(m, cl, fl->type);
 		}
 		m->m_len = len;
 	}		
 	switch(sopeop) {
 	case RSPQ_SOP_EOP:
 		ret = 1;
 		/* FALLTHROUGH */
 	case RSPQ_SOP:
 		mh->mh_head = mh->mh_tail = m;
 		m->m_pkthdr.len = len;
 		break;
 	case RSPQ_EOP:
 		ret = 1;
 		/* FALLTHROUGH */
 	case RSPQ_NSOP_NEOP:
 		if (mh->mh_tail == NULL) {
 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
 			m_freem(m);
 			/*
 			 * m has just been freed, so bypass the debug print
 			 * below, which would dereference it.
 			 * NOTE(review): for RSPQ_EOP ret is still 1 here
 			 * although nothing was added to the chain; confirm
 			 * the caller copes with that.
 			 */
 			goto done;
 		}
 		mh->mh_tail->m_next = m;
 		mh->mh_tail = m;
 		mh->mh_head->m_pkthdr.len += len;
 		break;
 	}
 	if (cxgb_debug)
 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
 done:
 	/* Advance the consumer index, wrapping at the end of the ring. */
 	if (++fl->cidx == fl->size)
 		fl->cidx = 0;
 
 	return (ret);
 }
 
 /**
  *	handle_rsp_cntrl_info - handles control information in a response
  *	@qs: the queue set corresponding to the response
  *	@flags: the response control flags
  *
  *	Applies the Tx control information found in an SGE response: GTS
  *	indications (only when USE_GTS is set) and returned completion
  *	credits, which are added to the matching Tx queue's processed count.
  *	HW coalesces credits, we don't do any extra SW coalescing.
  */
 static __inline void
 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
 {
 	struct sge_txq *txq = qs->txq;
 	unsigned int returned;

 	/* Ethernet queue: credit field 0. */
 #if USE_GTS
 	if (flags & F_RSPD_TXQ0_GTS)
 		clear_bit(TXQ_RUNNING, &txq[TXQ_ETH].flags);
 #endif
 	returned = G_RSPD_TXQ0_CR(flags);
 	if (returned != 0)
 		txq[TXQ_ETH].processed += returned;

 	/* Control queue: credit field 2. */
 	returned = G_RSPD_TXQ2_CR(flags);
 	if (returned != 0)
 		txq[TXQ_CTRL].processed += returned;

 	/* Offload queue: credit field 1. */
 #if USE_GTS
 	if (flags & F_RSPD_TXQ1_GTS)
 		clear_bit(TXQ_RUNNING, &txq[TXQ_OFLD].flags);
 #endif
 	returned = G_RSPD_TXQ1_CR(flags);
 	if (returned != 0)
 		txq[TXQ_OFLD].processed += returned;
 }
 
 /*
  * Placeholder called by process_responses() when a response carried GTS
  * bits.  The body performs no work.
  */
 static void
 check_ring_db(adapter_t *adap, struct sge_qset *qs,
     unsigned int sleeping)
 {
 	/* nothing to do */
 }
 
 /**
  *	process_responses - process responses from an SGE response queue
  *	@adap: the adapter
  *	@qs: the queue set to which the response queue belongs
  *	@budget: how many responses can be processed in this round
  *
  *	Process responses from an SGE response queue up to the supplied budget.
  *	Responses include received packets as well as credits and other events
  *	for the queues that belong to the response queue's queue set.
  *	A negative budget is effectively unlimited.
  *
  *	Additionally choose the interrupt holdoff time for the next interrupt
  *	on this queue.  If the system is under memory shortage use a fairly
  *	long delay to help recovery.
  *
  *	Returns the supplied budget minus what is left of it, i.e. the number
  *	of responses accounted for in this call.
  */
 static int
 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
 {
 	struct sge_rspq *rspq = &qs->rspq;
 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
 	int budget_left = budget;
 	unsigned int sleeping = 0;
 	int lro_enabled = qs->lro.enabled;
 	int skip_lro;
 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
 	int ngathered = 0;
 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
 #ifdef DEBUG	
 	static int last_holdoff = 0;
 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
 		last_holdoff = rspq->holdoff_tmr;
 	}
 #endif
 	/* Default holdoff; overridden below on mbuf shortage. */
 	rspq->next_holdoff = rspq->holdoff_tmr;
 
 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
 		int eth, eop = 0, ethpad = 0;
 		uint32_t flags = ntohl(r->flags);
 		/* First 32 bits of the descriptor, read raw. */
 		uint32_t rss_csum = *(const uint32_t *)r;
 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
 		
 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
 		
 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
 			struct mbuf *m;
 
 			if (cxgb_debug)
 				printf("async notification\n");
 
 			/*
 			 * NOTE(review): when mh->mh_head is already set, the
 			 * mbuf allocated in the else branch is filled in below
 			 * but not linked into mh; confirm who owns it.
 			 */
 			if (mh->mh_head == NULL) {
 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
 				m = mh->mh_head;
 			} else {
 				m = m_gethdr(M_DONTWAIT, MT_DATA);
 			}
 			if (m == NULL)
 				goto no_mem;
 
                         memcpy(mtod(m, char *), r, AN_PKT_SIZE);
 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
                         *mtod(m, char *) = CPL_ASYNC_NOTIF;
 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
 			eop = 1;
                         rspq->async_notif++;
 			goto skip;
 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
 			struct mbuf *m = NULL;
 
 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
 			    r->rss_hdr.opcode, rspq->cidx);
 			/*
 			 * NOTE(review): in the else case m is allocated but
 			 * get_imm_packet() below is handed mh->mh_head, not
 			 * m; confirm m is not leaked.
 			 */
 			if (mh->mh_head == NULL)
 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
                         else 
 				m = m_gethdr(M_DONTWAIT, MT_DATA);
 
 			if (mh->mh_head == NULL &&  m == NULL) {	
 		no_mem:
 				/*
 				 * Out of mbufs: ask for a long holdoff and
 				 * stop.  The descriptor is not consumed
 				 * (cidx is not advanced) but the budget is
 				 * still charged for it.
 				 */
 				rspq->next_holdoff = NOMEM_INTR_DELAY;
 				budget_left--;
 				break;
 			}
 			get_imm_packet(adap, r, mh->mh_head);
 			eop = 1;
 			rspq->imm_data++;
 		} else if (r->len_cq) {
 			/* Packet data sits in a free-list buffer. */
 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
 			
 			eop = get_packet(adap, drop_thresh, qs, mh, r);
 			if (eop) {
 				if (r->rss_hdr.hash_type && !adap->timestamp)
 					mh->mh_head->m_flags |= M_FLOWID;
 				mh->mh_head->m_pkthdr.flowid = rss_hash;
 			}
 			
 			ethpad = 2;
 		} else {
 			/* No data at all: the response only carries flags. */
 			rspq->pure_rsps++;
 		}
 	skip:
 		if (flags & RSPD_CTRL_MASK) {
 			sleeping |= flags & RSPD_GTS_MASK;
 			handle_rsp_cntrl_info(qs, flags);
 		}
 
 		/* Advance to the next descriptor, flipping gen on wrap. */
 		r++;
 		if (__predict_false(++rspq->cidx == rspq->size)) {
 			rspq->cidx = 0;
 			rspq->gen ^= 1;
 			r = rspq->desc;
 		}
 
 		/* Return response-queue credits in batches of 64. */
 		if (++rspq->credits >= 64) {
 			refill_rspq(adap, rspq, rspq->credits);
 			rspq->credits = 0;
 		}
 		if (!eth && eop) {
 			/* Complete non-Ethernet packet: offload path. */
 			mh->mh_head->m_pkthdr.csum_data = rss_csum;
 			/*
 			 * XXX size mismatch
 			 */
 			m_set_priority(mh->mh_head, rss_hash);
 
 			
 			ngathered = rx_offload(&adap->tdev, rspq,
 			    mh->mh_head, offload_mbufs, ngathered);
 			mh->mh_head = NULL;
 			DPRINTF("received offload packet\n");
 			
 		} else if (eth && eop) {
 			struct mbuf *m = mh->mh_head;
 
 			t3_rx_eth(adap, rspq, m, ethpad);
 
 			/*
 			 * The T304 sends incoming packets on any qset.  If LRO
 			 * is also enabled, we could end up sending packet up
 			 * lro_ctrl->ifp's input.  That is incorrect.
 			 *
 			 * The mbuf's rcvif was derived from the cpl header and
 			 * is accurate.  Skip LRO and just use that.
 			 */
 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
 
 			/*
 			 * NOTE(review): without INET the tcp_lro_rx() term is
 			 * compiled out, so this condition can be true with
 			 * nothing having queued the mbuf; confirm that
 			 * configuration cannot reach here with LRO enabled.
 			 */
 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
 #ifdef INET
 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
 #endif
 			    ) {
 				/* successfully queue'd for LRO */
 			} else {
 				/*
 				 * LRO not enabled, packet unsuitable for LRO,
 				 * or unable to queue.  Pass it up right now in
 				 * either case.
 				 */
 				struct ifnet *ifp = m->m_pkthdr.rcvif;
 				(*ifp->if_input)(ifp, m);
 			}
 			mh->mh_head = NULL;
 
 		}
 		__refill_fl_lt(adap, &qs->fl[0], 32);
 		__refill_fl_lt(adap, &qs->fl[1], 32);
 		--budget_left;
 	}
 
 	/* Hand over whatever offload packets did not fill a whole bundle. */
 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
 
 #ifdef INET
 	/* Flush LRO */
 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
 		tcp_lro_flush(lro_ctrl, queued);
 	}
 #endif
 
 	if (sleeping)
 		check_ring_db(adap, qs, sleeping);
 
 	mb();  /* commit Tx queue processed updates */
 	/* Any stopped-bit above bit 0 set: try to restart those queues. */
 	if (__predict_false(qs->txq_stopped > 1))
 		restart_tx(qs);
 
 	__refill_fl_lt(adap, &qs->fl[0], 512);
 	__refill_fl_lt(adap, &qs->fl[1], 512);
 	budget -= budget_left;
 	return (budget);
 }
 
 /*
  * Process every pending response on a response queue (unlimited budget)
  * and then write the GTS register with the queue's next holdoff timer and
  * its new consumer index.  Returns the count from process_responses().
  */
 static __inline int
 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
 {
 	static int last_holdoff = 0;
 	struct sge_qset *qs = rspq_to_qset(rq);
 	int nhandled;

 	nhandled = process_responses(adap, qs, -1);

 	/* Debug aid: report the holdoff value whenever it changes. */
 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
 		printf("next_holdoff=%d\n", rq->next_holdoff);
 		last_holdoff = rq->next_holdoff;
 	}

 	t3_write_reg(adap, A_SG_GTS,
 	    V_RSPQ(rq->cntxt_id) |
 	    V_NEWTIMER(rq->next_holdoff) |
 	    V_NEWINDEX(rq->cidx));

 	return (nhandled);
 }
 
 
 /*
  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
  * Handles data events from SGE response queues as well as error and other
  * async events as they all use the same interrupt pin.  We use one SGE
  * response queue per port in this mode and protect all response queues with
  * queue 0's lock.
  */
 void
 t3b_intr(void *data)
 {
 	uint32_t i, map;
 	adapter_t *adap = data;
 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
 	
 	t3_write_reg(adap, A_PL_CLI, 0);
 	map = t3_read_reg(adap, A_SG_DATA_INTR);
 
 	if (!map) 
 		return;
 
 	if (__predict_false(map & F_ERRINTR)) {
 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
 	}
 
 	mtx_lock(&q0->lock);
 	for_each_port(adap, i)
 	    if (map & (1 << i))
 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
 	mtx_unlock(&q0->lock);
 }
 
 /*
  * The MSI interrupt handler.  This needs to handle data events from SGE
  * response queues as well as error and other async events as they all use
  * the same MSI vector.  We use one SGE response queue per port in this mode
  * and protect all response queues with queue 0's lock.
  */
 void
 t3_intr_msi(void *data)
 {
 	adapter_t *adap = data;
 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
 	int i, new_packets = 0;
 
 	mtx_lock(&q0->lock);
 
 	for_each_port(adap, i)
 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq)) 
 		    new_packets = 1;
 	mtx_unlock(&q0->lock);
 	if (new_packets == 0) {
 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
 	}
 }
 
 /*
  * MSI-X interrupt handler for a single queue set: process its response
  * queue and count the interrupt as unhandled when nothing was processed.
  */
 void
 t3_intr_msix(void *data)
 {
 	struct sge_qset *qs = data;
 	struct sge_rspq *rspq = &qs->rspq;
 	adapter_t *adap = qs->port->adapter;
 	int nhandled;

 	nhandled = process_responses_gts(adap, rspq);
 	if (nhandled == 0)
 		rspq->unhandled_irqs++;
 }
 
 /*
  * Buffer size used for the queue-dump sysctl sbufs.  Parenthesized so the
  * macro expands to a single value in any expression.
  */
 #define QDUMP_SBUF_SIZE		(32 * 400)
 /*
  * Sysctl handler that dumps a window of a response queue.  arg1 is the
  * struct sge_rspq; the window is given by its rspq_dump_start and
  * rspq_dump_count fields.  Returns 0 without output when the count is 0,
  * EINVAL (after resetting the offending field) when the count or start is
  * out of range for RSPQ_Q_SIZE, and otherwise the result of reading the
  * context or emitting the text.
  */
 static int
 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
 {
 	struct sge_rspq *rspq;
 	struct sge_qset *qs;
 	int i, err, dump_end, idx;
-	static int multiplier = 1;
 	struct sbuf *sb;
 	struct rsp_desc *rspd;
 	uint32_t data[4];
 	
 	rspq = arg1;
 	qs = rspq_to_qset(rspq);
 	if (rspq->rspq_dump_count == 0) 
 		return (0);
 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
 		log(LOG_WARNING,
 		    "dump count is too large %d\n", rspq->rspq_dump_count);
 		rspq->rspq_dump_count = 0;
 		return (EINVAL);
 	}
 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
 		log(LOG_WARNING,
 		    "dump start of %d is greater than queue size\n",
 		    rspq->rspq_dump_start);
 		rspq->rspq_dump_start = 0;
 		return (EINVAL);
 	}
 	/* Fetch the response queue context words that are decoded below. */
 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
 	if (err)
 		return (err);
-retry_sbufops:
-	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
 
 	/* NOTE(review): the result of sbuf_new_for_sysctl() is not checked. */
+	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
+
 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
 	
 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
 	
 	/* Walk the window; indices wrap modulo RSPQ_Q_SIZE. */
 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
 		idx = i & (RSPQ_Q_SIZE-1);
 		
 		rspd = &rspq->desc[idx];
 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
 		    be32toh(rspd->len_cq), rspd->intr_gen);
 	}
-	if (sbuf_overflowed(sb)) {
-		sbuf_delete(sb);
-		multiplier++;
-		goto retry_sbufops;
-	}
-	sbuf_finish(sb);
-	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+
+	err = sbuf_finish(sb);
+	/* Output a trailing NUL. */
+	if (err == 0)
+		err = SYSCTL_OUT(req, "", 1);
 	sbuf_delete(sb);
 	return (err);
 }	
 
 /*
  * Sysctl handler that dumps a window of an Ethernet Tx queue.  arg1 is the
  * struct sge_txq; the window is given by its txq_dump_start and
  * txq_dump_count fields.  Returns 0 without output when the count is 0,
  * EINVAL (after resetting the offending field) when the count or start is
  * out of range for TX_ETH_Q_SIZE, and otherwise the result of reading the
  * context or emitting the text.
  */
 static int
 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
 {
 	struct sge_txq *txq;
 	struct sge_qset *qs;
 	int i, j, err, dump_end;
-	static int multiplier = 1;
 	struct sbuf *sb;
 	struct tx_desc *txd;
 	uint32_t *WR, wr_hi, wr_lo, gen;
 	uint32_t data[4];
 	
 	txq = arg1;
 	qs = txq_to_qset(txq, TXQ_ETH);
 	if (txq->txq_dump_count == 0) {
 		return (0);
 	}
 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
 		log(LOG_WARNING,
 		    "dump count is too large %d\n", txq->txq_dump_count);
 		txq->txq_dump_count = 1;
 		return (EINVAL);
 	}
 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
 		log(LOG_WARNING,
 		    "dump start of %d is greater than queue size\n",
 		    txq->txq_dump_start);
 		txq->txq_dump_start = 0;
 		return (EINVAL);
 	}
 	/*
 	 * NOTE(review): the context id passed here is the response queue's
 	 * (qs->rspq.cntxt_id), not txq->cntxt_id -- confirm this is the
 	 * intended egress context.
 	 */
 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
 	if (err)
 		return (err);
 	
-	    
-retry_sbufops:
-	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
 	/* NOTE(review): the result of sbuf_new_for_sysctl() is not checked. */
+	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
 
 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16), 
 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
 	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
 	    txq->txq_dump_start,
 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
 
 	/* Walk the window; indices wrap modulo TX_ETH_Q_SIZE. */
 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
 	for (i = txq->txq_dump_start; i < dump_end; i++) {
 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
 		/* View the descriptor's flits as 32-bit words. */
 		WR = (uint32_t *)txd->flit;
 		wr_hi = ntohl(WR[0]);
 		wr_lo = ntohl(WR[1]);		
 		gen = G_WR_GEN(wr_lo);
 		
 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
 		    wr_hi, wr_lo, gen);
 		/* Words 2..29 are printed raw, four per line. */
 		for (j = 2; j < 30; j += 4) 
 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
 
 	}
-	if (sbuf_overflowed(sb)) {
-		sbuf_delete(sb);
-		multiplier++;
-		goto retry_sbufops;
-	}
-	sbuf_finish(sb);
-	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+	err = sbuf_finish(sb);
+	/* Output a trailing NUL. */
+	if (err == 0)
+		err = SYSCTL_OUT(req, "", 1);
 	sbuf_delete(sb);
 	return (err);
 }
 
 /*
  * Sysctl handler that dumps a window of a control Tx queue.  arg1 is the
  * struct sge_txq; the window is given by its txq_dump_start and
  * txq_dump_count fields.  The queue is treated as having 256 entries.
  * Returns 0 without output when the count is 0, EINVAL (after resetting the
  * offending field) when the count or start is out of range, and otherwise
  * the result of emitting the text.
  */
 static int
 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
 {
 	struct sge_txq *txq;
 	struct sge_qset *qs;
 	int i, j, err, dump_end;
-	static int multiplier = 1;
 	struct sbuf *sb;
 	struct tx_desc *txd;
 	uint32_t *WR, wr_hi, wr_lo, gen;
 	
 	txq = arg1;
 	qs = txq_to_qset(txq, TXQ_CTRL);
 	if (txq->txq_dump_count == 0) {
 		return (0);
 	}
 	if (txq->txq_dump_count > 256) {
 		log(LOG_WARNING,
 		    "dump count is too large %d\n", txq->txq_dump_count);
 		txq->txq_dump_count = 1;
 		return (EINVAL);
 	}
 	if (txq->txq_dump_start > 255) {
 		log(LOG_WARNING,
 		    "dump start of %d is greater than queue size\n",
 		    txq->txq_dump_start);
 		txq->txq_dump_start = 0;
 		return (EINVAL);
 	}
 
-retry_sbufops:
-	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
 	/* NOTE(review): the result of sbuf_new_for_sysctl() is not checked. */
+	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
 	    txq->txq_dump_start,
 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
 
 	/* Walk the window; indices wrap modulo 256. */
 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
 	for (i = txq->txq_dump_start; i < dump_end; i++) {
 		txd = &txq->desc[i & (255)];
 		/* View the descriptor's flits as 32-bit words. */
 		WR = (uint32_t *)txd->flit;
 		wr_hi = ntohl(WR[0]);
 		wr_lo = ntohl(WR[1]);		
 		gen = G_WR_GEN(wr_lo);
 		
 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
 		    wr_hi, wr_lo, gen);
 		/* Words 2..29 are printed raw, four per line. */
 		for (j = 2; j < 30; j += 4) 
 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
 
 	}
-	if (sbuf_overflowed(sb)) {
-		sbuf_delete(sb);
-		multiplier++;
-		goto retry_sbufops;
-	}
-	sbuf_finish(sb);
-	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+	err = sbuf_finish(sb);
+	/* Output a trailing NUL. */
+	if (err == 0)
+		err = SYSCTL_OUT(req, "", 1);
 	sbuf_delete(sb);
 	return (err);
 }
 
 /*
  * Sysctl handler for the adapter-wide interrupt coalescing timer, in
  * microseconds.
  *
  * arg1 is the adapter.  Reads report the value stored for qset 0.  A write
  * that changes the value is clamped to a minimum of 1 and then applied to
  * every qset of every port: the stored parameter is updated and the response
  * queue's hold-off timer is reprogrammed under the response queue lock.
  *
  * Returns ENXIO if the adapter is not fully initialized, the error from
  * sysctl_handle_int() if that fails, and 0 otherwise.
  */
 static int
 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
 {
 	adapter_t *sc = arg1;
 	struct qset_params *qsp = &sc->params.sge.qset[0];
 	struct sge_qset *qs;
 	struct mtx *lock;
 	int coalesce_usecs, err, i, nqsets;
 
 	if ((sc->flags & FULL_INIT_DONE) == 0)
 		return (ENXIO);
 
 	coalesce_usecs = qsp->coalesce_usecs;
 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
 	if (err != 0)
 		return (err);
 	if (coalesce_usecs == qsp->coalesce_usecs)
 		return (0);
 
 	/*
 	 * Clamp with an explicit signed comparison.  The kernel's max()
 	 * helper compares its operands as u_int, so a negative request
 	 * would pass through it unclamped.
 	 */
 	if (coalesce_usecs < 1)
 		coalesce_usecs = 1;
 
 	/* Total number of qsets in use across all ports. */
 	nqsets = 0;
 	for (i = 0; i < sc->params.nports; i++)
 		nqsets += sc->port[i].nqsets;
 
 	for (i = 0; i < nqsets; i++) {
 		qs = &sc->sge.qs[i];
 		qsp = &sc->params.sge.qset[i];
 		qsp->coalesce_usecs = coalesce_usecs;
 
 		/* Without MSI-X every qset is serialized by qset 0's lock. */
 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
 		    &sc->sge.qs[0].rspq.lock;
 
 		mtx_lock(lock);
 		t3_update_qset_coalesce(qs, qsp);
 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
 		mtx_unlock(lock);
 	}
 
 	return (0);
 }
 
 static int
 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
 {
 	adapter_t *sc = arg1;
 	int rc, timestamp;
 
 	if ((sc->flags & FULL_INIT_DONE) == 0)
 		return (ENXIO);
 
 	timestamp = sc->timestamp;
 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
 
 	if (rc != 0)
 		return (rc);
 
 	if (timestamp != sc->timestamp) {
 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
 		sc->timestamp = timestamp;
 	}
 
 	return (0);
 }
 
 /*
  * Register the sysctl entries that hang directly off the device's sysctl
  * tree: firmware version, hardware revision, port types, the debug switch
  * and a few adapter-wide counters.
  */
 void
 t3_add_attach_sysctls(adapter_t *sc)
 {
 	struct sysctl_ctx_list *sctx = device_get_sysctl_ctx(sc->dev);
 	struct sysctl_oid_list *top =
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
 
 	/* random information */
 	SYSCTL_ADD_STRING(sctx, top, OID_AUTO, "firmware_version",
 	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
 	SYSCTL_ADD_INT(sctx, top, OID_AUTO, "hw_revision",
 	    CTLFLAG_RD, &sc->params.rev, 0, "chip model");
 	SYSCTL_ADD_STRING(sctx, top, OID_AUTO, "port_types",
 	    CTLFLAG_RD, &sc->port_types, 0, "type of ports");
 	SYSCTL_ADD_INT(sctx, top, OID_AUTO, "enable_debug",
 	    CTLFLAG_RW, &cxgb_debug, 0, "enable verbose debugging output");
 	SYSCTL_ADD_QUAD(sctx, top, OID_AUTO, "tunq_coalesce",
 	    CTLFLAG_RD, &sc->tunq_coalesce, "#tunneled packets freed");
 	SYSCTL_ADD_INT(sctx, top, OID_AUTO, "txq_overrun",
 	    CTLFLAG_RD, &txq_fills, 0, "#times txq overrun");
 	SYSCTL_ADD_INT(sctx, top, OID_AUTO, "core_clock",
 	    CTLFLAG_RD, &sc->params.vpd.cclk, 0,
 	    "core clock frequency (in KHz)");
 }
 
 
 /* Sysctl node name used for each qset's response queue. */
 static const char *rspq_name = "rspq";
 /*
  * Sysctl node names for the Tx queues.  t3_add_configured_sysctls() uses
  * entry 0 for the Ethernet queue node and entry 2 for the control queue node.
  */
 static const char *txq_names[] =
 {
 	"txq_eth",
 	"txq_ofld",
 	"txq_ctrl"	
 };
 
 static int
 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *p = arg1;
 	uint64_t *parg;
 
 	if (!p)
 		return (EINVAL);
 
 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
 	PORT_LOCK(p);
 	t3_mac_update_stats(&p->mac);
 	PORT_UNLOCK(p);
 
 	return (sysctl_handle_quad(oidp, parg, 0, req));
 }
 
 /*
  * Register the sysctl entries that depend on the adapter's port and queue
  * configuration.
  *
  * Adds the adapter-wide "intr_coal" and "pkt_timestamp" handlers, then for
  * each port a "port%d" node holding "nqsets", one "qs%d" node per queue set
  * (with rspq, txq_eth, txq_ctrl and lro_stats sub-nodes) and a "mac_stats"
  * node with the MAC counters.
  */
 void
 t3_add_configured_sysctls(adapter_t *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid_list *children;
 	int i, j;
 	
 	ctx = device_get_sysctl_ctx(sc->dev);
 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 	    "intr_coal",
 	    CTLTYPE_INT|CTLFLAG_RW, sc,
 	    0, t3_set_coalesce_usecs,
 	    "I", "interrupt coalescing timer (us)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 	    "pkt_timestamp",
 	    CTLTYPE_INT | CTLFLAG_RW, sc,
 	    0, t3_pkt_timestamp,
 	    "I", "provide packet timestamp instead of connection hash");
 
 	/* One sysctl sub-tree per port. */
 	for (i = 0; i < sc->params.nports; i++) {
 		struct port_info *pi = &sc->port[i];
 		struct sysctl_oid *poid;
 		struct sysctl_oid_list *poidlist;
 		struct mac_stats *mstats = &pi->mac.stats;
 		
 		/* The node name lives in the port_info itself. */
 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, 
 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
 		poidlist = SYSCTL_CHILDREN(poid);
 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO, 
 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
 		    0, "#queue sets");
 
 		/* One sub-tree per queue set owned by this port. */
 		for (j = 0; j < pi->nqsets; j++) {
 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
 					  *ctrlqpoid, *lropoid;
 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
 					       *txqpoidlist, *ctrlqpoidlist,
 					       *lropoidlist;
 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
 			
 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
 			
 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, 
 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
 			qspoidlist = SYSCTL_CHILDREN(qspoid);
 
 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
 					CTLFLAG_RD, &qs->fl[0].empty, 0,
 					"freelist #0 empty");
 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
 					CTLFLAG_RD, &qs->fl[1].empty, 0,
 					"freelist #1 empty");
 
 			/* Container nodes for the per-queue entries below. */
 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, 
 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
 
 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, 
 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
 
 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, 
 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
 
 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, 
 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
 			lropoidlist = SYSCTL_CHILDREN(lropoid);
 
 			/* Response queue state and dump controls. */
 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
 			    CTLFLAG_RD, &qs->rspq.size,
 			    0, "#entries in response queue");
 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
 			    CTLFLAG_RD, &qs->rspq.cidx,
 			    0, "consumer index");
 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
 			    CTLFLAG_RD, &qs->rspq.credits,
 			    0, "#credits");
 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
 			    CTLFLAG_RD, &qs->rspq.starved,
 			    0, "#times starved");
 			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
 			    CTLFLAG_RD, &qs->rspq.phys_addr,
 			    "physical_address_of the queue");
 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
 			    0, "start rspq dump entry");
 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
 			    0, "#rspq entries to dump");
 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
 			    0, t3_dump_rspq, "A", "dump of the response queue");
 
 			/* Ethernet Tx queue state and dump controls. */
 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
 			    "#tunneled packets dropped");
 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
 			    0, "#tunneled packets waiting to be sent");
 #if 0			
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
 			    0, "#tunneled packets queue producer index");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
 			    0, "#tunneled packets queue consumer index");
 #endif			
 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
 			    0, "#tunneled packets processed by the card");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
 			    CTLFLAG_RD, &txq->cleaned,
 			    0, "#tunneled packets cleaned");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
 			    CTLFLAG_RD, &txq->in_use,
 			    0, "#tunneled packet slots in use");
 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
 			    CTLFLAG_RD, &txq->txq_frees,
 			    "#tunneled packets freed");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
 			    CTLFLAG_RD, &txq->txq_skipped,
 			    0, "#tunneled packet descriptors skipped");
 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
 			    CTLFLAG_RD, &txq->txq_coalesced,
 			    "#tunneled packets coalesced");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
 			    CTLFLAG_RD, &txq->txq_enqueued,
 			    0, "#tunneled packets enqueued to hardware");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
 			    CTLFLAG_RD, &qs->txq_stopped,
 			    0, "tx queues stopped");
 			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
 			    CTLFLAG_RD, &txq->phys_addr,
 			    "physical_address_of the queue");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
 			    0, "txq generation");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
 			    CTLFLAG_RD, &txq->cidx,
 			    0, "hardware queue cidx");			
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
 			    CTLFLAG_RD, &txq->pidx,
 			    0, "hardware queue pidx");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
 			    0, "txq start idx for dump");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
 			    0, "txq #entries to dump");			
 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
 
 			/* Control Tx queue dump controls. */
 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
 			    0, "ctrlq start idx for dump");
 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
 			    0, "ctrl #entries to dump");			
 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
 			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
 
 			/* LRO counters for this queue set. */
 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
 		}
 
 		/* Now add a node for mac stats. */
 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
 		    CTLFLAG_RD, NULL, "MAC statistics");
 		/* From here on poidlist refers to the mac_stats node's children. */
 		poidlist = SYSCTL_CHILDREN(poid);
 
 		/*
 		 * We (ab)use the length argument (arg2) to pass on the offset
 		 * of the data that we are interested in.  This is only required
 		 * for the quad counters that are updated from the hardware (we
 		 * make sure that we return the latest value).
 		 * sysctl_handle_macstat first updates *all* the counters from
 		 * the hardware, and then returns the latest value of the
 		 * requested counter.  Best would be to update only the
 		 * requested counter from hardware, but t3_mac_update_stats()
 		 * hides all the register details and we don't want to dive into
 		 * all that here.
 		 */
 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
     (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
     sysctl_handle_macstat, "QU", 0)
 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
 		CXGB_SYSCTL_ADD_QUAD(rx_short);
 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
 #undef CXGB_SYSCTL_ADD_QUAD
 
 		/*
 		 * These counters are exported by address, so reading them does
 		 * not go through sysctl_handle_macstat.
 		 */
 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
     CTLFLAG_RD, &mstats->a, 0)
 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
 		CXGB_SYSCTL_ADD_ULONG(num_resets);
 		CXGB_SYSCTL_ADD_ULONG(link_faults);
 #undef CXGB_SYSCTL_ADD_ULONG
 	}
 }
 	
 /**
  *	t3_get_desc - dump an SGE descriptor for debugging purposes
  *	@qs: the queue set
  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
  *	@idx: the descriptor index in the queue
  *	@data: where to dump the descriptor contents
  *
  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
  *	size of the descriptor on success.  On failure (unknown queue number,
  *	queue without a descriptor ring, or index past the end of the ring)
  *	returns -EINVAL; the error is negative so that it can never be
  *	mistaken for a descriptor size.
  */
 int
 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
 		unsigned char *data)
 {
 	if (qnum >= 6)
 		return (-EINVAL);
 
 	/* Queues 0..2 are the Tx queues. */
 	if (qnum < 3) {
 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
 			return (-EINVAL);
 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
 		return (sizeof(struct tx_desc));
 	}
 
 	/* Queue 3 is the response queue. */
 	if (qnum == 3) {
 		if (!qs->rspq.desc || idx >= qs->rspq.size)
 			return (-EINVAL);
 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
 		return (sizeof(struct rsp_desc));
 	}
 
 	/* Queues 4..5 map to free lists 0..1. */
 	qnum -= 4;
 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
 		return (-EINVAL);
 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
 	return (sizeof(struct rx_desc));
 }
Index: head/sys/kern/kern_malloc.c
===================================================================
--- head/sys/kern/kern_malloc.c	(revision 212369)
+++ head/sys/kern/kern_malloc.c	(revision 212370)
@@ -1,1056 +1,1016 @@
 /*-
  * Copyright (c) 1987, 1991, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005-2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_malloc.c	8.3 (Berkeley) 1/4/94
  */
 
 /*
  * Kernel malloc(9) implementation -- general purpose kernel memory allocator
  * based on memory types.  Back end is implemented using the UMA(9) zone
  * allocator.  A set of fixed-size buckets are used for smaller allocations,
  * and a special UMA allocation interface is used for larger allocations.
  * Callers declare memory types, and statistics are maintained independently
  * for each memory type.  Statistics are maintained per-CPU for performance
  * reasons.  See malloc(9) and comments in malloc.h for a detailed
  * description.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kdtrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/vmmeter.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 #ifdef DEBUG_REDZONE
 #include <vm/redzone.h>
 #endif
 
 #if defined(INVARIANTS) && defined(__i386__)
 #include <machine/cpu.h>
 #endif
 
 #include <ddb/ddb.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_malloc_probe_func_t	dtrace_malloc_probe;
 #endif
 
 /*
  * When realloc() is called, if the new size is sufficiently smaller than
  * the old size, realloc() will allocate a new, smaller block to avoid
  * wasting memory. 'Sufficiently smaller' is defined as: newsize <=
  * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'.
  */
 #ifndef REALLOC_FRACTION
 #define	REALLOC_FRACTION	1	/* new block if <= half the size */
 #endif
 
 /*
  * Centrally define some common malloc types.
  */
 MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
 MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
 MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
 
 MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
 MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
 
 static void kmeminit(void *);
 SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL);
 
 static MALLOC_DEFINE(M_FREE, "free", "should be on free list");
 
 static struct malloc_type *kmemstatistics;
 static vm_offset_t kmembase;
 static vm_offset_t kmemlimit;
 static int kmemcount;
 
 /*
  * Requests of up to KMEM_ZMAX bytes are rounded up to a multiple of
  * KMEM_ZBASE by malloc(); kmemsize[] is then indexed by the rounded size
  * shifted right by KMEM_ZSHIFT and yields an index into kmemzones[].
  */
 #define KMEM_ZSHIFT	4
 #define KMEM_ZBASE	16
 #define KMEM_ZMASK	(KMEM_ZBASE - 1)
 
 #define KMEM_ZMAX	PAGE_SIZE
 #define KMEM_ZSIZE	(KMEM_ZMAX >> KMEM_ZSHIFT)
 static uint8_t kmemsize[KMEM_ZSIZE + 1];
 
 #ifndef MALLOC_DEBUG_MAXZONES
 #define	MALLOC_DEBUG_MAXZONES	1
 #endif
 static int numzones = MALLOC_DEBUG_MAXZONES;
 
 /*
  * Small malloc(9) memory allocations are allocated from a set of UMA buckets
  * of various sizes.
  *
  * XXX: The comment here used to read "These won't be powers of two for
  * long."  It's possible that a significant amount of wasted memory could be
  * recovered by tuning the sizes of these buckets.
  */
 struct {
 	int kz_size;		/* bucket size in bytes */
 	char *kz_name;		/* bucket name: the size as a string */
 	/* One zone per malloc subzone; malloc() indexes this by mti_zone. */
 	uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES];
 } kmemzones[] = {
 	{16, "16", },
 	{32, "32", },
 	{64, "64", },
 	{128, "128", },
 	{256, "256", },
 	{512, "512", },
 	{1024, "1024", },
 	{2048, "2048", },
 	{4096, "4096", },
 	/* Larger buckets are only compiled in when PAGE_SIZE is larger. */
 #if PAGE_SIZE > 4096
 	{8192, "8192", },
 #if PAGE_SIZE > 8192
 	{16384, "16384", },
 #if PAGE_SIZE > 16384
 	{32768, "32768", },
 #if PAGE_SIZE > 32768
 	{65536, "65536", },
 #if PAGE_SIZE > 65536
 #error	"Unsupported PAGE_SIZE"
 #endif	/* 65536 */
 #endif	/* 32768 */
 #endif	/* 16384 */
 #endif	/* 8192 */
 #endif	/* 4096 */
 	/* NOTE(review): presumably the end-of-table sentinel -- confirm. */
 	{0, NULL},
 };
 
 /*
  * Zone to allocate malloc type descriptions from.  For ABI reasons, memory
  * types are described by a data structure passed by the declaring code, but
  * the malloc(9) implementation has its own data structure describing the
  * type and statistics.  This permits the malloc(9)-internal data structures
  * to be modified without breaking binary-compiled kernel modules that
  * declare malloc types.
  */
 static uma_zone_t mt_zone;
 
 u_long vm_kmem_size;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RD, &vm_kmem_size, 0,
     "Size of kernel memory");
 
 static u_long vm_kmem_size_min;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RD, &vm_kmem_size_min, 0,
     "Minimum size of kernel memory");
 
 static u_long vm_kmem_size_max;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RD, &vm_kmem_size_max, 0,
     "Maximum size of kernel memory");
 
 static u_int vm_kmem_size_scale;
 SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RD, &vm_kmem_size_scale, 0,
     "Scale factor for kernel memory size");
 
 /*
  * The malloc_mtx protects the kmemstatistics linked list.
  */
 struct mtx malloc_mtx;
 
 #ifdef MALLOC_PROFILE
 uint64_t krequests[KMEM_ZSIZE + 1];
 
 static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
 #endif
 
 static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
 
 /*
  * time_uptime of the last malloc(9) failure (induced or real).
  */
 static time_t t_malloc_fail;
 
 #if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1)
 SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
     "Kernel malloc debugging options");
 #endif
 
 /*
  * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
  * the caller specifies M_NOWAIT.  If set to 0, no failures are caused.
  */
 #ifdef MALLOC_MAKE_FAILURES
 static int malloc_failure_rate;
 static int malloc_nowait_count;
 static int malloc_failure_count;
 SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RW,
     &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail");
 TUNABLE_INT("debug.malloc.failure_rate", &malloc_failure_rate);
 SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD,
     &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures");
 #endif
 
 /*
  * malloc(9) uma zone separation -- sub-page buffer overruns in one
  * malloc type will affect only a subset of other malloc types.
  */
 #if MALLOC_DEBUG_MAXZONES > 1
 /*
  * Fetch the "debug.malloc.numzones" tunable into numzones and force the
  * result into the range 1..MALLOC_DEBUG_MAXZONES.
  */
 static void
 tunable_set_numzones(void)
 {
 
 	TUNABLE_INT_FETCH("debug.malloc.numzones", &numzones);
 
 	/* Clamp out-of-range requests to the nearest valid zone count. */
 	if (numzones < 1)
 		numzones = 1;
 	else if (numzones > MALLOC_DEBUG_MAXZONES)
 		numzones = MALLOC_DEBUG_MAXZONES;
 }
 SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL);
 SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN,
     &numzones, 0, "Number of malloc uma subzones");
 
 /*
  * Any number that changes regularly is an okay choice for the
  * offset.  Build numbers are pretty good if you have them.
  */
 static u_int zone_offset = __FreeBSD_version;
 TUNABLE_INT("debug.malloc.zone_offset", &zone_offset);
 SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN,
     &zone_offset, 0, "Separate malloc types by examining the "
     "Nth character in the malloc type short description.");
 
 /*
  * Pick the malloc subzone for a type from its short description: take the
  * character at position (zone_offset modulo the description length) and
  * reduce it modulo numzones.  A NULL or empty description maps to subzone 0.
  */
 static u_int
 mtp_get_subzone(const char *desc)
 {
 	size_t desclen;
 	u_int ch;
 
 	if (desc == NULL)
 		return (0);
 	desclen = strlen(desc);
 	if (desclen == 0)
 		return (0);
 
 	ch = desc[zone_offset % desclen];
 	return (ch % numzones);
 }
 #elif MALLOC_DEBUG_MAXZONES == 0
 #error "MALLOC_DEBUG_MAXZONES must be positive."
 #else
 /*
  * Variant used when MALLOC_DEBUG_MAXZONES is 1: there is a single subzone,
  * so every description maps to index 0.
  */
 static inline u_int
 mtp_get_subzone(const char *desc)
 {
 
 	return (0);
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
 
 /*
  * Return the difference between the current time_uptime and the
  * time_uptime recorded at the last malloc(9) failure.
  */
 int
 malloc_last_fail(void)
 {
 	time_t elapsed;
 
 	elapsed = time_uptime - t_malloc_fail;
 	return (elapsed);
 }
 
 /*
  * Account for a successful allocation in the malloc type's statistics for
  * the current CPU.  A non-zero size bumps the byte and allocation counters;
  * a zone index other than -1 is recorded in the mts_size bitmask.  The
  * update runs inside a critical section so that the thread is neither
  * preempted nor migrated while it touches the per-CPU statistics.
  */
 static void
 malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
     int zindx)
 {
 	struct malloc_type_internal *internal;
 	struct malloc_type_stats *stats;
 
 	critical_enter();
 	internal = mtp->ks_handle;
 	stats = &internal->mti_stats[curcpu];
 	if (size != 0) {
 		stats->mts_memalloced += size;
 		stats->mts_numallocs++;
 	}
 	if (zindx != -1)
 		stats->mts_size |= 1 << zindx;
 
 #ifdef KDTRACE_HOOKS
 	if (dtrace_malloc_probe != NULL) {
 		uint32_t id = internal->mti_probes[DTMALLOC_PROBE_MALLOC];
 
 		/* Fire the probe only if one is registered for this type. */
 		if (id != 0)
 			(dtrace_malloc_probe)(id, (uintptr_t) mtp,
 			    (uintptr_t) internal, (uintptr_t) stats,
 			    size, zindx);
 	}
 #endif
 
 	critical_exit();
 }
 
 /*
  * Account for an allocation that is not tied to a malloc bucket (zone
  * index -1).  Zero-sized allocations are not recorded.
  */
 void
 malloc_type_allocated(struct malloc_type *mtp, unsigned long size)
 {
 
 	if (size == 0)
 		return;
 	malloc_type_zone_allocated(mtp, size, -1);
 }
 
 /*
  * Account for a free in the malloc type's statistics for the current CPU:
  * add the size to the freed-byte counter and bump the free count.  The
  * update runs inside a critical section so that the thread is neither
  * preempted nor migrated while it touches the per-CPU statistics.
  */
 void
 malloc_type_freed(struct malloc_type *mtp, unsigned long size)
 {
 	struct malloc_type_internal *internal;
 	struct malloc_type_stats *stats;
 
 	critical_enter();
 	internal = mtp->ks_handle;
 	stats = &internal->mti_stats[curcpu];
 	stats->mts_memfreed += size;
 	stats->mts_numfrees++;
 
 #ifdef KDTRACE_HOOKS
 	if (dtrace_malloc_probe != NULL) {
 		uint32_t id = internal->mti_probes[DTMALLOC_PROBE_FREE];
 
 		/* Fire the probe only if one is registered for this type. */
 		if (id != 0)
 			(dtrace_malloc_probe)(id, (uintptr_t) mtp,
 			    (uintptr_t) internal, (uintptr_t) stats,
 			    size, 0);
 	}
 #endif
 
 	critical_exit();
 }
 
 /*
  *	malloc:
  *
  *	Allocate a block of memory.
  *
  *	If M_NOWAIT is set, this routine will not block and return NULL if
  *	the allocation fails.
  *
  *	Requests of up to KMEM_ZMAX bytes are served from the UMA bucket
  *	zones in kmemzones[]; larger requests are rounded up to a multiple
  *	of PAGE_SIZE and served by uma_large_malloc().  The per-type
  *	statistics are updated in both cases, and t_malloc_fail is stamped
  *	when a non-M_WAITOK allocation fails.
  */
 void *
 malloc(unsigned long size, struct malloc_type *mtp, int flags)
 {
 	int indx;
 	struct malloc_type_internal *mtip;
 	caddr_t va;
 	uma_zone_t zone;
 #if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
 	/* Caller's requested size, before any rounding or red-zone padding. */
 	unsigned long osize = size;
 #endif
 
 #ifdef INVARIANTS
 	KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
 	/*
 	 * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
 	 */
 	indx = flags & (M_WAITOK | M_NOWAIT);
 	if (indx != M_NOWAIT && indx != M_WAITOK) {
 		static	struct timeval lasterr;
 		static	int curerr, once;
 		/* Complain (and force M_WAITOK) only the first time. */
 		if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
 			printf("Bad malloc flags: %x\n", indx);
 			kdb_backtrace();
 			flags |= M_WAITOK;
 			once++;
 		}
 	}
 #endif
 #ifdef MALLOC_MAKE_FAILURES
 	/* Fault injection: fail every malloc_failure_rate-th M_NOWAIT call. */
 	if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
 		atomic_add_int(&malloc_nowait_count, 1);
 		if ((malloc_nowait_count % malloc_failure_rate) == 0) {
 			atomic_add_int(&malloc_failure_count, 1);
 			t_malloc_fail = time_uptime;
 			return (NULL);
 		}
 	}
 #endif
 	if (flags & M_WAITOK)
 		KASSERT(curthread->td_intr_nesting_level == 0,
 		   ("malloc(M_WAITOK) in interrupt context"));
 
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp(mtp, size)) {
 		va = memguard_alloc(size, flags);
 		if (va != NULL)
 			return (va);
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	size = redzone_size_ntor(size);
 #endif
 
 	if (size <= KMEM_ZMAX) {
 		mtip = mtp->ks_handle;
 		/* Round up to the next multiple of KMEM_ZBASE. */
 		if (size & KMEM_ZMASK)
 			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
 		indx = kmemsize[size >> KMEM_ZSHIFT];
 		KASSERT(mtip->mti_zone < numzones,
 		    ("mti_zone %u out of range %d",
 		    mtip->mti_zone, numzones));
 		zone = kmemzones[indx].kz_zone[mtip->mti_zone];
 #ifdef MALLOC_PROFILE
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc(zone, flags);
 		/* Account for the zone's item size, not the requested size. */
 		if (va != NULL)
 			size = zone->uz_size;
 		malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
 	} else {
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
 		va = uma_large_malloc(size, flags);
 		malloc_type_allocated(mtp, va == NULL ? 0 : size);
 	}
 	if (flags & M_WAITOK)
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
 		t_malloc_fail = time_uptime;
 #ifdef DIAGNOSTIC
 	/* Fill memory that was not requested zeroed with the 0x70 pattern. */
 	if (va != NULL && !(flags & M_ZERO)) {
 		memset(va, 0x70, osize);
 	}
 #endif
 #ifdef DEBUG_REDZONE
 	if (va != NULL)
 		va = redzone_setup(va, osize);
 #endif
 	return ((void *) va);
 }
 
 /*
  *	free:
  *
  *	Free a block of memory allocated by malloc.
  *
  *	This routine may not block.
  *
  *	The owning slab is looked up from the address; the block is then
  *	returned either to its UMA zone or, for slabs flagged
  *	UMA_SLAB_MALLOC, through uma_large_free().  The freed size is
  *	charged to the malloc type's statistics.  Panics if no slab is
  *	found for the address.
  */
 void
 free(void *addr, struct malloc_type *mtp)
 {
 	uma_slab_t slab;
 	u_long size;
 
 	KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
 
 	/* free(NULL, ...) does nothing */
 	if (addr == NULL)
 		return;
 
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(addr)) {
 		memguard_free(addr);
 		return;
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	/* Validate the red zones, then recover the real allocation address. */
 	redzone_check(addr);
 	addr = redzone_addr_ntor(addr);
 #endif
 
 	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
 
 	if (slab == NULL)
 		panic("free: address %p(%p) has not been allocated.\n",
 		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
 
 
 	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
 #ifdef INVARIANTS
 		struct malloc_type **mtpp = addr;
 #endif
 		size = slab->us_keg->uk_size;
 #ifdef INVARIANTS
 		/*
 		 * Cache a pointer to the malloc_type that most recently freed
 		 * this memory here.  This way we know who is most likely to
 		 * have stepped on it later.
 		 *
 		 * This code assumes that size is a multiple of 8 bytes for
 		 * 64 bit machines
 		 */
 		mtpp = (struct malloc_type **)
 		    ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
 		/* Advance to the last pointer-sized slot of the item. */
 		mtpp += (size - sizeof(struct malloc_type *)) /
 		    sizeof(struct malloc_type *);
 		*mtpp = mtp;
 #endif
 		uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
 	} else {
 		size = slab->us_size;
 		uma_large_free(slab);
 	}
 	malloc_type_freed(mtp, size);
 }
 
 /*
  *	realloc: change the size of a memory block
  *
  *	addr is the existing block (NULL makes this behave like malloc()),
  *	size is the requested new size, and mtp/flags are passed through to
  *	malloc().  Returns addr itself when the existing block is reused,
  *	a new block holding a copy of the old contents otherwise, or NULL
  *	when the new allocation fails (the old block is then left alone).
  */
 void *
 realloc(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
 {
 	uma_slab_t slab;
 	unsigned long alloc;
 	void *newaddr;
 
 	KASSERT(mtp->ks_magic == M_MAGIC,
 	    ("realloc: bad malloc type magic"));
 
 	/* realloc(NULL, ...) is equivalent to malloc(...) */
 	if (addr == NULL)
 		return (malloc(size, mtp, flags));
 
 	/*
 	 * XXX: Should report free of old memory and alloc of new memory to
 	 * per-CPU stats.
 	 */
 
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(addr))
 		return (memguard_realloc(addr, size, mtp, flags));
 #endif
 
 #ifdef DEBUG_REDZONE
 	/*
 	 * With red zones the old size comes from redzone_get_size() and the
 	 * in-place reuse test below is compiled out, so a new block is
 	 * always allocated.
 	 */
 	slab = NULL;
 	alloc = redzone_get_size(addr);
 #else
 	slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
 
 	/* Sanity check */
 	KASSERT(slab != NULL,
 	    ("realloc: address %p out of range", (void *)addr));
 
 	/* Get the size of the original block */
 	if (!(slab->us_flags & UMA_SLAB_MALLOC))
 		alloc = slab->us_keg->uk_size;
 	else
 		alloc = slab->us_size;
 
 	/*
 	 * Reuse the original block if appropriate: the request must fit, and
 	 * either still use more than alloc >> REALLOC_FRACTION bytes or the
 	 * block already has the size MINALLOCSIZE.
 	 */
 	if (size <= alloc
 	    && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
 		return (addr);
 #endif /* !DEBUG_REDZONE */
 
 	/* Allocate a new, bigger (or smaller) block */
 	if ((newaddr = malloc(size, mtp, flags)) == NULL)
 		return (NULL);
 
 	/* Copy over original contents */
 	bcopy(addr, newaddr, min(size, alloc));
 	free(addr, mtp);
 	return (newaddr);
 }
 
 /*
  *	reallocf: resize a block exactly like realloc(), but release the
  *	original block when realloc() returns NULL.
  */
 void *
 reallocf(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
 {
 	void *resized;
 
 	resized = realloc(addr, size, mtp, flags);
 	if (resized == NULL)
 		free(addr, mtp);
 	return (resized);
 }
 
 /*
  * Initialize the kernel memory allocator
  *
  * Sizes vm_kmem_size from compile-time defaults and the vm.kmem_size*
  * tunables, creates kmem_map as a submap of kernel_map, finishes UMA
  * startup, and then creates mt_zone plus the per-size malloc zones listed
  * in kmemzones[], filling the kmemsize[] lookup table along the way.
  * The dummy argument is unused.
  */
 /* ARGSUSED*/
 static void
 kmeminit(void *dummy)
 {
 	uint8_t indx;
 	u_long mem_size, tmp;
 	int i;
  
 	mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);
 
 	/*
 	 * Try to auto-tune the kernel memory size, so that it is
 	 * more applicable for a wider range of machine sizes.
 	 * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while
 	 * a VM_KMEM_SIZE of 12MB is a fair compromise.  The
 	 * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
 	 * available, and on an X86 with a total KVA space of 256MB,
 	 * try to keep VM_KMEM_SIZE_MAX at 80MB or below.
 	 *
 	 * Note that the kmem_map is also used by the zone allocator,
 	 * so make sure that there is enough space.
 	 */
 	vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
 	mem_size = cnt.v_page_count;
 
 #if defined(VM_KMEM_SIZE_SCALE)
 	vm_kmem_size_scale = VM_KMEM_SIZE_SCALE;
 #endif
 	TUNABLE_INT_FETCH("vm.kmem_size_scale", &vm_kmem_size_scale);
 	/* Grow to (physical pages / scale) pages when that is larger. */
 	if (vm_kmem_size_scale > 0 &&
 	    (mem_size / vm_kmem_size_scale) > (vm_kmem_size / PAGE_SIZE))
 		vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
 
 #if defined(VM_KMEM_SIZE_MIN)
 	vm_kmem_size_min = VM_KMEM_SIZE_MIN;
 #endif
 	TUNABLE_ULONG_FETCH("vm.kmem_size_min", &vm_kmem_size_min);
 	if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) {
 		vm_kmem_size = vm_kmem_size_min;
 	}
 
 #if defined(VM_KMEM_SIZE_MAX)
 	vm_kmem_size_max = VM_KMEM_SIZE_MAX;
 #endif
 	TUNABLE_ULONG_FETCH("vm.kmem_size_max", &vm_kmem_size_max);
 	if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max)
 		vm_kmem_size = vm_kmem_size_max;
 
 	/* Allow final override from the kernel environment */
 	TUNABLE_ULONG_FETCH("vm.kmem_size", &vm_kmem_size);
 
 	/*
 	 * Limit kmem virtual size to twice the physical memory.
 	 * This allows for kmem map sparseness, but limits the size
 	 * to something sane. Be careful to not overflow the 32bit
 	 * ints while doing the check.
 	 */
 	if (((vm_kmem_size / 2) / PAGE_SIZE) > cnt.v_page_count)
 		vm_kmem_size = 2 * cnt.v_page_count * PAGE_SIZE;
 
 	/*
 	 * Tune settings based on the kmem map's size at this time.
 	 */
 	init_param3(vm_kmem_size / PAGE_SIZE);
 
 #ifdef DEBUG_MEMGUARD
 	tmp = memguard_fudge(vm_kmem_size, vm_kmem_size_max);
 #else
 	tmp = vm_kmem_size;
 #endif
 	kmem_map = kmem_suballoc(kernel_map, &kmembase, &kmemlimit,
 	    tmp, TRUE);
 	kmem_map->system_map = 1;
 
 #ifdef DEBUG_MEMGUARD
 	/*
 	 * Initialize MemGuard if support compiled in.  MemGuard is a
 	 * replacement allocator used for detecting tamper-after-free
 	 * scenarios as they occur.  It is only used for debugging.
 	 */
 	memguard_init(kmem_map);
 #endif
 
 	uma_startup2();
 
 	/* Zone backing the per-type malloc_type_internal records. */
 	mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal),
 #ifdef INVARIANTS
 	    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
 #else
 	    NULL, NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
 	/*
 	 * Walk kmemzones[] until the zero-size terminator.  For each entry
 	 * create numzones subzones, then point every kmemsize[] slot for
 	 * request sizes up to this entry's size at the entry's index.  i is
 	 * carried across entries, so each slot is written once.
 	 */
 	for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
 		int size = kmemzones[indx].kz_size;
 		char *name = kmemzones[indx].kz_name;
 		int subzone;
 
 		for (subzone = 0; subzone < numzones; subzone++) {
 			kmemzones[indx].kz_zone[subzone] =
 			    uma_zcreate(name, size,
 #ifdef INVARIANTS
 			    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
 #else
 			    NULL, NULL, NULL, NULL,
 #endif
 			    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
 		}		    
 		for (;i <= size; i+= KMEM_ZBASE)
 			kmemsize[i >> KMEM_ZSHIFT] = indx;
 		
 	}
 }
 
 /*
  * Prepare a malloc type for use: validate its magic, allocate a zeroed
  * malloc_type_internal from mt_zone, record the subzone chosen for the
  * type's short description, and push the type onto the head of the
  * kmemstatistics list under malloc_mtx.
  */
 void
 malloc_init(void *data)
 {
 	struct malloc_type *type = data;
 	struct malloc_type_internal *internal;
 
 	KASSERT(cnt.v_page_count != 0, ("malloc_register before vm_init"));
 
 	if (type->ks_magic != M_MAGIC)
 		panic("malloc_init: bad malloc type magic");
 
 	internal = uma_zalloc(mt_zone, M_WAITOK | M_ZERO);
 	type->ks_handle = internal;
 	internal->mti_zone = mtp_get_subzone(type->ks_shortdesc);
 
 	/* Link the new type in front of the existing ones. */
 	mtx_lock(&malloc_mtx);
 	type->ks_next = kmemstatistics;
 	kmemstatistics = type;
 	kmemcount++;
 	mtx_unlock(&malloc_mtx);
 }
 
 void
 malloc_uninit(void *data)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type_stats *mtsp;
 	struct malloc_type *mtp, *temp;
 	uma_slab_t slab;
 	long temp_allocs, temp_bytes;
 	int i;
 
 	mtp = data;
 	KASSERT(mtp->ks_magic == M_MAGIC,
 	    ("malloc_uninit: bad malloc type magic"));
 	KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL"));
 
 	mtx_lock(&malloc_mtx);
 	mtip = mtp->ks_handle;
 	mtp->ks_handle = NULL;
 	if (mtp != kmemstatistics) {
 		for (temp = kmemstatistics; temp != NULL;
 		    temp = temp->ks_next) {
 			if (temp->ks_next == mtp) {
 				temp->ks_next = mtp->ks_next;
 				break;
 			}
 		}
 		KASSERT(temp,
 		    ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc));
 	} else
 		kmemstatistics = mtp->ks_next;
 	kmemcount--;
 	mtx_unlock(&malloc_mtx);
 
 	/*
 	 * Look for memory leaks.
 	 */
 	temp_allocs = temp_bytes = 0;
 	for (i = 0; i < MAXCPU; i++) {
 		mtsp = &mtip->mti_stats[i];
 		temp_allocs += mtsp->mts_numallocs;
 		temp_allocs -= mtsp->mts_numfrees;
 		temp_bytes += mtsp->mts_memalloced;
 		temp_bytes -= mtsp->mts_memfreed;
 	}
 	if (temp_allocs > 0 || temp_bytes > 0) {
 		printf("Warning: memory type %s leaked memory on destroy "
 		    "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc,
 		    temp_allocs, temp_bytes);
 	}
 
 	slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
 	uma_zfree_arg(mt_zone, mtip, slab);
 }
 
 /*
  * Return the registered malloc type whose short description equals desc,
  * or NULL when there is none.  malloc_mtx must be held by the caller.
  */
 struct malloc_type *
 malloc_desc2type(const char *desc)
 {
 	struct malloc_type *cur;
 
 	mtx_assert(&malloc_mtx, MA_OWNED);
 	cur = kmemstatistics;
 	while (cur != NULL && strcmp(cur->ks_shortdesc, desc) != 0)
 		cur = cur->ks_next;
 	return (cur);
 }
 
 /*
  * Sysctl handler that emits a stream header followed, for every
  * registered malloc type, by a type header and MAXCPU per-CPU statistics
  * records.  The type list is walked with malloc_mtx held.
  *
  * NOTE(review): output is appended to the sbuf while malloc_mtx is held.
  * Presumably sbuf_new_for_sysctl() makes the sbuf hand data to req as it
  * fills; if that path can sleep or fault, the user buffer may need to be
  * wired before the mutex is taken -- confirm.
  */
 static int
 sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct malloc_type_stream_header mtsh;
 	struct malloc_type_internal *mtip;
 	struct malloc_type_header mth;
 	struct malloc_type *mtp;
-	int buflen, count, error, i;
+	int error, i;
 	struct sbuf sbuf;
-	char *buffer;
 
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	mtx_lock(&malloc_mtx);
-restart:
-	mtx_assert(&malloc_mtx, MA_OWNED);
-	count = kmemcount;
-	mtx_unlock(&malloc_mtx);
-	buflen = sizeof(mtsh) + count * (sizeof(mth) +
-	    sizeof(struct malloc_type_stats) * MAXCPU) + 1;
-	buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
-	mtx_lock(&malloc_mtx);
-	if (count < kmemcount) {
-		free(buffer, M_TEMP);
-		goto restart;
-	}
 
-	sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN);
-
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&mtsh, sizeof(mtsh));
 	mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION;
 	mtsh.mtsh_maxcpus = MAXCPU;
 	mtsh.mtsh_count = kmemcount;
-	if (sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh)) < 0) {
-		mtx_unlock(&malloc_mtx);
-		error = ENOMEM;
-		goto out;
-	}
+	(void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh));
 
 	/*
 	 * Insert alternating sequence of type headers and type statistics.
 	 */
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 
 		/*
 		 * Insert type header.
 		 */
 		bzero(&mth, sizeof(mth));
 		strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME);
-		if (sbuf_bcat(&sbuf, &mth, sizeof(mth)) < 0) {
-			mtx_unlock(&malloc_mtx);
-			error = ENOMEM;
-			goto out;
-		}
+		(void)sbuf_bcat(&sbuf, &mth, sizeof(mth));
 
 		/*
 		 * Insert type statistics for each CPU.
 		 */
 		for (i = 0; i < MAXCPU; i++) {
-			if (sbuf_bcat(&sbuf, &mtip->mti_stats[i],
-			    sizeof(mtip->mti_stats[i])) < 0) {
-				mtx_unlock(&malloc_mtx);
-				error = ENOMEM;
-				goto out;
-			}
+			(void)sbuf_bcat(&sbuf, &mtip->mti_stats[i],
+			    sizeof(mtip->mti_stats[i]));
 		}
 	}
 	mtx_unlock(&malloc_mtx);
-	sbuf_finish(&sbuf);
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
-out:
+	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
-	free(buffer, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats",
     "Return malloc types");
 
 SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0,
     "Count of kernel malloc types");
 
 /*
  * Call func(type, arg) once for every malloc type that was registered at
  * the time the list was snapshotted.  The callbacks run without
  * malloc_mtx held.
  *
  * The pointer array is allocated with the mutex dropped, because the
  * allocation uses M_WAITOK; if more types were registered in the
  * meantime the buffer is too small and the snapshot is retried.
  */
 void
 malloc_type_list(malloc_type_list_func_t *func, void *arg)
 {
 	struct malloc_type *mtp, **bufmtp;
 	int count, i;
 	size_t buflen;
 
 	mtx_lock(&malloc_mtx);
 restart:
 	mtx_assert(&malloc_mtx, MA_OWNED);
 	count = kmemcount;
 	mtx_unlock(&malloc_mtx);
 
 	buflen = sizeof(struct malloc_type *) * count;
 	bufmtp = malloc(buflen, M_TEMP, M_WAITOK);
 
 	mtx_lock(&malloc_mtx);
 
 	if (count < kmemcount) {
 		free(bufmtp, M_TEMP);
 		goto restart;
 	}
 
 	for (mtp = kmemstatistics, i = 0; mtp != NULL; mtp = mtp->ks_next, i++)
 		bufmtp[i] = mtp;
 
 	/*
 	 * Types may have been unregistered while the mutex was dropped, so
 	 * fewer than count slots may have been filled.  The buffer is not
 	 * zeroed; only iterate over the entries actually stored.
 	 */
 	count = i;
 
 	mtx_unlock(&malloc_mtx);
 
 	for (i = 0; i < count; i++)
 		(func)(bufmtp[i], arg);
 
 	free(bufmtp, M_TEMP);
 }
 
 #ifdef DDB
 DB_SHOW_COMMAND(malloc, db_show_malloc)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
 	uint64_t allocs, frees;
 	uint64_t alloced, freed;
 	int i;
 
 	db_printf("%18s %12s  %12s %12s\n", "Type", "InUse", "MemUse",
 	    "Requests");
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 		allocs = 0;
 		frees = 0;
 		alloced = 0;
 		freed = 0;
 		for (i = 0; i < MAXCPU; i++) {
 			allocs += mtip->mti_stats[i].mts_numallocs;
 			frees += mtip->mti_stats[i].mts_numfrees;
 			alloced += mtip->mti_stats[i].mts_memalloced;
 			freed += mtip->mti_stats[i].mts_memfreed;
 		}
 		db_printf("%18s %12ju %12juK %12ju\n",
 		    mtp->ks_shortdesc, allocs - frees,
 		    (alloced - freed + 1023) / 1024, allocs);
 	}
 }
 
 #if MALLOC_DEBUG_MAXZONES > 1
 /*
  * DDB "show multizone_matches <addr>": addr is taken as a pointer to a
  * malloc type; after checking its magic, print the short description of
  * every registered type whose mti_zone equals that type's mti_zone.
  */
 DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches)
 {
 	struct malloc_type *target, *cur;
 	struct malloc_type_internal *internal;
 	u_int wanted;
 
 	if (!have_addr) {
 		db_printf("Usage: show multizone_matches <malloc type/addr>\n");
 		return;
 	}
 	target = (void *)addr;
 	if (target->ks_magic != M_MAGIC) {
 		db_printf("Magic %lx does not match expected %x\n",
 		    target->ks_magic, M_MAGIC);
 		return;
 	}
 
 	internal = target->ks_handle;
 	wanted = internal->mti_zone;
 
 	for (cur = kmemstatistics; cur != NULL; cur = cur->ks_next) {
 		internal = cur->ks_handle;
 		if (internal->mti_zone == wanted)
 			db_printf("%s\n", cur->ks_shortdesc);
 	}
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
 #endif /* DDB */
 
 #ifdef MALLOC_PROFILE
 
 /*
  * Sysctl handler for the malloc profiling report: one text line per
  * kmemsize[] slot showing the request size, the request count taken
  * from krequests[] and the size of the zone serving that slot, followed
  * by totals of memory used and memory wasted (zone size minus request
  * size, times the request count).
  */
 static int
 sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
 {
-	int linesize = 64;
 	struct sbuf sbuf;
 	uint64_t count;
 	uint64_t waste;
 	uint64_t mem;
-	int bufsize;
 	int error;
-	char *buf;
 	int rsize;
 	int size;
 	int i;
 
-	bufsize = linesize * (KMEM_ZSIZE + 1);
-	bufsize += 128; 	/* For the stats line */
-	bufsize += 128; 	/* For the banner line */
 	waste = 0;
 	mem = 0;
 
-	buf = malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
-	sbuf_new(&sbuf, buf, bufsize, SBUF_FIXEDLEN);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sbuf_printf(&sbuf, 
 	    "\n  Size                    Requests  Real Size\n");
 	for (i = 0; i < KMEM_ZSIZE; i++) {
 		/* size: request size of slot i; rsize: its zone's item size. */
 		size = i << KMEM_ZSHIFT;
 		rsize = kmemzones[kmemsize[i]].kz_size;
 		count = (long long unsigned)krequests[i];
 
 		sbuf_printf(&sbuf, "%6d%28llu%11d\n", size,
 		    (unsigned long long)count, rsize);
 
 		if ((rsize * count) > (size * count))
 			waste += (rsize * count) - (size * count);
 		mem += (rsize * count);
 	}
 	sbuf_printf(&sbuf,
 	    "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
 	    (unsigned long long)mem, (unsigned long long)waste);
-	sbuf_finish(&sbuf);
-
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
-
+	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
-	free(buf, M_TEMP);
 	return (error);
 }
 
 SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
     NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
 #endif /* MALLOC_PROFILE */
Index: head/sys/kern/kern_sysctl.c
===================================================================
--- head/sys/kern/kern_sysctl.c	(revision 212369)
+++ head/sys/kern/kern_sysctl.c	(revision 212370)
@@ -1,1546 +1,1574 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Karels at Berkeley Software Design, Inc.
  *
  * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
  * project, to make these variables more userfriendly.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sysctl.c	8.4 (Berkeley) 4/14/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/sysproto.h>
 #include <sys/uio.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
 static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
 static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
 
 /*
  * The sysctllock protects the MIB tree.  It also protects sysctl
  * contexts used with dynamic sysctls.  The sysctl_register_oid() and
  * sysctl_unregister_oid() routines require the sysctllock to already
  * be held, so the sysctl_lock() and sysctl_unlock() routines are
  * provided for the few places in the kernel which need to use that
  * API rather than using the dynamic API.  Use of the dynamic API is
  * strongly encouraged for most code.
  *
  * The sysctlmemlock is used to limit the amount of user memory wired for
  * sysctl requests.  This is implemented by serializing any userland
  * sysctl requests larger than a single page via an exclusive lock.
  */
 static struct sx sysctllock;
 static struct sx sysctlmemlock;
 
 #define	SYSCTL_SLOCK()		sx_slock(&sysctllock)
 #define	SYSCTL_SUNLOCK()	sx_sunlock(&sysctllock)
 #define	SYSCTL_XLOCK()		sx_xlock(&sysctllock)
 #define	SYSCTL_XUNLOCK()	sx_xunlock(&sysctllock)
 #define	SYSCTL_ASSERT_XLOCKED()	sx_assert(&sysctllock, SA_XLOCKED)
 #define	SYSCTL_ASSERT_LOCKED()	sx_assert(&sysctllock, SA_LOCKED)
 #define	SYSCTL_INIT()		sx_init(&sysctllock, "sysctl lock")
 
 static int sysctl_root(SYSCTL_HANDLER_ARGS);
 
 struct sysctl_oid_list sysctl__children; /* root list */
 
 static int	sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
 		    int recurse);
 
 static struct sysctl_oid *
 sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
 {
 	struct sysctl_oid *oidp;
 
 	SYSCTL_ASSERT_LOCKED();
 	SLIST_FOREACH(oidp, list, oid_link) {
 		if (strcmp(oidp->oid_name, name) == 0) {
 			return (oidp);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Initialization of the MIB tree.
  *
  * Order by number in each list.
  */
 /*
  * Take sysctllock exclusively (SYSCTL_XLOCK) for callers outside this
  * file; pair with sysctl_unlock().
  */
 void
 sysctl_lock(void)
 {
 
 	SYSCTL_XLOCK();
 }
 
 /*
  * Release the exclusive hold on sysctllock (SYSCTL_XUNLOCK) taken by
  * sysctl_lock().
  */
 void
 sysctl_unlock(void)
 {
 
 	SYSCTL_XUNLOCK();
 }
 
 /*
  * Insert oidp into its parent's list, keeping the list sorted by
  * oid_number.  If an oid with the same name is already present, a node
  * only gets its reference count bumped and a leaf is rejected with a
  * message; in both cases oidp itself is not inserted.  The sysctl lock
  * must be held exclusively.
  */
 void
 sysctl_register_oid(struct sysctl_oid *oidp)
 {
 	struct sysctl_oid_list *parent = oidp->oid_parent;
 	struct sysctl_oid *p;
 	struct sysctl_oid *q;
 
 	/*
 	 * First check if another oid with the same name already
 	 * exists in the parent's list.
 	 */
 	SYSCTL_ASSERT_XLOCKED();
 	p = sysctl_find_oidname(oidp->oid_name, parent);
 	if (p != NULL) {
 		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 			p->oid_refcnt++;
 			return;
 		} else {
 			printf("can't re-use a leaf (%s)!\n", p->oid_name);
 			return;
 		}
 	}
 	/*
 	 * If this oid has a number OID_AUTO, give it a number which
 	 * is greater than any current oid.
 	 * NOTE: DO NOT change the starting value here, change it in
 	 * <sys/sysctl.h>, and make sure it is at least 256 to
 	 * accommodate e.g. net.inet.raw as a static sysctl node.
 	 */
 	if (oidp->oid_number == OID_AUTO) {
 		/* Next automatic number; persists across calls. */
 		static int newoid = CTL_AUTO_START;
 
 		oidp->oid_number = newoid++;
 		if (newoid == 0x7fffffff)
 			panic("out of oids");
 	}
 #if 0
 	else if (oidp->oid_number >= CTL_AUTO_START) {
 		/* do not panic; this happens when unregistering sysctl sets */
 		printf("static sysctl oid too high: %d", oidp->oid_number);
 	}
 #endif
 
 	/*
 	 * Insert the oid into the parent's list in order.
 	 */
 	/* q trails p and ends up as the last entry not greater than oidp. */
 	q = NULL;
 	SLIST_FOREACH(p, parent, oid_link) {
 		if (oidp->oid_number < p->oid_number)
 			break;
 		q = p;
 	}
 	if (q)
 		SLIST_INSERT_AFTER(q, oidp, oid_link);
 	else
 		SLIST_INSERT_HEAD(parent, oidp, oid_link);
 }
 
 /*
  * Unlink oidp from its parent's list.  Nothing is freed.  A failure
  * (oid still numbered OID_AUTO, or not present in the parent's list) is
  * only reported with a message.  The sysctl lock must be held
  * exclusively.
  */
 void
 sysctl_unregister_oid(struct sysctl_oid *oidp)
 {
 	struct sysctl_oid *cur;
 	int error;
 
 	SYSCTL_ASSERT_XLOCKED();
 	if (oidp->oid_number == OID_AUTO) {
 		error = EINVAL;
 	} else {
 		error = ENOENT;
 		SLIST_FOREACH(cur, oidp->oid_parent, oid_link) {
 			if (cur != oidp)
 				continue;
 			SLIST_REMOVE(oidp->oid_parent, oidp,
 			    sysctl_oid, oid_link);
 			error = 0;
 			break;
 		}
 	}
 
 	/* 
 	 * A module that failed to register and is now being unloaded
 	 * ends up here; that is expected, so do not panic().
 	 */
 	if (error != 0)
 		printf("%s: failed to unregister sysctl\n", __func__);
 }
 
 /*
  * Initialize a new context used to keep track of dynamically added
  * sysctls.  Returns EINVAL for a NULL context, 0 otherwise.
  */
 int
 sysctl_ctx_init(struct sysctl_ctx_list *c)
 {
 
 	if (c != NULL) {
 		/*
 		 * No locking: the caller must not add nodes to the
 		 * context before this function has returned.
 		 */
 		TAILQ_INIT(c);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Free the context, and destroy all dynamic oids registered in this context
  *
  * Returns EBUSY, with every oid registered again, when the dry run finds
  * an oid that cannot be removed.  Otherwise each oid is removed and each
  * context entry freed; the list head itself is not reinitialized here.
  */
 int
 sysctl_ctx_free(struct sysctl_ctx_list *clist)
 {
 	struct sysctl_ctx_entry *e, *e1;
 	int error;
 
 	error = 0;
 	/*
 	 * First perform a "dry run" to check if it's ok to remove oids.
 	 * XXX FIXME
 	 * XXX This algorithm is a hack. But I don't know any
 	 * XXX better solution for now...
 	 */
 	SYSCTL_XLOCK();
 	TAILQ_FOREACH(e, clist, link) {
 		/* del == 0: deregister only, nothing is freed yet. */
 		error = sysctl_remove_oid_locked(e->entry, 0, 0);
 		if (error)
 			break;
 	}
 	/*
 	 * Restore deregistered entries, either from the end,
 	 * or from the place where error occurred.
 	 * e contains the entry that was not unregistered
 	 */
 	if (error)
 		e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
 	else
 		e1 = TAILQ_LAST(clist, sysctl_ctx_list);
 	while (e1 != NULL) {
 		sysctl_register_oid(e1->entry);
 		e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
 	}
 	if (error) {
 		SYSCTL_XUNLOCK();
 		return(EBUSY);
 	}
 	/* Now really delete the entries */
 	e = TAILQ_FIRST(clist);
 	while (e != NULL) {
 		/* Fetch the successor before e is freed. */
 		e1 = TAILQ_NEXT(e, link);
 		error = sysctl_remove_oid_locked(e->entry, 1, 0);
 		if (error)
 			panic("sysctl_remove_oid: corrupt tree, entry: %s",
 			    e->entry->oid_name);
 		free(e, M_SYSCTLOID);
 		e = e1;
 	}
 	SYSCTL_XUNLOCK();
 	return (error);
 }
 
 /*
  * Add an entry for oidp at the head of the context.  Returns the new
  * entry, or NULL when either argument is NULL.  The sysctl lock must be
  * held exclusively.
  */
 struct sysctl_ctx_entry *
 sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
 {
 	struct sysctl_ctx_entry *ctxe;
 
 	SYSCTL_ASSERT_XLOCKED();
 	ctxe = NULL;
 	if (clist != NULL && oidp != NULL) {
 		ctxe = malloc(sizeof(*ctxe), M_SYSCTLOID, M_WAITOK);
 		ctxe->entry = oidp;
 		TAILQ_INSERT_HEAD(clist, ctxe, link);
 	}
 	return (ctxe);
 }
 
 /* Find an entry in the context */
 struct sysctl_ctx_entry *
 sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
 {
 	struct sysctl_ctx_entry *e;
 
 	SYSCTL_ASSERT_LOCKED();
 	if (clist == NULL || oidp == NULL)
 		return(NULL);
 	TAILQ_FOREACH(e, clist, link) {
 		if(e->entry == oidp)
 			return(e);
 	}
 	return (e);
 }
 
 /*
  * Remove the entry for oidp from the context and free the entry.
  * Returns EINVAL for NULL arguments and ENOENT when no entry exists.
  * NOTE: oidp itself is not freed here; remove it with
  * sysctl_remove_oid().
  */
 int
 sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
 {
 	struct sysctl_ctx_entry *found;
 
 	if (clist == NULL || oidp == NULL)
 		return (EINVAL);
 
 	SYSCTL_XLOCK();
 	found = sysctl_ctx_entry_find(clist, oidp);
 	if (found != NULL)
 		TAILQ_REMOVE(clist, found, link);
 	SYSCTL_XUNLOCK();
 
 	if (found == NULL)
 		return (ENOENT);
 	/* The entry is freed only after the lock has been dropped. */
 	free(found, M_SYSCTLOID);
 	return (0);
 }
 
 /*
  * Remove a dynamically created sysctl tree; locking wrapper around
  * sysctl_remove_oid_locked().
  * oidp    - top of the tree to be removed
  * del     - if 0 only deregister, otherwise free the entries as well
  * recurse - if != 0 traverse the subtree to be deleted
  */
 int
 sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
 {
 	int rc;
 
 	SYSCTL_XLOCK();
 	rc = sysctl_remove_oid_locked(oidp, del, recurse);
 	SYSCTL_XUNLOCK();
 
 	return (rc);
 }
 
 static int
 sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
 {
 	struct sysctl_oid *p;
 	int error;
 
 	SYSCTL_ASSERT_XLOCKED();
 	if (oidp == NULL)
 		return(EINVAL);
 	if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
 		printf("can't remove non-dynamic nodes!\n");
 		return (EINVAL);
 	}
 	/*
 	 * WARNING: normal method to do this should be through
 	 * sysctl_ctx_free(). Use recursing as the last resort
 	 * method to purge your sysctl tree of leftovers...
 	 * However, if some other code still references these nodes,
 	 * it will panic.
 	 */
 	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 		if (oidp->oid_refcnt == 1) {
 			SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) {
 				if (!recurse)
 					return (ENOTEMPTY);
 				error = sysctl_remove_oid_locked(p, del,
 				    recurse);
 				if (error)
 					return (error);
 			}
 			if (del)
 				free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
 		}
 	}
 	if (oidp->oid_refcnt > 1 ) {
 		oidp->oid_refcnt--;
 	} else {
 		if (oidp->oid_refcnt == 0) {
 			printf("Warning: bad oid_refcnt=%u (%s)!\n",
 				oidp->oid_refcnt, oidp->oid_name);
 			return (EINVAL);
 		}
 		sysctl_unregister_oid(oidp);
 		if (del) {
 			if (oidp->oid_descr)
 				free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID);
 			free((void *)(uintptr_t)(const void *)oidp->oid_name,
 			     M_SYSCTLOID);
 			free(oidp, M_SYSCTLOID);
 		}
 	}
 	return (0);
 }
 
 /*
  * Create new sysctls at run time.
  * clist may point to a valid context initialized with sysctl_ctx_init().
  */
 struct sysctl_oid *
 sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
 	int number, const char *name, int kind, void *arg1, int arg2,
 	int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
 {
 	struct sysctl_oid *oidp;
 	ssize_t len;
 	char *newname;
 
 	/* You have to hook up somewhere.. */
 	if (parent == NULL)
 		return(NULL);
 	/* Check if the node already exists, otherwise create it */
 	SYSCTL_XLOCK();
 	oidp = sysctl_find_oidname(name, parent);
 	if (oidp != NULL) {
 		if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 			oidp->oid_refcnt++;
 			/* Update the context */
 			if (clist != NULL)
 				sysctl_ctx_entry_add(clist, oidp);
 			SYSCTL_XUNLOCK();
 			return (oidp);
 		} else {
 			SYSCTL_XUNLOCK();
 			printf("can't re-use a leaf (%s)!\n", name);
 			return (NULL);
 		}
 	}
 	oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
 	oidp->oid_parent = parent;
 	SLIST_NEXT(oidp, oid_link) = NULL;
 	oidp->oid_number = number;
 	oidp->oid_refcnt = 1;
 	len = strlen(name);
 	newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
 	bcopy(name, newname, len + 1);
 	newname[len] = '\0';
 	oidp->oid_name = newname;
 	oidp->oid_handler = handler;
 	oidp->oid_kind = CTLFLAG_DYN | kind;
 	if ((kind & CTLTYPE) == CTLTYPE_NODE) {
 		/* Allocate space for children */
 		SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list),
 		    M_SYSCTLOID, M_WAITOK));
 		SLIST_INIT(SYSCTL_CHILDREN(oidp));
 	} else {
 		oidp->oid_arg1 = arg1;
 		oidp->oid_arg2 = arg2;
 	}
 	oidp->oid_fmt = fmt;
 	if (descr) {
 		int len = strlen(descr) + 1;
 		oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK);
 		if (oidp->oid_descr)
 			strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr);
 	}
 	/* Update the context, if used */
 	if (clist != NULL)
 		sysctl_ctx_entry_add(clist, oidp);
 	/* Register this oid */
 	sysctl_register_oid(oidp);
 	SYSCTL_XUNLOCK();
 	return (oidp);
 }
 
 /*
  * Rename an existing oid: install a freshly allocated copy of name and
  * free the previous name once the sysctl lock has been dropped.
  */
 void
 sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
 {
 	char *fresh;
 	void *stale;
 	ssize_t namelen;
 
 	/* Build the copy before taking the lock. */
 	namelen = strlen(name);
 	fresh = malloc(namelen + 1, M_SYSCTLOID, M_WAITOK);
 	bcopy(name, fresh, namelen + 1);
 	fresh[namelen] = '\0';
 
 	SYSCTL_XLOCK();
 	stale = (void *)(uintptr_t)(const void *)oidp->oid_name;
 	oidp->oid_name = fresh;
 	SYSCTL_XUNLOCK();
 
 	free(stale, M_SYSCTLOID);
 }
 
 /*
  * Reparent an existing oid.
  */
 int
 sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
 {
 	struct sysctl_oid *oidp;
 
 	SYSCTL_XLOCK();
 	if (oid->oid_parent == parent) {
 		SYSCTL_XUNLOCK();
 		return (0);
 	}
 	oidp = sysctl_find_oidname(oid->oid_name, parent);
 	if (oidp != NULL) {
 		SYSCTL_XUNLOCK();
 		return (EEXIST);
 	}
 	sysctl_unregister_oid(oid);
 	oid->oid_parent = parent;
 	oid->oid_number = OID_AUTO;
 	sysctl_register_oid(oid);
 	SYSCTL_XUNLOCK();
 	return (0);
 }
 
 /*
  * Register the kernel's oids on startup.
  */
 SET_DECLARE(sysctl_set, struct sysctl_oid);
 
 static void
 sysctl_register_all(void *arg)
 {
 	struct sysctl_oid **oidp;
 
 	sx_init(&sysctlmemlock, "sysctl mem");
 	SYSCTL_INIT();
 	SYSCTL_XLOCK();
 	SET_FOREACH(oidp, sysctl_set)
 		sysctl_register_oid(*oidp);
 	SYSCTL_XUNLOCK();
 }
 SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
 
 /*
  * "Staff-functions"
  *
  * These functions implement a presently undocumented interface 
  * used by the sysctl program to walk the tree, and get the type
  * so it can print the value.
  * This interface is under work and consideration, and should probably
  * be killed with a big axe by the first person who can find the time.
  * (Be aware though, that the proper interface isn't as obvious as it
  * may seem; there are various conflicting requirements.)
  *
  * {0,0}	printf the entire MIB-tree.
  * {0,1,...}	return the name of the "..." OID.
  * {0,2,...}	return the next OID.
  * {0,3}	return the OID of the name in "new"
  * {0,4,...}	return the kind & format info for the "..." OID.
  * {0,5,...}	return the description of the "..." OID.
  */
 
 #ifdef SYSCTL_DEBUG
 /*
  * Print every oid in list l, indented by i spaces: number, name, R/W
  * flags, a handler marker and the type.  Nodes without a handler are
  * descended into with the indent increased by two.  The sysctl lock
  * must be held.
  */
 static void
 sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
 {
 	struct sysctl_oid *oidp;
 	int col;
 
 	SYSCTL_ASSERT_LOCKED();
 	SLIST_FOREACH(oidp, l, oid_link) {
 
 		for (col = 0; col < i; col++)
 			printf(" ");
 
 		printf("%d %s ", oidp->oid_number, oidp->oid_name);
 
 		printf("%c%c",
 		    (oidp->oid_kind & CTLFLAG_RD) ? 'R' : ' ',
 		    (oidp->oid_kind & CTLFLAG_WR) ? 'W' : ' ');
 
 		if (oidp->oid_handler)
 			printf(" *Handler");
 
 		switch (oidp->oid_kind & CTLTYPE) {
 		case CTLTYPE_NODE:
 			printf(" Node\n");
 			if (!oidp->oid_handler)
 				sysctl_sysctl_debug_dump_node(oidp->oid_arg1,
 				    i + 2);
 			break;
 		case CTLTYPE_INT:
 			printf(" Int\n");
 			break;
 		case CTLTYPE_STRING:
 			printf(" String\n");
 			break;
 		case CTLTYPE_QUAD:
 			printf(" Quad\n");
 			break;
 		case CTLTYPE_OPAQUE:
 			printf(" Opaque/struct\n");
 			break;
 		default:
 			printf("\n");
 		}
 
 	}
 }
 
 static int
 sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
 	if (error)
 		return (error);
 	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
 	return (ENOENT);
 }
 
 SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_sysctl_debug, "-", "");
 #endif
 
 /*
  * Translate a numeric OID (arg1 = int array, arg2 = number of elements)
  * into its dotted textual name and copy it out, NUL-terminated.
  *
  * Each component is looked up in the current child list.  Once a component
  * cannot be resolved (or the walk passed a leaf or a node with its own
  * handler), the remaining components are emitted as decimal numbers.
  * Returns 0 or the first error reported by SYSCTL_OUT().
  */
 static int
 sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *) arg1;
 	u_int namelen = arg2;
 	int error = 0;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
 	/*
 	 * Room for any 32-bit int printed with %d: sign, ten digits and the
 	 * terminating NUL.  A smaller buffer makes snprintf() silently
 	 * truncate large caller-supplied components.
 	 */
 	char buf[12];
 
 	SYSCTL_ASSERT_LOCKED();
 	while (namelen) {
 		if (!lsp) {
 			/* No list left to search: print the raw number. */
 			snprintf(buf,sizeof(buf),"%d",*name);
 			/* A separator is only needed after the first output. */
 			if (req->oldidx)
 				error = SYSCTL_OUT(req, ".", 1);
 			if (!error)
 				error = SYSCTL_OUT(req, buf, strlen(buf));
 			if (error)
 				return (error);
 			namelen--;
 			name++;
 			continue;
 		}
 		lsp2 = 0;
 		SLIST_FOREACH(oid, lsp, oid_link) {
 			if (oid->oid_number != *name)
 				continue;
 
 			if (req->oldidx)
 				error = SYSCTL_OUT(req, ".", 1);
 			if (!error)
 				error = SYSCTL_OUT(req, oid->oid_name,
 					strlen(oid->oid_name));
 			if (error)
 				return (error);
 
 			namelen--;
 			name++;
 
 			/* Only handler-less nodes have a child list to enter. */
 			if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) 
 				break;
 
 			if (oid->oid_handler)
 				break;
 
 			lsp2 = (struct sysctl_oid_list *)oid->oid_arg1;
 			break;
 		}
 		/* NULL here switches the loop to numeric output. */
 		lsp = lsp2;
 	}
 	return (SYSCTL_OUT(req, "", 1));
 }
 
 static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, "");
 
 /*
  * Recursive helper for sysctl_sysctl_next(): scan the list lsp for the OID
  * that follows the position described by name/namelen.
  *
  * The candidate path is built in next[] (one number per level, this level
  * writing next[0]); *len receives the depth of the result and *oidpp the
  * last OID visited.  Entries flagged CTLFLAG_SKIP are ignored.
  *
  * Returns 0 when a successor was found and 1 when this list is exhausted.
  */
 static int
 sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, 
 	int *next, int *len, int level, struct sysctl_oid **oidpp)
 {
 	struct sysctl_oid *oidp;
 
 	SYSCTL_ASSERT_LOCKED();
 	*len = level;
 	SLIST_FOREACH(oidp, lsp, oid_link) {
 		*next = oidp->oid_number;
 		*oidpp = oidp;
 
 		if (oidp->oid_kind & CTLFLAG_SKIP)
 			continue;
 
 		/*
 		 * No name left to match: take the first leaf (or node with
 		 * a handler), descending into plain nodes to find one.
 		 */
 		if (!namelen) {
 			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) 
 				return (0);
 			if (oidp->oid_handler) 
 				/* We really should call the handler here...*/
 				return (0);
 			lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 			if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, 
 				len, level+1, oidpp))
 				return (0);
 			goto emptynode;
 		}
 
 		/* Entries numbered below the requested component are passed. */
 		if (oidp->oid_number < *name)
 			continue;
 
 		/* Numbered above it: this entry, or something inside it. */
 		if (oidp->oid_number > *name) {
 			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
 				return (0);
 			if (oidp->oid_handler)
 				return (0);
 			lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 			if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, 
 				next+1, len, level+1, oidpp))
 				return (0);
 			goto next;
 		}
 		/* Exact match: only a plain node can hold a successor. */
 		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
 			continue;
 
 		if (oidp->oid_handler)
 			continue;
 
 		lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 		if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, 
 			len, level+1, oidpp))
 			return (0);
 	next:
 		namelen = 1;
 	emptynode:
 		/* The subtree had nothing; undo the depth set by the callee. */
 		*len = level;
 	}
 	return (1);
 }
 
 /*
  * Handler for the "next" node under _sysctl.  arg1/arg2 carry an OID; the
  * OID that sysctl_sysctl_next_ls() finds after it is copied out as an
  * array of ints.  Returns ENOENT when there is no following OID.
  */
 static int
 sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *) arg1;
 	u_int namelen = arg2;
 	int i, j, error;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *lsp = &sysctl__children;
 	int newoid[CTL_MAXNAME];
 
 	/* j receives the number of components stored in newoid[]. */
 	i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
 	if (i)
 		return (ENOENT);
 	error = SYSCTL_OUT(req, newoid, j * sizeof (int));
 	return (error);
 }
 
 static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, "");
 
 /*
  * Resolve a dotted name string to a numeric OID.
  *
  * name is modified in place: a single trailing '.' is removed and each
  * '.' separator reached during the walk is overwritten with a NUL.
  * On success returns 0 with the numbers stored in oid[], their count in
  * *len and, when oidpp is not NULL, the final sysctl_oid in *oidpp.
  * Returns ENOENT for an empty name, an unknown component, a component
  * below a leaf or below a node with a handler, or once CTL_MAXNAME
  * components have been consumed without reaching the end of the name.
  */
 static int
 name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
 {
 	int i;
 	struct sysctl_oid *oidp;
 	struct sysctl_oid_list *lsp = &sysctl__children;
 	char *p;
 
 	SYSCTL_ASSERT_LOCKED();
 
 	if (!*name)
 		return (ENOENT);
 
 	p = name + strlen(name) - 1 ;
 	if (*p == '.')
 		*p = '\0';
 
 	*len = 0;
 
 	/* Cut off the first component; i remembers whether more follow. */
 	for (p = name; *p && *p != '.'; p++) 
 		;
 	i = *p;
 	if (i == '.')
 		*p = '\0';
 
 	oidp = SLIST_FIRST(lsp);
 
 	while (oidp && *len < CTL_MAXNAME) {
 		if (strcmp(name, oidp->oid_name)) {
 			oidp = SLIST_NEXT(oidp, oid_link);
 			continue;
 		}
 		*oid++ = oidp->oid_number;
 		(*len)++;
 
 		/* i == 0 means the component just matched was the last one. */
 		if (!i) {
 			if (oidpp)
 				*oidpp = oidp;
 			return (0);
 		}
 
 		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
 			break;
 
 		if (oidp->oid_handler)
 			break;
 
 		/* Descend and isolate the next component of the name. */
 		lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 		oidp = SLIST_FIRST(lsp);
 		name = p+1;
 		for (p = name; *p && *p != '.'; p++) 
 				;
 		i = *p;
 		if (i == '.')
 			*p = '\0';
 	}
 	return (ENOENT);
 }
 
 /*
  * Handler for the "name2oid" OID under _sysctl: the new value supplied by
  * the caller is a name string, and the matching numeric OID is copied out
  * as an array of ints.  Returns ENOENT for an empty or unknown name and
  * ENAMETOOLONG when the name is MAXPATHLEN bytes or longer.
  */
 static int
 sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
 {
 	char *p;
 	int error, oid[CTL_MAXNAME], len;
 	struct sysctl_oid *op = 0;
 
 	SYSCTL_ASSERT_LOCKED();
 
 	if (!req->newlen) 
 		return (ENOENT);
 	if (req->newlen >= MAXPATHLEN)	/* XXX arbitrary, undocumented */
 		return (ENAMETOOLONG);
 
 	/* One extra byte so the copy can always be NUL-terminated below. */
 	p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
 
 	error = SYSCTL_IN(req, p, req->newlen);
 	if (error) {
 		free(p, M_SYSCTL);
 		return (error);
 	}
 
 	p [req->newlen] = '\0';
 
 	/* name2oid() edits the string in place, hence the private copy. */
 	error = name2oid(p, oid, &len, &op);
 
 	free(p, M_SYSCTL);
 
 	if (error)
 		return (error);
 
 	error = SYSCTL_OUT(req, oid, len * sizeof *oid);
 	return (error);
 }
 
 SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MPSAFE,
     0, 0, sysctl_sysctl_name2oid, "I", "");
 
 /*
  * Handler for the "oidfmt" node: look up the OID named by arg1/arg2 and
  * copy out its oid_kind word followed by its NUL-terminated format string.
  * An OID without a format string yields ENOENT.
  */
 static int
 sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
 {
 	struct sysctl_oid *target;
 	int rc;
 
 	rc = sysctl_find_oid(arg1, arg2, &target, NULL, req);
 	if (rc != 0)
 		return (rc);
 	if (target->oid_fmt == NULL)
 		return (ENOENT);
 
 	/* The kind word goes first; the format follows only if that worked. */
 	rc = SYSCTL_OUT(req, &target->oid_kind, sizeof(target->oid_kind));
 	if (rc == 0)
 		rc = SYSCTL_OUT(req, target->oid_fmt,
 		    strlen(target->oid_fmt) + 1);
 	return (rc);
 }
 
 
 static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_sysctl_oidfmt, "");
 
 /*
  * Handler for the "oiddescr" node: look up the OID named by arg1/arg2 and
  * copy out its NUL-terminated description.  An OID without a description
  * yields ENOENT.
  */
 static int
 sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
 {
 	struct sysctl_oid *target;
 	int rc;
 
 	rc = sysctl_find_oid(arg1, arg2, &target, NULL, req);
 	if (rc != 0)
 		return (rc);
 	if (target->oid_descr == NULL)
 		return (ENOENT);
 
 	return (SYSCTL_OUT(req, target->oid_descr,
 	    strlen(target->oid_descr) + 1));
 }
 
 static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, "");
 
 /*
  * Default "handler" functions.
  */
 
 /*
  * Handle an int, signed or unsigned.
  * Two cases:
  *     a variable:  point arg1 at it.
  *     a constant:  pass it in arg2.
  *
  * The current value is always copied out first.  If the request carries a
  * new value it is copied straight into the variable; a write aimed at a
  * constant (arg1 == NULL) fails with EPERM.
  */
 
 int
 sysctl_handle_int(SYSCTL_HANDLER_ARGS)
 {
 	int tmpout, error = 0;
 
 	/*
 	 * Attempt to get a coherent snapshot by making a copy of the data.
 	 */
 	if (arg1)
 		tmpout = *(int *)arg1;
 	else
 		tmpout = arg2;
 	error = SYSCTL_OUT(req, &tmpout, sizeof(int));
 
 	/* Done if the copy-out failed or this is a read-only request. */
 	if (error || !req->newptr)
 		return (error);
 
 	if (!arg1)
 		error = EPERM;
 	else
 		error = SYSCTL_IN(req, arg1, sizeof(int));
 	return (error);
 }
 
 /*
  * Based on sysctl_handle_int() convert milliseconds into ticks.
  * Note: this is used by TCP.
  */
 
 int
 sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
 {
 	int error, s, tt;
 
 	tt = *(int *)arg1;
 	s = (int)((int64_t)tt * 1000 / hz);
 
 	error = sysctl_handle_int(oidp, &s, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	tt = (int)((int64_t)s * hz / 1000);
 	if (tt < 1)
 		return (EINVAL);
 
 	*(int *)arg1 = tt;
 	return (0);
 }
 
 
 /*
  * Handle a long, signed or unsigned.  arg1 points to it.
  */
 
 /*
  * Copy the long that arg1 points to out to the caller and, when the request
  * carries a new value, read the replacement back in.  Requests flagged
  * SCTL_MASK32 exchange the value as an int instead of a long.
  * Returns EINVAL when arg1 is NULL, otherwise 0 or the copy error.
  */
 int
 sysctl_handle_long(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	long tmplong;
 #ifdef SCTL_MASK32
 	int tmpint;
 #endif
 
 	/*
 	 * Attempt to get a coherent snapshot by making a copy of the data.
 	 */
 	if (!arg1)
 		return (EINVAL);
 	tmplong = *(long *)arg1;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		tmpint = tmplong;
 		error = SYSCTL_OUT(req, &tmpint, sizeof(int));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &tmplong, sizeof(long));
 
 	if (error || !req->newptr)
 		return (error);
 
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		error = SYSCTL_IN(req, &tmpint, sizeof(int));
 		/*
 		 * Commit only after a successful copy-in.  On failure
 		 * tmpint still holds the truncated old value, and storing
 		 * it would clip the variable to 32 bits.
 		 */
 		if (error == 0)
 			*(long *)arg1 = (long)tmpint;
 	} else
 #endif
 		error = SYSCTL_IN(req, arg1, sizeof(long));
 	return (error);
 }
 
 /*
  * Handle a 64 bit int, signed or unsigned.  arg1 points to it.
  */
 
 int
 sysctl_handle_quad(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	uint64_t tmpout;
 
 	/*
 	 * Attempt to get a coherent snapshot by making a copy of the data.
 	 */
 	if (!arg1)
 		return (EINVAL);
 	tmpout = *(uint64_t *)arg1;
 	error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
 
 	if (error || !req->newptr)
 		return (error);
 
 	error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
 	return (error);
 }
 
 /*
  * Handle our generic '\0' terminated 'C' string.
  * Two cases:
  * 	a variable string:  point arg1 at it, arg2 is max length.
  * 	a constant string:  point arg1 at it, arg2 is zero.
  */
 
 int
 sysctl_handle_string(SYSCTL_HANDLER_ARGS)
 {
 	int error=0;
 	char *tmparg;
 	size_t outlen;
 
 	/*
 	 * Attempt to get a coherent snapshot by copying to a
 	 * temporary kernel buffer.
 	 */
 retry:
 	outlen = strlen((char *)arg1)+1;
 	tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
 
 	/*
 	 * strlcpy() reports the source length; if it no longer fits in the
 	 * buffer just sized, measure and copy again.
 	 */
 	if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) {
 		free(tmparg, M_SYSCTLTMP);
 		goto retry;
 	}
 
 	error = SYSCTL_OUT(req, tmparg, outlen);
 	free(tmparg, M_SYSCTLTMP);
 
 	if (error || !req->newptr)
 		return (error);
 
 	/*
 	 * The new string plus its NUL must fit in arg2 bytes.  With arg2 == 0
 	 * (constant string) this test always fails, so writes get EINVAL.
 	 */
 	if ((req->newlen - req->newidx) >= arg2) {
 		error = EINVAL;
 	} else {
 		arg2 = (req->newlen - req->newidx);
 		error = SYSCTL_IN(req, arg1, arg2);
 		((char *)arg1)[arg2] = '\0';
 	}
 
 	return (error);
 }
 
 /*
  * Handle any kind of opaque data.
  * arg1 points to it, arg2 is the size.
  */
 
 int
 sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
 {
 	int error, tries;
 	u_int generation;
 	struct sysctl_req req2;
 
 	/*
 	 * Attempt to get a coherent snapshot, by using the thread
 	 * pre-emption counter updated from within mi_switch() to
 	 * determine if we were pre-empted during a bcopy() or
 	 * copyout(). Make 3 attempts at doing this before giving up.
 	 * If we encounter an error, stop immediately.
 	 */
 	tries = 0;
 	/* Saved request state, restored before each retry of the copy-out. */
 	req2 = *req;
 retry:
 	generation = curthread->td_generation;
 	error = SYSCTL_OUT(req, arg1, arg2);
 	if (error)
 		return (error);
 	tries++;
 	if (generation != curthread->td_generation && tries < 3) {
 		*req = req2;
 		goto retry;
 	}
 
 	/* Read the new contents, if the request supplies any. */
 	error = SYSCTL_IN(req, arg1, arg2);
 
 	return (error);
 }
 
 /*
  * Transfer functions to/from kernel space.
  * XXX: rather untested at this point
  */
 static int
 sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
 {
 	size_t avail, ncopy;
 
 	/* No output buffer: only account for the bytes that would be sent. */
 	if (req->oldptr == NULL) {
 		req->oldidx += l;
 		return (0);
 	}
 
 	/* Copy whatever still fits; oldidx advances by the full length. */
 	avail = (req->oldlen > req->oldidx) ? req->oldlen - req->oldidx : 0;
 	ncopy = (l < avail) ? l : avail;
 	if (ncopy > 0)
 		bcopy(p, (char *)req->oldptr + req->oldidx, ncopy);
 	req->oldidx += l;
 
 	/* A short copy is reported as ENOMEM. */
 	return (ncopy == l ? 0 : ENOMEM);
 }
 
 /*
  * Fetch l bytes of the new value from a kernel-space request buffer into p.
  * A request without a new value is a successful no-op; asking for more
  * bytes than remain unread yields EINVAL.
  */
 static int
 sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
 {
 	const char *src;
 
 	if (req->newptr == NULL)
 		return (0);
 	if (l > req->newlen - req->newidx)
 		return (EINVAL);
 
 	src = (char *)req->newptr + req->newidx;
 	bcopy(src, p, l);
 	req->newidx += l;
 	return (0);
 }
 
 /*
  * Run a sysctl request on behalf of kernel code.  old and new are moved
  * with bcopy() by sysctl_old_kernel() / sysctl_new_kernel().
  *
  * name/namelen	numeric OID to access
  * old/oldlenp	output buffer and its size (either may be NULL)
  * new/newlen	replacement value, or NULL for a read
  * retval	if not NULL, receives the number of bytes produced, capped
  *		at the valid buffer length when an output buffer was given
  *
  * ENOMEM is not treated as fatal: *retval is still filled in and ENOMEM
  * is returned.  Any other error is returned without touching *retval.
  */
 int
 kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
     size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
 {
 	int error = 0;
 	struct sysctl_req req;
 
 	bzero(&req, sizeof req);
 
 	req.td = td;
 	req.flags = flags;
 
 	if (oldlenp) {
 		req.oldlen = *oldlenp;
 	}
 	req.validlen = req.oldlen;
 
 	if (old) {
 		req.oldptr= old;
 	}
 
 	if (new != NULL) {
 		req.newlen = newlen;
 		req.newptr = new;
 	}
 
 	req.oldfunc = sysctl_old_kernel;
 	req.newfunc = sysctl_new_kernel;
 	req.lock = REQ_LOCKED;
 
 	SYSCTL_SLOCK();
 	error = sysctl_root(0, name, namelen, &req);
 	SYSCTL_SUNLOCK();
 
 	/* Undo any wiring of the output buffer done while handling. */
 	if (req.lock == REQ_WIRED && req.validlen > 0)
 		vsunlock(req.oldptr, req.validlen);
 
 	if (error && error != ENOMEM)
 		return (error);
 
 	if (retval) {
 		if (req.oldptr && req.oldidx > req.validlen)
 			*retval = req.validlen;
 		else
 			*retval = req.oldidx;
 	}
 	return (error);
 }
 
 /*
  * Like kernel_sysctl(), but the OID is given as a name string.  The name
  * is first translated through the internal {0, 3} (name2oid) OID and the
  * resulting numeric OID is then used for the real request.
  */
 int
 kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
     void *new, size_t newlen, size_t *retval, int flags)
 {
         int oid[CTL_MAXNAME];
         size_t oidlen, plen;
 	int error;
 
 	oid[0] = 0;		/* sysctl internal magic */
 	oid[1] = 3;		/* name2oid */
 	oidlen = sizeof(oid);
 
 	/*
 	 * oid[] serves as both the two-element request name and the output
 	 * buffer; plen receives the size of the translated OID in bytes.
 	 */
 	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
 	    (void *)name, strlen(name), &plen, flags);
 	if (error)
 		return (error);
 
 	error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
 	    new, newlen, retval, flags);
 	return (error);
 }
 
 /*
  * Transfer function to/from user space.
  */
 static int
 sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 {
 	int error = 0;
 	size_t i, len, origidx;
 
 	/*
 	 * oldidx always advances by the full length, so it ends up holding
 	 * the total size produced even when nothing is copied.
 	 */
 	origidx = req->oldidx;
 	req->oldidx += l;
 	if (req->oldptr == NULL)
 		return (0);
 	/*
 	 * If we have not wired the user supplied buffer and we are currently
 	 * holding locks, drop a witness warning, as it's possible that
 	 * write operations to the user page can sleep.
 	 */
 	if (req->lock != REQ_WIRED)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "sysctl_old_user()");
 	/* Copy out only the part that fits below validlen. */
 	i = l;
 	len = req->validlen;
 	if (len <= origidx)
 		i = 0;
 	else {
 		if (i > len - origidx)
 			i = len - origidx;
 		error = copyout(p, (char *)req->oldptr + origidx, i);
 	}
 	if (error)
 		return (error);
 	/* A truncated copy is reported as ENOMEM. */
 	if (i < l)
 		return (ENOMEM);
 	return (0);
 }
 
 /*
  * Fetch l bytes of the new value from the user-space request buffer into p
  * with copyin().  A request without a new value is a successful no-op;
  * asking for more bytes than remain unread yields EINVAL.
  */
 static int
 sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
 {
 	int error;
 
 	if (!req->newptr)
 		return (0);
 	if (req->newlen - req->newidx < l)
 		return (EINVAL);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "sysctl_new_user()");
 	error = copyin((char *)req->newptr + req->newidx, p, l);
 	/* newidx advances even when copyin() reported an error. */
 	req->newidx += l;
 	return (error);
 }
 
 /*
  * Wire the user space destination buffer.  If set to a value greater than
  * zero, the len parameter limits the maximum amount of wired memory.
  */
 int
 sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
 {
 	int ret;
 	size_t wiredlen;
 
 	wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
 	ret = 0;
 	/*
 	 * Only a not-yet-wired request whose output goes to user space is
 	 * wired; anything else is left untouched and reported as success.
 	 */
 	if (req->lock == REQ_LOCKED && req->oldptr &&
 	    req->oldfunc == sysctl_old_user) {
 		if (wiredlen != 0) {
 			ret = vslock(req->oldptr, wiredlen);
 			if (ret != 0) {
 				if (ret != ENOMEM)
 					return (ret);
 				/*
 				 * ENOMEM is not passed on: the request is
 				 * marked wired with a valid length of zero.
 				 */
 				wiredlen = 0;
 			}
 		}
 		req->lock = REQ_WIRED;
 		req->validlen = wiredlen;
 	}
 	return (0);
 }
 
 /*
  * Walk the OID tree from the root following name[0 .. namelen-1].
  *
  * On success returns 0 with *noid set to the OID found and, when nindx is
  * not NULL, *nindx set to the number of name components consumed.  The
  * walk stops early at a node that has its own handler, so *nindx can be
  * smaller than namelen.  req->lock is set to REQ_UNLOCKED if any matched
  * OID carries CTLFLAG_NOLOCK.
  *
  * Returns ENOTDIR when a leaf is reached with components left over and
  * ENOENT when a component is not found or CTL_MAXNAME levels are used up.
  */
 int
 sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
     int *nindx, struct sysctl_req *req)
 {
 	struct sysctl_oid *oid;
 	int indx;
 
 	SYSCTL_ASSERT_LOCKED();
 	oid = SLIST_FIRST(&sysctl__children);
 	indx = 0;
 	while (oid && indx < CTL_MAXNAME) {
 		if (oid->oid_number == name[indx]) {
 			indx++;
 			if (oid->oid_kind & CTLFLAG_NOLOCK)
 				req->lock = REQ_UNLOCKED;
 			if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 				if (oid->oid_handler != NULL ||
 				    indx == namelen) {
 					*noid = oid;
 					if (nindx != NULL)
 						*nindx = indx;
 					return (0);
 				}
 				/* Plain node: continue in its child list. */
 				oid = SLIST_FIRST(
 				    (struct sysctl_oid_list *)oid->oid_arg1);
 			} else if (indx == namelen) {
 				*noid = oid;
 				if (nindx != NULL)
 					*nindx = indx;
 				return (0);
 			} else {
 				return (ENOTDIR);
 			}
 		} else {
 			oid = SLIST_NEXT(oid, oid_link);
 		}
 	}
 	return (ENOENT);
 }
 
 /*
  * Traverse our tree, and find the right node, execute whatever it points
  * to, and return the resulting error code.
  */
 
 static int
 sysctl_root(SYSCTL_HANDLER_ARGS)
 {
 	struct sysctl_oid *oid;
 	int error, indx, lvl;
 
 	SYSCTL_ASSERT_LOCKED();
 
 	/* Here arg1/arg2 are the numeric name and its length. */
 	error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
 	if (error)
 		return (error);
 
 	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 		/*
 		 * You can't call a sysctl when it's a node, but has
 		 * no handler.  Inform the user that it's a node.
 		 * The indx may or may not be the same as namelen.
 		 */
 		if (oid->oid_handler == NULL)
 			return (EISDIR);
 	}
 
 	/* Is this sysctl writable? */
 	if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
 		return (EPERM);
 
 	KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
 
 	/* Is this sysctl sensitive to securelevels? */
 	if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
 		lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
 		error = securelevel_gt(req->td->td_ucred, lvl);
 		if (error)
 			return (error);
 	}
 
 	/* Is this sysctl writable by only privileged users? */
 	if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
 		int priv;
 
 		if (oid->oid_kind & CTLFLAG_PRISON)
 			priv = PRIV_SYSCTL_WRITEJAIL;
 #ifdef VIMAGE
 		else if ((oid->oid_kind & CTLFLAG_VNET) &&
 		     prison_owns_vnet(req->td->td_ucred))
 			priv = PRIV_SYSCTL_WRITEJAIL;
 #endif
 		else
 			priv = PRIV_SYSCTL_WRITE;
 		error = priv_check(req->td, priv);
 		if (error)
 			return (error);
 	}
 
 	if (!oid->oid_handler)
 		return (EINVAL);
 
 	/*
 	 * A node handler receives the unconsumed rest of the name; any other
 	 * handler receives the arguments stored in the OID itself.
 	 */
 	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 		arg1 = (int *)arg1 + indx;
 		arg2 -= indx;
 	} else {
 		arg1 = oid->oid_arg1;
 		arg2 = oid->oid_arg2;
 	}
 #ifdef MAC
 	error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
 	    req);
 	if (error != 0)
 		return (error);
 #endif
 	/* Handlers not marked CTLFLAG_MPSAFE are run with Giant held. */
 	if (!(oid->oid_kind & CTLFLAG_MPSAFE))
 		mtx_lock(&Giant);
 	error = oid->oid_handler(oid, arg1, arg2, req);
 	if (!(oid->oid_kind & CTLFLAG_MPSAFE))
 		mtx_unlock(&Giant);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sysctl_args {
 	int	*name;
 	u_int	namelen;
 	void	*old;
 	size_t	*oldlenp;
 	void	*new;
 	size_t	newlen;
 };
 #endif
 /*
  * System call entry point: copy the numeric name in from user space, run
  * the request through userland_sysctl() and write the resulting length
  * back through uap->oldlenp.  Names with fewer than 2 or more than
  * CTL_MAXNAME components are rejected with EINVAL.
  */
 int
 __sysctl(struct thread *td, struct sysctl_args *uap)
 {
 	int error, i, name[CTL_MAXNAME];
 	size_t j;
 
 	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
 		return (EINVAL);
 
  	error = copyin(uap->name, &name, uap->namelen * sizeof(int));
  	if (error)
 		return (error);
 
 	error = userland_sysctl(td, name, uap->namelen,
 		uap->old, uap->oldlenp, 0,
 		uap->new, uap->newlen, &j, 0);
 	/* On ENOMEM the length is still written back before returning it. */
 	if (error && error != ENOMEM)
 		return (error);
 	if (uap->oldlenp) {
 		i = copyout(&j, uap->oldlenp, sizeof(j));
 		if (i)
 			return (i);
 	}
 	return (error);
 }
 
 /*
  * This is used from various compatibility syscalls too.  That's why name
  * must be in kernel space.
  */
 int
 userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
     size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
     int flags)
 {
 	int error = 0, memlocked;
 	struct sysctl_req req;
 
 	bzero(&req, sizeof req);
 
 	req.td = td;
 	req.flags = flags;
 
 	/* inkernel says whether oldlenp itself is a kernel pointer. */
 	if (oldlenp) {
 		if (inkernel) {
 			req.oldlen = *oldlenp;
 		} else {
 			error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
 			if (error)
 				return (error);
 		}
 	}
 	req.validlen = req.oldlen;
 
 	if (old) {
 		if (!useracc(old, req.oldlen, VM_PROT_WRITE))
 			return (EFAULT);
 		req.oldptr= old;
 	}
 
 	if (new != NULL) {
 		if (!useracc(new, newlen, VM_PROT_READ))
 			return (EFAULT);
 		req.newlen = newlen;
 		req.newptr = new;
 	}
 
 	req.oldfunc = sysctl_old_user;
 	req.newfunc = sysctl_new_user;
 	req.lock = REQ_LOCKED;
 
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_SYSCTL))
 		ktrsysctl(name, namelen);
 #endif
 
 	/* Requests with more than a page of output hold sysctlmemlock. */
 	if (req.oldlen > PAGE_SIZE) {
 		memlocked = 1;
 		sx_xlock(&sysctlmemlock);
 	} else
 		memlocked = 0;
 	CURVNET_SET(TD_TO_VNET(td));
 
 	/* Restart the request from scratch for as long as it gets EAGAIN. */
 	for (;;) {
 		req.oldidx = 0;
 		req.newidx = 0;
 		SYSCTL_SLOCK();
 		error = sysctl_root(0, name, namelen, &req);
 		SYSCTL_SUNLOCK();
 		if (error != EAGAIN)
 			break;
 		uio_yield();
 	}
 
 	CURVNET_RESTORE();
 
 	if (req.lock == REQ_WIRED && req.validlen > 0)
 		vsunlock(req.oldptr, req.validlen);
 	if (memlocked)
 		sx_xunlock(&sysctlmemlock);
 
 	/* ENOMEM still reports the length through retval. */
 	if (error && error != ENOMEM)
 		return (error);
 
 	if (retval) {
 		if (req.oldptr && req.oldidx > req.validlen)
 			*retval = req.validlen;
 		else
 			*retval = req.oldidx;
 	}
 	return (error);
+}
+
+/*
+ * Drain into a sysctl struct.  The user buffer must be wired.
+ *
+ * arg is the struct sysctl_req to write to; data/len describe the bytes
+ * to push out with SYSCTL_OUT().  Returns len when everything was written,
+ * otherwise the SYSCTL_OUT() error code negated.
+ */
+static int
+sbuf_sysctl_drain(void *arg, const char *data, int len)
+{
+	struct sysctl_req *req = arg;
+	int error;
+
+	error = SYSCTL_OUT(req, data, len);
+	KASSERT(error >= 0, ("Got unexpected negative value %d", error));
+	return (error == 0 ? len : -error);
+}
+
+struct sbuf *
+sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
+    struct sysctl_req *req)
+{
+
+	/*
+	 * Wire the user buffer, so we can write without blocking.
+	 * The return value of sysctl_wire_old_buffer() is not checked here.
+	 * The sbuf itself is created SBUF_FIXEDLEN from s/buf/length, with
+	 * sbuf_sysctl_drain() installed as its drain and req as the argument
+	 * passed to it.
+	 */
+	sysctl_wire_old_buffer(req, 0);
+
+	s = sbuf_new(s, buf, length, SBUF_FIXEDLEN);
+	sbuf_set_drain(s, sbuf_sysctl_drain, req);
+	return (s);
 }
Index: head/sys/kern/subr_lock.c
===================================================================
--- head/sys/kern/subr_lock.c	(revision 212369)
+++ head/sys/kern/subr_lock.c	(revision 212370)
@@ -1,669 +1,661 @@
 /*-
  * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * This module holds the global variables and functions used to maintain
  * lock_object structures.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_mprof.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker_set.h>
 #include <sys/lock.h>
 #include <sys/lock_profile.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/cpufunc.h>
 
 CTASSERT(LOCK_CLASS_MAX == 15);
 
 struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = {
 	&lock_class_mtx_spin,
 	&lock_class_mtx_sleep,
 	&lock_class_sx,
 	&lock_class_rm,
 	&lock_class_rw,
 	&lock_class_lockmgr,
 };
 
 /*
  * Initialize a lock_object: record the index of its class from
  * lock_classes[] in lo_flags, store the name, OR in the caller's flags plus
  * LO_INITIALIZED, and run the LOCK_LOG_INIT and WITNESS_INIT hooks.
  * When type is NULL, name is passed to WITNESS_INIT in its place.
  */
 void
 lock_init(struct lock_object *lock, struct lock_class *class, const char *name,
     const char *type, int flags)
 {
 	int i;
 
 	/* Check for double-init and zero object. */
 	KASSERT(!lock_initalized(lock), ("lock \"%s\" %p already initialized",
 	    name, lock));
 
 	/* Look up lock class to find its index. */
 	for (i = 0; i < LOCK_CLASS_MAX; i++)
 		if (lock_classes[i] == class) {
 			/* This assignment replaces any previous lo_flags. */
 			lock->lo_flags = i << LO_CLASSSHIFT;
 			break;
 		}
 	KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class));
 
 	/* Initialize the lock object. */
 	lock->lo_name = name;
 	lock->lo_flags |= flags | LO_INITIALIZED;
 	LOCK_LOG_INIT(lock, 0);
 	WITNESS_INIT(lock, (type != NULL) ? type : name);
 }
 
 /*
  * Tear down a lock_object set up by lock_init(): run the WITNESS_DESTROY
  * and LOCK_LOG_DESTROY hooks and clear LO_INITIALIZED.
  */
 void
 lock_destroy(struct lock_object *lock)
 {
 
 	KASSERT(lock_initalized(lock), ("lock %p is not initialized", lock));
 	WITNESS_DESTROY(lock);
 	LOCK_LOG_DESTROY(lock, 0);
 	lock->lo_flags &= ~LO_INITIALIZED;
 }
 
 #ifdef DDB
 /*
  * DDB "show lock" command: treat addr as a struct lock_object pointer,
  * print its class and name, then call the class's lc_ddb_show method.
  * Does nothing when no address was supplied.
  */
 DB_SHOW_COMMAND(lock, db_show_lock)
 {
 	struct lock_object *lock;
 	struct lock_class *class;
 
 	if (!have_addr)
 		return;
 	lock = (struct lock_object *)addr;
 	/* Refuse class indexes past the end of the class table. */
 	if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) {
 		db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock));
 		return;
 	}
 	class = LOCK_CLASS(lock);
 	db_printf(" class: %s\n", class->lc_name);
 	db_printf(" name: %s\n", lock->lo_name);
 	class->lc_ddb_show(lock);
 }
 #endif
 
 #ifdef LOCK_PROFILING
 
 /*
  * One object per-thread for each lock the thread owns.  Tracks individual
  * lock instances.
  *
  * lock_profile_object_lookup() matches entries on the (lpo_obj, lpo_file,
  * lpo_line) triple and zeroes lpo_cnt when it hands out a fresh entry.
  */
 struct lock_profile_object {
 	LIST_ENTRY(lock_profile_object) lpo_link; /* per-thread or free list */
 	struct lock_object *lpo_obj;
 	const char	*lpo_file;
 	int		lpo_line;
 	uint16_t	lpo_ref;
 	uint16_t	lpo_cnt;
 	uint64_t	lpo_acqtime;
 	uint64_t	lpo_waittime;
 	u_int		lpo_contest_locking;
 };
 
 /*
  * One lock_prof for each (file, line, lock object) triple.
  *
  * The column names quoted below are the headings dump_lock_prof_stats()
  * prints for the values lock_prof_output() derives from each field.
  */
 struct lock_prof {
 	SLIST_ENTRY(lock_prof) link;	/* hash chain or free-list linkage */
 	struct lock_class *class;
 	const char	*file;
 	const char	*name;
 	int		line;
 	int		ticks;		/* stamp of the last dump that summed it */
 	uintmax_t	cnt_wait_max;	/* "wait_max" column */
 	uintmax_t	cnt_max;	/* "max" column */
 	uintmax_t	cnt_tot;	/* "total" column */
 	uintmax_t	cnt_wait;	/* "wait_total" column */
 	uintmax_t	cnt_cur;	/* "count" column */
 	uintmax_t	cnt_contest_locking;	/* "cnt_lock" column */
 };
 
 SLIST_HEAD(lphead, lock_prof);
 
 #define	LPROF_HASH_SIZE		4096
 #define	LPROF_HASH_MASK		(LPROF_HASH_SIZE - 1)
 #define	LPROF_CACHE_SIZE	4096
 
 /*
  * Array of objects and profs for each type of object for each cpu.  Spinlocks
  * are handled separately because a thread may be preempted and acquire a
  * spinlock while in the lock profiling code of a non-spinlock.  In this way
  * we only need a critical section to protect the per-cpu lists.
  */
 struct lock_prof_type {
 	struct lphead		lpt_lpalloc;
 	struct lpohead		lpt_lpoalloc;
 	struct lphead		lpt_hash[LPROF_HASH_SIZE];
 	struct lock_prof	lpt_prof[LPROF_CACHE_SIZE];
 	struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE];
 };
 
 struct lock_prof_cpu {
 	struct lock_prof_type	lpc_types[2]; /* One for spin one for other. */
 };
 
 struct lock_prof_cpu *lp_cpu[MAXCPU];
 
 volatile int lock_prof_enable = 0;
 static volatile int lock_prof_resetting;
 
-/* SWAG: sbuf size = avg stat. line size * number of locks */
-#define LPROF_SBUF_SIZE		256 * 400
+#define LPROF_SBUF_SIZE		256
 
 static int lock_prof_rejected;
 static int lock_prof_skipspin;
 static int lock_prof_skipcount;
 
 #ifndef USE_CPU_NANOSECONDS
 uint64_t
 nanoseconds(void)
 {
 	struct bintime bt;
 	uint64_t ns;
 
 	binuptime(&bt);
 	/* From bintime2timespec */
 	ns = bt.sec * (uint64_t)1000000000;
 	ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
 	return (ns);
 }
 #endif
 
 /*
  * Reset the two free lists of a lock_prof_type and put every preallocated
  * lock_prof and lock_profile_object of its caches on them.
  */
 static void
 lock_prof_init_type(struct lock_prof_type *type)
 {
 	struct lock_prof *lp;
 	struct lock_profile_object *lpo;
 	int slot;
 
 	SLIST_INIT(&type->lpt_lpalloc);
 	LIST_INIT(&type->lpt_lpoalloc);
 	for (slot = 0; slot < LPROF_CACHE_SIZE; slot++) {
 		lp = &type->lpt_prof[slot];
 		lpo = &type->lpt_objs[slot];
 		SLIST_INSERT_HEAD(&type->lpt_lpalloc, lp, link);
 		LIST_INSERT_HEAD(&type->lpt_lpoalloc, lpo, lpo_link);
 	}
 }
 
 /*
  * SYSINIT hook: allocate a zeroed lock_prof_cpu for every CPU id from 0 to
  * mp_maxid and set up the free lists of both of its lock_prof_type slots.
  * arg is unused.
  */
 static void
 lock_prof_init(void *arg)
 {
 	int cpu;
 
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF,
 		    M_WAITOK | M_ZERO);
 		lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]);
 		lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]);
 	}
 }
 SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL);
 
 /*
  * To be certain that lock profiling has idled on all cpus before we
  * reset, we schedule the resetting thread on all active cpus.  Since
  * all operations happen within critical sections we can be sure that
  * it is safe to zero the profiling structures.
  */
 static void
 lock_prof_idle(void)
 {
 	struct thread *td;
 	int cpu;
 
 	td = curthread;
 	thread_lock(td);
 	/* Bind the calling thread to each CPU in turn, then release it. */
 	CPU_FOREACH(cpu) {
 		sched_bind(td, cpu);
 	}
 	sched_unbind(td);
 	thread_unlock(td);
 }
 
 /*
  * Yield the CPU repeatedly until lock_prof_resetting has been cleared,
  * which lock_prof_reset() does at the end of its work.
  */
 static void
 lock_prof_reset_wait(void)
 {
 
 	/*
 	 * Spin relinquishing our cpu so that lock_prof_idle may
 	 * run on it.
 	 */
 	while (lock_prof_resetting)
 		sched_relinquish(curthread);
 }
 
 /*
  * Discard all collected lock profiling data.  Profiling is switched off
  * for the duration, every per-CPU structure is zeroed and its free lists
  * rebuilt, and the previous enable state is restored at the end.
  */
 static void
 lock_prof_reset(void)
 {
 	struct lock_prof_cpu *lpc;
 	int enabled, i, cpu;
 
 	/*
 	 * We not only race with acquiring and releasing locks but also
 	 * thread exit.  To be certain that threads exit without valid head
 	 * pointers they must see resetting set before enabled is cleared.
 	 * Otherwise a lock may not be removed from a per-thread list due
 	 * to disabled being set but not wait for reset() to remove it below.
 	 */
 	atomic_store_rel_int(&lock_prof_resetting, 1);
 	enabled = lock_prof_enable;
 	lock_prof_enable = 0;
 	lock_prof_idle();
 	/*
 	 * Some objects may have migrated between CPUs.  Clear all links
 	 * before we zero the structures.  Some items may still be linked
 	 * into per-thread lists as well.
 	 */
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		lpc = lp_cpu[cpu];
 		for (i = 0; i < LPROF_CACHE_SIZE; i++) {
 			LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link);
 			LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link);
 		}
 	}
 	/* Second pass: wipe everything and rebuild the free lists. */
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		lpc = lp_cpu[cpu];
 		bzero(lpc, sizeof(*lpc));
 		lock_prof_init_type(&lpc->lpc_types[0]);
 		lock_prof_init_type(&lpc->lpc_types[1]);
 	}
 	atomic_store_rel_int(&lock_prof_resetting, 0);
 	lock_prof_enable = enabled;
 }
 
 /*
  * Append one formatted statistics line for lp to the sbuf sb.  The max,
  * wait_max, total and wait_total values are printed divided by 1000, the
  * two averages are computed per acquisition (0 when cnt_cur is 0), and the
  * eighth column is always printed as 0.
  */
 static void
 lock_prof_output(struct lock_prof *lp, struct sbuf *sb)
 {
 	const char *p;
 
 	/* Skip any run of leading "../" components in the file name. */
 	for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3);
 	sbuf_printf(sb,
 	    "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n",
 	    lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000,
 	    lp->cnt_wait / 1000, lp->cnt_cur,
 	    lp->cnt_cur == 0 ? (uintmax_t)0 :
 	    lp->cnt_tot / (lp->cnt_cur * 1000),
 	    lp->cnt_cur == 0 ? (uintmax_t)0 :
 	    lp->cnt_wait / (lp->cnt_cur * 1000),
 	    (uintmax_t)0, lp->cnt_contest_locking,
 	    p, lp->line, lp->class->lc_name, lp->name);
 }
 
 /*
  * Fold the records of every CPU that describe the same acquisition point
  * as match (same file, line and name pointers) into dst.  Only the hash
  * bucket `hash' of the `spin' type is searched.  Maxima are kept as
  * maxima, the other counters are added up.
  *
  * t is the stamp of the current dump: records already carrying it are
  * skipped, and each record summed here is stamped so that it is not
  * reported again.  dst's counters are accumulated into, not cleared.
  */
 static void
 lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash,
     int spin, int t)
 {
 	struct lock_prof_type *type;
 	struct lock_prof *l;
 	int cpu;
 
 	dst->file = match->file;
 	dst->line = match->line;
 	dst->class = match->class;
 	dst->name = match->name;
 
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (lp_cpu[cpu] == NULL)
 			continue;
 		type = &lp_cpu[cpu]->lpc_types[spin];
 		SLIST_FOREACH(l, &type->lpt_hash[hash], link) {
 			if (l->ticks == t)
 				continue;
 			if (l->file != match->file || l->line != match->line ||
 			    l->name != match->name)
 				continue;
 			l->ticks = t;
 			if (l->cnt_max > dst->cnt_max)
 				dst->cnt_max = l->cnt_max;
 			if (l->cnt_wait_max > dst->cnt_wait_max)
 				dst->cnt_wait_max = l->cnt_wait_max;
 			dst->cnt_tot += l->cnt_tot;
 			dst->cnt_wait += l->cnt_wait;
 			dst->cnt_cur += l->cnt_cur;
 			dst->cnt_contest_locking += l->cnt_contest_locking;
 		}
 	}
 	
 }
 
 /*
  * Walk every hash bucket of one lock_prof_type and, for each record not
  * yet stamped with t, print one line holding the totals that
  * lock_prof_sum() gathers for that acquisition point across all CPUs.
  */
 static void
 lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin,
     int t)
 {
 	struct lock_prof *l;
 	int i;
 
 	for (i = 0; i < LPROF_HASH_SIZE; ++i) {
 		SLIST_FOREACH(l, &type->lpt_hash[i], link) {
 			/* Zero-initialized accumulator for the summed record. */
 			struct lock_prof lp = {};
 
 			if (l->ticks == t)
 				continue;
 			lock_prof_sum(l, &lp, i, spin, t);
 			lock_prof_output(&lp, sb);
-			if (sbuf_overflowed(sb))
-				return;
 		}
 	}
 }
 
 /*
  * Sysctl handler that renders the lock profiling statistics as text: a
  * header line naming the columns, followed by the lines produced by
  * lock_prof_type_stats() for both lock types of every CPU.
  */
 static int
 dump_lock_prof_stats(SYSCTL_HANDLER_ARGS)
 {
-	static int multiplier = 1;
 	struct sbuf *sb;
 	int error, cpu, t;
 	int enabled;
 
-retry_sbufops:
-	sb = sbuf_new(NULL, NULL, LPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN);
+	sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req);
 	sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n",
 	    "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name");
 	/* Profiling is off while the tables are read, then restored. */
 	enabled = lock_prof_enable;
 	lock_prof_enable = 0;
 	lock_prof_idle();
 	/* t stamps the records reported during this dump. */
 	t = ticks;
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (lp_cpu[cpu] == NULL)
 			continue;
 		lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t);
 		lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t);
-		if (sbuf_overflowed(sb)) {
-			sbuf_delete(sb);
-			multiplier++;
-			goto retry_sbufops;
-		}
 	}
 	lock_prof_enable = enabled;
 
-	sbuf_finish(sb);
-	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+	error = sbuf_finish(sb);
+	/* Output a trailing NUL. */
+	if (error == 0)
+		error = SYSCTL_OUT(req, "", 1);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Sysctl handler for the lock profiling enable switch.  Reads return the
  * current setting.  A write stores the new value normalized to 0 or 1;
  * writing the value 1 while it differs from the current setting first
  * resets the collected statistics.
  */
 static int
 enable_lock_prof(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	/* Work on a local copy so the handler never writes the global. */
 	v = lock_prof_enable;
 	error = sysctl_handle_int(oidp, &v, v, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == lock_prof_enable)
 		return (0);
 	if (v == 1)
 		lock_prof_reset();
 	lock_prof_enable = !!v;
 
 	return (0);
 }
 
 /*
  * Sysctl handler for the statistics reset knob.  Reads always report 0;
  * writing any non-zero value triggers lock_prof_reset().
  */
 static int
 reset_lock_prof_stats(SYSCTL_HANDLER_ARGS)
 {
 	int request, rc;
 
 	request = 0;
 	rc = sysctl_handle_int(oidp, &request, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (request != 0)
 		lock_prof_reset();
 	return (0);
 }
 
 /*
  * Find, or create, the current CPU's lock_prof record for lock lo acquired
  * at file:line.  spin selects which of the two per-CPU types is used.
  * A NULL or empty file is recorded as "(unknown)".
  *
  * Returns NULL, after bumping lock_prof_rejected, when the record does not
  * exist yet and the free list is empty.
  */
 static struct lock_prof *
 lock_profile_lookup(struct lock_object *lo, int spin, const char *file,
     int line)
 {
 	const char *unknown = "(unknown)";
 	struct lock_prof_type *type;
 	struct lock_prof *lp;
 	struct lphead *head;
 	const char *p;
 	u_int hash;
 
 	p = file;
 	if (p == NULL || *p == '\0')
 		p = unknown;
 	/* The hash mixes the name and file pointer values with the line. */
 	hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line;
 	hash &= LPROF_HASH_MASK;
 	type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
 	head = &type->lpt_hash[hash];
 	SLIST_FOREACH(lp, head, link) {
 		/* Strings are compared by pointer, not by content. */
 		if (lp->line == line && lp->file == p &&
 		    lp->name == lo->lo_name)
 			return (lp);
 
 	}
 	/* Not found: take a record from the free list and hash it in. */
 	lp = SLIST_FIRST(&type->lpt_lpalloc);
 	if (lp == NULL) {
 		lock_prof_rejected++;
 		return (lp);
 	}
 	SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link);
 	lp->file = p;
 	lp->line = line;
 	lp->class = LOCK_CLASS(lo);
 	lp->name = lo->lo_name;
 	SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link);
 	return (lp);
 }
 
 static struct lock_profile_object *
 lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file,
     int line)
 {
 	struct lock_profile_object *l;
 	struct lock_prof_type *type;
 	struct lpohead *head;
 
 	head = &curthread->td_lprof[spin];
 	LIST_FOREACH(l, head, lpo_link)
 		if (l->lpo_obj == lo && l->lpo_file == file &&
 		    l->lpo_line == line)
 			return (l);
 	type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
 	l = LIST_FIRST(&type->lpt_lpoalloc);
 	if (l == NULL) {
 		lock_prof_rejected++;
 		return (NULL);
 	}
 	LIST_REMOVE(l, lpo_link);
 	l->lpo_obj = lo;
 	l->lpo_file = file;
 	l->lpo_line = line;
 	l->lpo_cnt = 0;
 	LIST_INSERT_HEAD(head, l, lpo_link);
 
 	return (l);
 }
 
 /*
  * Record a successful acquisition of lock 'lo' by the current thread.
  *
  * Nothing is recorded when profiling is disabled, the lock is flagged
  * LO_NOPROFILE, the acquisition is skipped by the skipcount sampling, or
  * the lock is a spin lock and lock_prof_skipspin is 1.
  */
 void
 lock_profile_obtain_lock_success(struct lock_object *lo, int contested,
     uint64_t waittime, const char *file, int line)
 {
 	static int lock_prof_count;
 	struct lock_profile_object *l;
 	int spin;
 
 	/* don't reset the timer when/if recursing */
 	if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE))
 		return;
 	/* Sampling: only every lock_prof_skipcount-th call gets past here. */
 	if (lock_prof_skipcount &&
 	    (++lock_prof_count % lock_prof_skipcount) != 0)
 		return;
 	spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0;
 	if (spin && lock_prof_skipspin == 1)
 		return;
 	critical_enter();
 	/* Recheck enabled now that we're in a critical section. */
 	if (lock_prof_enable == 0)
 		goto out;
 	l = lock_profile_object_lookup(lo, spin, file, line);
 	if (l == NULL)
 		goto out;
 	l->lpo_cnt++;
 	/* Already referenced: keep the timestamps of the first acquisition. */
 	if (++l->lpo_ref > 1)
 		goto out;
 	l->lpo_contest_locking = contested;
 	l->lpo_acqtime = nanoseconds(); 
 	/* presumably waittime is the timestamp at which waiting began; 0 means none -- confirm */
 	if (waittime && (l->lpo_acqtime > waittime))
 		l->lpo_waittime = l->lpo_acqtime - waittime;
 	else
 		l->lpo_waittime = 0;
 out:
 	critical_exit();
 }
 
 /*
  * Thread-exit hook: waits for any profiling reset to finish, then asserts
  * that the thread has no profiling objects left on either of its lists.
  * With INVARIANTS, the acquisition site of each leftover object is printed
  * first.
  */
 void
 lock_profile_thread_exit(struct thread *td)
 {
 #ifdef INVARIANTS
 	struct lock_profile_object *l;
 
 	MPASS(curthread->td_critnest == 0);
 #endif
 	/*
 	 * If lock profiling was disabled we have to wait for reset to
 	 * clear our pointers before we can exit safely.
 	 */
 	lock_prof_reset_wait();
 #ifdef INVARIANTS
 	/* Index 0 and 1 of td_lprof are the two lists selected by the spin flag. */
 	LIST_FOREACH(l, &td->td_lprof[0], lpo_link)
 		printf("thread still holds lock acquired at %s:%d\n",
 		    l->lpo_file, l->lpo_line);
 	LIST_FOREACH(l, &td->td_lprof[1], lpo_link)
 		printf("thread still holds lock acquired at %s:%d\n",
 		    l->lpo_file, l->lpo_line);
 #endif
 	MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL);
 	MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL);
 }
 
 /*
  * Record the release of lock 'lo' by the current thread.
  *
  * When the last reference to the thread's profiling object is dropped,
  * the hold and wait times are folded into the per-CPU statistics record
  * and the object is returned to the current CPU's free list.
  */
 void
 lock_profile_release_lock(struct lock_object *lo)
 {
 	struct lock_profile_object *l;
 	struct lock_prof_type *type;
 	struct lock_prof *lp;
 	uint64_t curtime, holdtime;
 	struct lpohead *head;
 	int spin;
 
 	if (lo->lo_flags & LO_NOPROFILE)
 		return;
 	spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0;
 	head = &curthread->td_lprof[spin];
 	/* Cheap exit: this thread has no profiled acquisitions outstanding. */
 	if (LIST_FIRST(head) == NULL)
 		return;
 	critical_enter();
 	/* Recheck enabled now that we're in a critical section. */
 	if (lock_prof_enable == 0 && lock_prof_resetting == 1)
 		goto out;
 	/*
 	 * If lock profiling is not enabled we still want to remove the
 	 * lpo from our queue.
 	 */
 	LIST_FOREACH(l, head, lpo_link)
 		if (l->lpo_obj == lo)
 			break;
 	if (l == NULL)
 		goto out;
 	/* Still referenced: statistics are recorded on the final release. */
 	if (--l->lpo_ref > 0)
 		goto out;
 	lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line);
 	if (lp == NULL)
 		goto release;
 	curtime = nanoseconds();
 	/* Skip the sample if the clock reads earlier than the acquisition. */
 	if (curtime < l->lpo_acqtime)
 		goto release;
 	holdtime = curtime - l->lpo_acqtime;
 
 	/*
 	 * Record if the lock has been held longer now than ever
 	 * before.
 	 */
 	if (holdtime > lp->cnt_max)
 		lp->cnt_max = holdtime;
 	if (l->lpo_waittime > lp->cnt_wait_max)
 		lp->cnt_wait_max = l->lpo_waittime;
 	lp->cnt_tot += holdtime;
 	lp->cnt_wait += l->lpo_waittime;
 	lp->cnt_contest_locking += l->lpo_contest_locking;
 	lp->cnt_cur += l->lpo_cnt;
 release:
 	/* Hand the object back to the free list of the CPU we are running on. */
 	LIST_REMOVE(l, lpo_link);
 	type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
 	LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link);
 out:
 	critical_exit();
 }
 
 /*
  * debug.lock.prof sysctl tree: the skipspin/skipcount tunables, the
  * read-only rejected-record counter, and the stats, reset and enable
  * nodes served by the handlers defined above.
  */
 SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging");
 SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling");
 SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW,
     &lock_prof_skipspin, 0, "Skip profiling on spinlocks.");
 SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW,
     &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions.");
 SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD,
     &lock_prof_rejected, 0, "Number of rejected profiling records");
 SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics");
 SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics");
 SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, enable_lock_prof, "I", "Enable lock profiling");
 
 #endif
Index: head/sys/kern/subr_sbuf.c
===================================================================
--- head/sys/kern/subr_sbuf.c	(revision 212369)
+++ head/sys/kern/subr_sbuf.c	(revision 212370)
@@ -1,738 +1,738 @@
 /*-
  * Copyright (c) 2000-2008 Poul-Henning Kamp
  * Copyright (c) 2000-2008 Dag-Erling Coïdan Smørgrav
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include <sys/ctype.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <machine/stdarg.h>
 #else /* _KERNEL */
 #include <ctype.h>
 #include <errno.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #endif /* _KERNEL */
 
 #include <sys/sbuf.h>
 
 /*
  * Allocation wrappers.  The kernel build allocates from the M_SBUF malloc
  * type with M_WAITOK; the userland build uses plain malloc()/free() and
  * compiles KASSERT() away.
  */
 #ifdef _KERNEL
 static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers");
 #define	SBMALLOC(size)		malloc(size, M_SBUF, M_WAITOK)
 #define	SBFREE(buf)		free(buf, M_SBUF)
 #else /* _KERNEL */
 #define	KASSERT(e, m)
 #define	SBMALLOC(size)		malloc(size)
 #define	SBFREE(buf)		free(buf)
 #endif /* _KERNEL */
 
 /*
  * Predicates
  *
  * SBUF_HASROOM() and SBUF_FREESPACE() keep one byte of s_size in reserve
  * (the "- 1"); sbuf_finish() stores the terminating NUL at s_buf[s_len].
  */
 #define	SBUF_ISDYNAMIC(s)	((s)->s_flags & SBUF_DYNAMIC)
 #define	SBUF_ISDYNSTRUCT(s)	((s)->s_flags & SBUF_DYNSTRUCT)
 #define	SBUF_ISFINISHED(s)	((s)->s_flags & SBUF_FINISHED)
 #define	SBUF_HASOVERFLOWED(s)	((s)->s_flags & SBUF_OVERFLOWED)
 #define	SBUF_HASROOM(s)		((s)->s_len < (s)->s_size - 1)
 #define	SBUF_FREESPACE(s)	((s)->s_size - (s)->s_len - 1)
 #define	SBUF_CANEXTEND(s)	((s)->s_flags & SBUF_AUTOEXTEND)
 
 /*
  * Set / clear flags
  */
 #define	SBUF_SETFLAG(s, f)	do { (s)->s_flags |= (f); } while (0)
 #define	SBUF_CLEARFLAG(s, f)	do { (s)->s_flags &= ~(f); } while (0)
 
 /* Sizing parameters used by sbuf_extendsize(). */
 #define	SBUF_MINEXTENDSIZE	16		/* Should be power of 2. */
 #define	SBUF_MAXEXTENDSIZE	PAGE_SIZE
 #define	SBUF_MAXEXTENDINCR	PAGE_SIZE
 
 /*
  * Debugging support
  */
 #if defined(_KERNEL) && defined(INVARIANTS)
 
 /*
  * Panic (via KASSERT) if 's' is NULL, has no buffer, or has a length that
  * is not strictly below its size.  'fun' is the caller's name and is used
  * in the messages.
  */
 static void
 _assert_sbuf_integrity(const char *fun, struct sbuf *s)
 {
 
 	KASSERT(s != NULL,
 	    ("%s called with a NULL sbuf pointer", fun));
 	KASSERT(s->s_buf != NULL,
 	    ("%s called with uninitialized or corrupt sbuf", fun));
 	KASSERT(s->s_len < s->s_size,
 	    ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
 }
 
 /*
  * Panic (via KASSERT) unless the sbuf's SBUF_FINISHED bit equals 'state':
  * pass 0 to require an unfinished sbuf, SBUF_FINISHED to require a
  * finished one.  'fun' is the caller's name and is used in the message.
  */
 static void
 _assert_sbuf_state(const char *fun, struct sbuf *s, int state)
 {
 
 	KASSERT((s->s_flags & SBUF_FINISHED) == state,
 	    ("%s called with %sfinished or corrupt sbuf", fun,
 	    (state ? "un" : "")));
 }
 
 #define	assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s))
 #define	assert_sbuf_state(s, i)	 _assert_sbuf_state(__func__, (s), (i))
 
 #else /* _KERNEL && INVARIANTS */
 
 #define	assert_sbuf_integrity(s) do { } while (0)
 #define	assert_sbuf_state(s, i)	 do { } while (0)
 
 #endif /* _KERNEL && INVARIANTS */
 
 #ifdef CTASSERT
 CTASSERT(powerof2(SBUF_MAXEXTENDSIZE));
 CTASSERT(powerof2(SBUF_MAXEXTENDINCR));
 #endif
 
 /*
  * Compute the allocation size to use for a buffer that must hold at least
  * 'size' bytes.
  */
 static int
 sbuf_extendsize(int size)
 {
 	int rounded;
 
 	if (size >= (int)SBUF_MAXEXTENDSIZE) {
 		/* Large requests are rounded up to the extend increment. */
 		rounded = roundup2(size, SBUF_MAXEXTENDINCR);
 	} else {
 		/* Small requests start at the minimum and keep doubling. */
 		for (rounded = SBUF_MINEXTENDSIZE; rounded < size;
 		    rounded *= 2)
 			continue;
 	}
 	KASSERT(rounded >= size, ("%s: %d < %d\n", __func__, rounded, size));
 	return (rounded);
 }
 
 
 /*
  * Grow the buffer of an auto-extending sbuf by at least 'addlen' bytes.
  * Returns 0 on success, or -1 when the sbuf may not be extended or the
  * allocation fails.
  */
 static int
 sbuf_extend(struct sbuf *s, int addlen)
 {
 	char *grown;
 	int grownsize;
 
 	if (!SBUF_CANEXTEND(s))
 		return (-1);
 	grownsize = sbuf_extendsize(s->s_size + addlen);
 	grown = SBMALLOC(grownsize);
 	if (grown == NULL)
 		return (-1);
 	bcopy(s->s_buf, grown, s->s_size);
 	/*
 	 * A caller-supplied buffer is left alone; from now on the sbuf
 	 * owns its storage.
 	 */
 	if (!SBUF_ISDYNAMIC(s))
 		SBUF_SETFLAG(s, SBUF_DYNAMIC);
 	else
 		SBFREE(s->s_buf);
 	s->s_size = grownsize;
 	s->s_buf = grown;
 	return (0);
 }
 
 /*
  * Initialize an sbuf.
  * If s is NULL the structure itself is allocated.  If buf is non-NULL, it
  * points to a static or already-allocated string big enough to hold at
  * least length characters; otherwise a buffer is allocated.
  * Returns the sbuf, or NULL if an allocation fails.
  */
 struct sbuf *
 sbuf_new(struct sbuf *s, char *buf, int length, int flags)
 {
 	int dynstruct;
 
 	KASSERT(length >= 0,
 	    ("attempt to create an sbuf of negative length (%d)", length));
 	KASSERT((flags & ~SBUF_USRFLAGMSK) == 0,
 	    ("%s called with invalid flags", __func__));
 
 	flags &= SBUF_USRFLAGMSK;
 	dynstruct = (s == NULL);
 	if (dynstruct) {
 		s = SBMALLOC(sizeof(*s));
 		if (s == NULL)
 			return (NULL);
 	}
 	bzero(s, sizeof(*s));
 	s->s_flags = flags;
 	if (dynstruct)
 		SBUF_SETFLAG(s, SBUF_DYNSTRUCT);
 	s->s_size = length;
 
 	/* Caller-provided storage is used as is. */
 	if (buf != NULL) {
 		s->s_buf = buf;
 		return (s);
 	}
 
 	if ((flags & SBUF_AUTOEXTEND) != 0)
 		s->s_size = sbuf_extendsize(s->s_size);
 	s->s_buf = SBMALLOC(s->s_size);
 	if (s->s_buf != NULL) {
 		SBUF_SETFLAG(s, SBUF_DYNAMIC);
 		return (s);
 	}
 	if (SBUF_ISDYNSTRUCT(s))
 		SBFREE(s);
 	return (NULL);
 }
 
 #ifdef _KERNEL
 /*
  * Create an sbuf with uio data
  *
  * The sbuf is created with a length of uio->uio_resid + 1 and filled with
  * uiomove().  On failure NULL is returned and *error holds ENOMEM or the
  * uiomove() error; on success the sbuf is returned and *error is 0.
  */
 struct sbuf *
 sbuf_uionew(struct sbuf *s, struct uio *uio, int *error)
 {
 
 	KASSERT(uio != NULL,
 	    ("%s called with NULL uio pointer", __func__));
 	KASSERT(error != NULL,
 	    ("%s called with NULL error pointer", __func__));
 
 	s = sbuf_new(s, NULL, uio->uio_resid + 1, 0);
 	if (s == NULL) {
 		*error = ENOMEM;
 		return (NULL);
 	}
 	*error = uiomove(s->s_buf, uio->uio_resid, uio);
 	if (*error != 0) {
 		sbuf_delete(s);
 		return (NULL);
 	}
 	/* s_size was set to the requested length, leaving one byte spare. */
 	s->s_len = s->s_size - 1;
 	*error = 0;
 	return (s);
 }
 #endif
 
 /*
  * Reset an sbuf to the empty, unfinished, error-free state.  The buffer
  * itself is kept.
  */
 void
 sbuf_clear(struct sbuf *s)
 {
 
 	assert_sbuf_integrity(s);
 	/* Accepted on finished and unfinished sbufs alike. */
 
 	s->s_len = 0;
 	s->s_error = 0;
 	SBUF_CLEARFLAG(s, SBUF_FINISHED | SBUF_OVERFLOWED);
 }
 
 /*
  * Truncate an sbuf by moving its end position back to 'pos'.
  * Returns 0 on success, or -1 if 'pos' is negative or lies beyond the
  * current length.
  */
 int
 sbuf_setpos(struct sbuf *s, int pos)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	KASSERT(pos >= 0,
 	    ("attempt to seek to a negative position (%d)", pos));
 	KASSERT(pos < s->s_size,
 	    ("attempt to seek past end of sbuf (%d >= %d)", pos, s->s_size));
 
 	if (pos >= 0 && pos <= s->s_len) {
 		s->s_len = pos;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Set up a drain function and argument on an sbuf to flush data to
  * when the sbuf buffer overflows.
  *
  * 'func' is later called with 'ctx', the buffer and its current length.
  * The drain function may only be changed while the sbuf is empty.
  */
 void
 sbuf_set_drain(struct sbuf *s, sbuf_drain_func *func, void *ctx)
 {
 
 	assert_sbuf_state(s, 0);
 	assert_sbuf_integrity(s);
 	KASSERT(func == s->s_drain_func || s->s_len == 0,
 	    ("Cannot change drain to %p on non-empty sbuf %p", func, s));
 	s->s_drain_func = func;
 	s->s_drain_arg = ctx;
 }
 
 /*
  * Call the drain and process the return.
  *
  * A negative return from the drain function is stored (negated) in
  * s_error, marks the sbuf as overflowed and is returned to the caller.
  * Otherwise the returned byte count is removed from the front of the
  * buffer and 0 is returned.
  */
 static int
 sbuf_drain(struct sbuf *s)
 {
 	int len;
 
 	KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s));
 	len = s->s_drain_func(s->s_drain_arg, s->s_buf, s->s_len);
 	if (len < 0) {
 		s->s_error = -len;
 		SBUF_SETFLAG(s, SBUF_OVERFLOWED);
 		return (s->s_error);
 	}
-
-	KASSERT(len > 0, ("Drain must either error or work!"));
+	KASSERT(len > 0 && len <= s->s_len,
+	    ("Bad drain amount %d for sbuf %p", len, s));
 	s->s_len -= len;
 	/*
 	 * Fast path for the expected case where all the data was
 	 * drained.
 	 */
 	if (s->s_len == 0)
 		return (0);
 	/*
 	 * Move the remaining characters to the beginning of the
 	 * string.
 	 */
 	memmove(s->s_buf, s->s_buf + len, s->s_len);
 	return (0);
 }
 
 /*
  * Append one byte to an sbuf.  All appending goes through here, so this
  * is where a full buffer is either drained or extended and where overflow
  * gets flagged.  Nothing is appended once the sbuf has overflowed.
  */
 static void
 sbuf_put_byte(int c, struct sbuf *s)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	if (SBUF_HASOVERFLOWED(s))
 		return;
 	if (SBUF_FREESPACE(s) <= 0) {
 		/* Make room: a drain, when set, takes priority over growing. */
 		if (s->s_drain_func == NULL) {
 			if (sbuf_extend(s, 1) < 0)
 				SBUF_SETFLAG(s, SBUF_OVERFLOWED);
 		} else {
 			(void)sbuf_drain(s);
 		}
 		if (SBUF_HASOVERFLOWED(s))
 			return;
 	}
 	s->s_buf[s->s_len] = c;
 	s->s_len++;
 }
 
 /*
  * kvprintf(9)-compatible output callback: appends 'c' to the sbuf passed
  * as 'arg', silently dropping NUL characters.
  */
 static void
 sbuf_putc_func(int c, void *arg)
 {
 
 	if (c == '\0')
 		return;
 	sbuf_put_byte(c, arg);
 }
 
 /*
  * Append 'len' bytes starting at 'buf' to an sbuf.
  * Returns 0 on success, or -1 if the sbuf has overflowed.
  */
 int
 sbuf_bcat(struct sbuf *s, const void *buf, size_t len)
 {
 	const char *bytes = buf;
 	size_t i;
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 	for (i = 0; i < len; i++) {
 		sbuf_put_byte(bytes[i], s);
 		if (SBUF_HASOVERFLOWED(s))
 			return (-1);
 	}
 	return (0);
 }
 
 #ifdef _KERNEL
 /*
  * Copy a byte string from userland into an sbuf.
  *
  * Returns 0 on success, or -1 if the sbuf has already overflowed or
  * copyin() fails.  Not usable on an sbuf with a drain.
  *
  * NOTE(review): when the buffer cannot be extended far enough, 'len' is
  * silently reduced to the free space and 0 is still returned, without
  * flagging overflow -- confirm callers expect this.
  */
 int
 sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 	KASSERT(s->s_drain_func == NULL,
 	    ("Nonsensical copyin to sbuf %p with a drain", s));
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 	if (len == 0)
 		return (0);
 	if (len > SBUF_FREESPACE(s)) {
 		/* The result is checked through the free space, not the return. */
 		sbuf_extend(s, len - SBUF_FREESPACE(s));
 		if (SBUF_FREESPACE(s) < len)
 			len = SBUF_FREESPACE(s);
 	}
 	if (copyin(uaddr, s->s_buf + s->s_len, len) != 0)
 		return (-1);
 	s->s_len += len;
 
 	return (0);
 }
 #endif
 
 /*
  * Copy a byte string into an sbuf.
  *
  * Any existing contents are discarded first (sbuf_clear()); the return
  * value is that of sbuf_bcat().
  */
 int
 sbuf_bcpy(struct sbuf *s, const void *buf, size_t len)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	sbuf_clear(s);
 	return (sbuf_bcat(s, buf, len));
 }
 
 /*
  * Append a NUL-terminated string to an sbuf.
  *
  * Returns 0 on success, or -1 if the sbuf had already overflowed or
  * overflows while the string is being appended.
  */
 int
 sbuf_cat(struct sbuf *s, const char *str)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 
 	/*
 	 * Consume one source byte per iteration.  The pointer must be
 	 * advanced here, otherwise the first byte is appended over and
 	 * over until the sbuf overflows.
 	 */
 	while (*str != '\0') {
 		sbuf_put_byte(*str++, s);
 		if (SBUF_HASOVERFLOWED(s))
 			return (-1);
 	}
 	return (0);
 }
 
 #ifdef _KERNEL
 /*
  * Append a string from userland to an sbuf.
  *
  * A 'len' of 0 means "as much as currently fits".  Returns the 'done'
  * count reported by copyinstr() on success or truncation (truncation also
  * flags the sbuf as overflowed), or -1 if the sbuf has already overflowed
  * or copyinstr() fails for another reason.  Not usable on an sbuf with a
  * drain.
  */
 int
 sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len)
 {
 	size_t done;
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 	KASSERT(s->s_drain_func == NULL,
 	    ("Nonsensical copyin to sbuf %p with a drain", s));
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 
 	if (len == 0)
 		len = SBUF_FREESPACE(s);	/* XXX return 0? */
 	if (len > SBUF_FREESPACE(s)) {
 		/* If extending fails, fall back to whatever space is left. */
 		sbuf_extend(s, len);
 		if (SBUF_FREESPACE(s) < len)
 			len = SBUF_FREESPACE(s);
 	}
 	/* len + 1: the reserved last byte of the buffer takes the NUL. */
 	switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) {
 	case ENAMETOOLONG:
 		SBUF_SETFLAG(s, SBUF_OVERFLOWED);
 		/* fall through */
 	case 0:
 		/* presumably 'done' counts the terminating NUL -- confirm */
 		s->s_len += done - 1;
 		break;
 	default:
 		return (-1);	/* XXX */
 	}
 
 	return (done);
 }
 #endif
 
 /*
  * Copy a string into an sbuf.
  *
  * Any existing contents are discarded first (sbuf_clear()); the return
  * value is that of sbuf_cat().
  */
 int
 sbuf_cpy(struct sbuf *s, const char *str)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	sbuf_clear(s);
 	return (sbuf_cat(s, str));
 }
 
 /*
  * Format the given argument list and append the resulting string to an sbuf.
  */
 #ifdef _KERNEL
 /*
  * Kernel variant: kvprintf(9) feeds the formatted output one character
  * at a time through sbuf_putc_func().  Returns 0 on success, or -1 if
  * the sbuf has overflowed.
  */
 int
 sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	KASSERT(fmt != NULL,
 	    ("%s called with a NULL format string", __func__));
 
 	(void)kvprintf(fmt, sbuf_putc_func, s, 10, ap);
 	return (SBUF_HASOVERFLOWED(s) ? -1 : 0);
 }
 #else /* !_KERNEL */
 /*
  * Userland variant: formats with vsnprintf(3) directly into the buffer,
  * draining or extending the sbuf and retrying until the output fits.
  * Returns 0 on success, or -1 if the sbuf has overflowed or vsnprintf(3)
  * reports an error.
  */
 int
 sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
 {
 	va_list ap_copy;
 	int error, len;
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	KASSERT(fmt != NULL,
 	    ("%s called with a NULL format string", __func__));
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 
 	/*
 	 * For the moment, there is no way to get vsnprintf(3) to hand
 	 * back a character at a time, to push everything into
 	 * sbuf_putc_func() as was done for the kernel.
 	 *
 	 * In userspace, while drains are useful, there's generally
 	 * not a problem attempting to malloc(3) on out of space.  So
 	 * expand a userland sbuf if there is not enough room for the
 	 * data produced by sbuf_[v]printf(3).
 	 */
 
 	error = 0;
 	do {
 		va_copy(ap_copy, ap);
 		len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1,
 		    fmt, ap_copy);
 		va_end(ap_copy);
 
 		/*
 		 * vsnprintf(3) signals an output error with a negative
 		 * return.  Record it and bail out instead of letting the
 		 * negative value reach the s_len arithmetic below.
 		 */
 		if (len < 0) {
 			s->s_error = errno;
 			SBUF_SETFLAG(s, SBUF_OVERFLOWED);
 			return (-1);
 		}
 		if (SBUF_FREESPACE(s) >= len)
 			break;
 		/* Cannot print with the current available space. */
 		if (s->s_drain_func != NULL && s->s_len > 0)
 			error = sbuf_drain(s);
 		else
 			error = sbuf_extend(s, len - SBUF_FREESPACE(s));
 	} while (error == 0);
 
 	/*
 	 * s->s_len is the length of the string, without the terminating nul.
 	 * When updating s->s_len, we must subtract 1 from the length that
 	 * we passed into vsnprintf() because that length includes the
 	 * terminating nul.
 	 *
 	 * vsnprintf() returns the amount that would have been copied,
 	 * given sufficient space, so don't over-increment s_len.
 	 */
 	if (SBUF_FREESPACE(s) < len)
 		len = SBUF_FREESPACE(s);
 	s->s_len += len;
 	if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s))
 		SBUF_SETFLAG(s, SBUF_OVERFLOWED);
 
 	KASSERT(s->s_len < s->s_size,
 	    ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 	return (0);
 }
 #endif /* _KERNEL */
 
 /*
  * printf-style front end to sbuf_vprintf(): formats the arguments and
  * appends the result to the sbuf.  Returns whatever sbuf_vprintf()
  * returns.
  */
 int
 sbuf_printf(struct sbuf *s, const char *fmt, ...)
 {
 	va_list args;
 	int rc;
 
 	va_start(args, fmt);
 	rc = sbuf_vprintf(s, fmt, args);
 	va_end(args);
 	return (rc);
 }
 
 /*
  * Append a single character to an sbuf; a NUL character is ignored.
  * Returns 0 on success, or -1 if the sbuf has overflowed.
  */
 int
 sbuf_putc(struct sbuf *s, int c)
 {
 
 	sbuf_putc_func(c, s);
 	return (SBUF_HASOVERFLOWED(s) ? -1 : 0);
 }
 
 /*
  * Drop trailing whitespace from an sbuf.
  * Returns 0 on success, or -1 if the sbuf has overflowed.  Not usable on
  * an sbuf with a drain.
  */
 int
 sbuf_trim(struct sbuf *s)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 	KASSERT(s->s_drain_func == NULL,
 	    ("%s makes no sense on sbuf %p with drain", __func__, s));
 
 	if (SBUF_HASOVERFLOWED(s))
 		return (-1);
 
 	/* Step back from the end until a non-space character is found. */
 	for (; s->s_len > 0; s->s_len--) {
 		if (!isspace(s->s_buf[s->s_len - 1]))
 			break;
 	}
 
 	return (0);
 }
 
 /*
  * Check if an sbuf overflowed
  *
  * Returns non-zero when the SBUF_OVERFLOWED flag is set, 0 otherwise.
  */
 int
 sbuf_overflowed(struct sbuf *s)
 {
 
 	return (SBUF_HASOVERFLOWED(s));
 }
 
 /*
  * Finish off an sbuf: flush whatever is still buffered through the drain
  * (if one is set), NUL-terminate the buffer and mark the sbuf finished.
  *
  * In the kernel the error number is returned directly (0 on success).
  * In userland 0 is returned on success; on failure errno is set and -1 is
  * returned.
  */
 int
 sbuf_finish(struct sbuf *s)
 {
 	int error = 0;
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, 0);
 
 	if (s->s_drain_func != NULL) {
 		/* Report an earlier drain failure, else flush the remainder. */
 		error = s->s_error;
 		while (s->s_len > 0 && error == 0)
 			error = sbuf_drain(s);
 	} else if (SBUF_HASOVERFLOWED(s))
 		error = ENOMEM;
 	s->s_buf[s->s_len] = '\0';
 	SBUF_CLEARFLAG(s, SBUF_OVERFLOWED);
 	SBUF_SETFLAG(s, SBUF_FINISHED);
 #ifdef _KERNEL
 	return (error);
 #else
 	/* Only report failure (and touch errno) when something went wrong. */
 	if (error != 0) {
 		errno = error;
 		return (-1);
 	}
 	return (0);
 #endif
 }
 
 /*
  * Return a pointer to the sbuf data.
  *
  * The sbuf must be finished and must not have a drain; both conditions
  * are asserted.
  */
 char *
 sbuf_data(struct sbuf *s)
 {
 
 	assert_sbuf_integrity(s);
 	assert_sbuf_state(s, SBUF_FINISHED);
 	KASSERT(s->s_drain_func == NULL,
 	    ("%s makes no sense on sbuf %p with drain", __func__, s));
 
 	return (s->s_buf);
 }
 
 /*
  * Report how many bytes the sbuf currently holds, or -1 if it has
  * overflowed.  Not usable on an sbuf with a drain.
  */
 int
 sbuf_len(struct sbuf *s)
 {
 
 	assert_sbuf_integrity(s);
 	/* Accepted on finished and unfinished sbufs alike. */
 	KASSERT(s->s_drain_func == NULL,
 	    ("%s makes no sense on sbuf %p with drain", __func__, s));
 
 	return (SBUF_HASOVERFLOWED(s) ? -1 : s->s_len);
 }
 
 /*
  * Clear an sbuf, free its buffer if necessary.
  *
  * The buffer is freed only when the sbuf allocated it (SBUF_DYNAMIC); the
  * structure is zeroed and, when it was allocated by sbuf_new()
  * (SBUF_DYNSTRUCT), freed as well.
  */
 void
 sbuf_delete(struct sbuf *s)
 {
 	int isdyn;
 
 	assert_sbuf_integrity(s);
 	/* don't care if it's finished or not */
 
 	if (SBUF_ISDYNAMIC(s))
 		SBFREE(s->s_buf);
 	/* Sample the flag now: the bzero() below wipes s_flags. */
 	isdyn = SBUF_ISDYNSTRUCT(s);
 	bzero(s, sizeof(*s));
 	if (isdyn)
 		SBFREE(s);
 }
 
 /*
  * Check if an sbuf has been finished.
  *
  * Returns non-zero when the SBUF_FINISHED flag is set, 0 otherwise.
  */
 int
 sbuf_done(struct sbuf *s)
 {
 
 	return (SBUF_ISFINISHED(s));
 }
Index: head/sys/kern/subr_sleepqueue.c
===================================================================
--- head/sys/kern/subr_sleepqueue.c	(revision 212369)
+++ head/sys/kern/subr_sleepqueue.c	(revision 212370)
@@ -1,1235 +1,1227 @@
 /*-
  * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Implementation of sleep queues used to hold queue of threads blocked on
  * a wait channel.  Sleep queues differ from turnstiles in that wait
  * channels are not owned by anyone, so there is no priority propagation.
  * Sleep queues can also provide a timeout and can also be interrupted by
  * signals.  That said, there are several similarities between the turnstile
  * and sleep queue implementations.  (Note: turnstiles were implemented
  * first.)  For example, both use a hash table of the same size where each
  * bucket is referred to as a "chain" that contains both a spin lock and
  * a linked list of queues.  An individual queue is located by using a hash
  * to pick a chain, locking the chain, and then walking the chain searching
  * for the queue.  This means that a wait channel object does not need to
  * embed its queue head just as locks do not embed their turnstile queue
  * head.  Threads also carry around a sleep queue that they lend to the
  * wait channel when blocking.  Just as in turnstiles, the queue includes
  * a free list of the sleep queues of other threads blocked on the same
  * wait channel in the case of multiple waiters.
  *
  * Some additional functionality provided by sleep queues includes the
  * ability to set a timeout.  The timeout is managed using a per-thread
  * callout that resumes a thread if it is asleep.  A thread may also
  * catch signals while it is asleep (aka an interruptible sleep).  The
  * signal code uses sleepq_abort() to interrupt a sleeping thread.  Finally,
  * sleep queues also provide some extra assertions.  One is not allowed to
  * mix the sleep/wakeup and cv APIs for a given wait channel.  Also, one
  * must consistently use the same lock to synchronize with a wait channel,
  * though this check is currently only a warning for sleep/wakeup due to
  * pre-existing abuse of that API.  The same lock must also be held when
  * awakening threads, though that is currently only enforced for condition
  * variables.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sleepqueue_profiling.h"
 #include "opt_ddb.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /*
  * Constants for the hash table of sleep queue chains.  These constants are
  * the same ones that 4BSD (and possibly earlier versions of BSD) used.
  * Basically, we ignore the lower 8 bits of the address since most wait
  * channel pointers are aligned and only look at the next 7 bits for the
  * hash.  SC_TABLESIZE must be a power of two for SC_MASK to work properly.
  *
  * SC_LOOKUP() yields a pointer to the chain a wait channel hashes to.  Its
  * expansion is parenthesized so that it can be used as an operand of a
  * larger expression (for example followed by "->").
  *
  * NR_SLEEPQS is the number of blocked-thread queues kept by each sleep
  * queue, i.e. the dimension of sq_blocked[] and sq_blockedcnt[].
  */
 #define	SC_TABLESIZE	128			/* Must be power of 2. */
 #define	SC_MASK		(SC_TABLESIZE - 1)
 #define	SC_SHIFT	8
 #define	SC_HASH(wc)	(((uintptr_t)(wc) >> SC_SHIFT) & SC_MASK)
 #define	SC_LOOKUP(wc)	(&sleepq_chains[SC_HASH(wc)])
 #define NR_SLEEPQS      2
 /*
  * There are two different lists of sleep queues.  Both lists are connected
  * via the sq_hash entries.  The first list is the sleep queue chain list
  * that a sleep queue is on when it is attached to a wait channel.  The
  * second list is the free list hung off of a sleep queue that is attached
  * to a wait channel.
  *
  * Each sleep queue also contains the wait channel it is attached to, the
  * list of threads blocked on that wait channel, flags specific to the
  * wait channel, and the lock used to synchronize with a wait channel.
  * The flags are used to catch mismatches between the various consumers
  * of the sleep queue API (e.g. sleep/wakeup and condition variables).
  * The lock pointer is only used when invariants are enabled for various
  * debugging checks.
  *
  * sq_blocked[] and sq_blockedcnt[] are indexed by the queue number that
  * is passed to sleepq_add(); sq_type holds the SLEEPQ_TYPE bits of the
  * flags supplied when the queue was attached to its wait channel.
  *
  * Locking key:
  *  c - sleep queue chain lock
  */
 struct sleepqueue {
 	TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS];	/* (c) Blocked threads. */
 	u_int sq_blockedcnt[NR_SLEEPQS];	/* (c) N. of blocked threads. */
 	LIST_ENTRY(sleepqueue) sq_hash;		/* (c) Chain and free list. */
 	LIST_HEAD(, sleepqueue) sq_free;	/* (c) Free queues. */
 	void	*sq_wchan;			/* (c) Wait channel. */
 	int	sq_type;			/* (c) Queue type. */
 #ifdef INVARIANTS
 	struct lock_object *sq_lock;		/* (c) Associated lock. */
 #endif
 };
 
 /*
  * One bucket of the sleep queue hash table: the sleep queues whose wait
  * channels hash to this bucket plus the spin mutex guarding them.  The
  * depth counters exist only with SLEEPQUEUE_PROFILING; sc_depth is bumped
  * when a queue is attached in sleepq_add() and dropped again when the last
  * waiter is removed in sleepq_resume_thread().
  */
 struct sleepqueue_chain {
 	LIST_HEAD(, sleepqueue) sc_queues;	/* List of sleep queues. */
 	struct mtx sc_lock;			/* Spin lock for this chain. */
 #ifdef SLEEPQUEUE_PROFILING
 	u_int	sc_depth;			/* Length of sc_queues. */
 	u_int	sc_max_depth;			/* Max length of sc_queues. */
 #endif
 };
 
 #ifdef SLEEPQUEUE_PROFILING
 /* Largest sc_max_depth seen on any chain; updated in sleepq_add(). */
 u_int sleepq_max_depth;
 SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
 SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
     "sleepq chain stats");
 SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
     0, "maxmimum depth achieved of a single chain");
 
 static void	sleepq_profile(const char *wmesg);
 /* Non-zero while wait message profiling is switched on. */
 static int	prof_enabled;
 #endif
 /* Hash table of chains, indexed by SC_HASH() of the wait channel. */
 static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
 /* UMA zone that sleep queues are allocated from; see init_sleepqueues(). */
 static uma_zone_t sleepq_zone;
 
 /*
  * Prototypes for non-exported routines.
  */
 static int	sleepq_catch_signals(void *wchan, int pri);
 static int	sleepq_check_signals(void);
 static int	sleepq_check_timeout(void);
 #ifdef INVARIANTS
 static void	sleepq_dtor(void *mem, int size, void *arg);
 #endif
 static int	sleepq_init(void *mem, int size, int flags);
 static int	sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
 		    int pri);
 static void	sleepq_switch(void *wchan, int pri);
 static void	sleepq_timeout(void *arg);
 
 /*
  * Early initialization of sleep queues that is called from the sleepinit()
  * SYSINIT.
  */
 void
 init_sleepqueues(void)
 {
 #ifdef SLEEPQUEUE_PROFILING
 	struct sysctl_oid *chain_oid;
 	char chain_name[10];
 #endif
 	int i;
 
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		LIST_INIT(&sleepq_chains[i].sc_queues);
 		mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
 		    MTX_SPIN | MTX_RECURSE);
 #ifdef SLEEPQUEUE_PROFILING
 		snprintf(chain_name, sizeof(chain_name), "%d", i);
 		chain_oid = SYSCTL_ADD_NODE(NULL, 
 		    SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
 		    chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
 		    NULL);
 #endif
 	}
 	sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
 #ifdef INVARIANTS
 	    NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
 #else
 	    NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
 #endif
 	
 	thread0.td_sleepqueue = sleepq_alloc();
 }
 
 /*
  * Allocate a sleep queue for a new thread from the sleep queue zone.
  * The allocation is made with M_WAITOK.
  */
 struct sleepqueue *
 sleepq_alloc(void)
 {
 	struct sleepqueue *sq;
 
 	sq = uma_zalloc(sleepq_zone, M_WAITOK);
 	return (sq);
 }
 
 /*
  * Free a sleep queue when a thread is destroyed.  The queue is returned
  * to the UMA zone it was allocated from by sleepq_alloc().
  */
 void
 sleepq_free(struct sleepqueue *sq)
 {
 
 	uma_zfree(sleepq_zone, sq);
 }
 
 /*
  * Acquire the spin lock of the sleep queue chain that the given wait
  * channel hashes to.
  */
 void
 sleepq_lock(void *wchan)
 {
 	struct sleepqueue_chain *chain;
 
 	chain = SC_LOOKUP(wchan);
 	mtx_lock_spin(&chain->sc_lock);
 }
 
 /*
  * Look up the sleep queue associated with a given wait channel in the hash
  * table locking the associated sleep queue chain.  If no queue is found in
  * the table, NULL is returned.
  */
 struct sleepqueue *
 sleepq_lookup(void *wchan)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
 		if (sq->sq_wchan == wchan)
 			return (sq);
 	return (NULL);
 }
 
 /*
  * Drop the spin lock of the sleep queue chain that the given wait
  * channel hashes to.
  */
 void
 sleepq_release(void *wchan)
 {
 	struct sleepqueue_chain *chain;
 
 	chain = SC_LOOKUP(wchan);
 	mtx_unlock_spin(&chain->sc_lock);
 }
 
 /*
  * Places the current thread on the sleep queue for the specified wait
  * channel.  If INVARIANTS is enabled, then it associates the passed in
  * lock with the sleepq to make sure it is held when that sleep queue is
  * woken up.
  *
  * The chain lock for 'wchan' must be held by the caller (asserted).
  * 'wmesg' is stored in the thread as its wait message, 'flags' supplies
  * the queue type (SLEEPQ_TYPE bits) and the SLEEPQ_INTERRUPTIBLE /
  * SLEEPQ_STOP_ON_BDRY options, and 'queue' selects which of the
  * NR_SLEEPQS blocked lists the thread is appended to.
  */
 void
 sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
     int queue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(td->td_sleepqueue != NULL);
 	MPASS(wchan != NULL);
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 
 	/* If this thread is not allowed to sleep, die a horrible death. */
 	KASSERT(!(td->td_pflags & TDP_NOSLEEPING),
 	    ("Trying sleep, but thread marked as sleeping prohibited"));
 
 	/* Look up the sleep queue associated with the wait channel 'wchan'. */
 	sq = sleepq_lookup(wchan);
 
 	/*
 	 * If the wait channel does not already have a sleep queue, use
 	 * this thread's sleep queue.  Otherwise, insert the current thread
 	 * into the sleep queue already in use by this wait channel.
 	 */
 	if (sq == NULL) {
 #ifdef INVARIANTS
 		int i;
 
 		/* The donated queue must be completely idle. */
 		sq = td->td_sleepqueue;
 		for (i = 0; i < NR_SLEEPQS; i++) {
 			KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
 			    ("thread's sleep queue %d is not empty", i));
 			KASSERT(sq->sq_blockedcnt[i] == 0,
 			    ("thread's sleep queue %d count mismatches", i));
 		}
 		KASSERT(LIST_EMPTY(&sq->sq_free),
 		    ("thread's sleep queue has a non-empty free list"));
 		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
 		sq->sq_lock = lock;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		/* One more queue on this chain; track the high-water marks. */
 		sc->sc_depth++;
 		if (sc->sc_depth > sc->sc_max_depth) {
 			sc->sc_max_depth = sc->sc_depth;
 			if (sc->sc_max_depth > sleepq_max_depth)
 				sleepq_max_depth = sc->sc_max_depth;
 		}
 #endif
 		/* Needed here as well since the INVARIANTS block is optional. */
 		sq = td->td_sleepqueue;
 		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
 		sq->sq_wchan = wchan;
 		sq->sq_type = flags & SLEEPQ_TYPE;
 	} else {
 		MPASS(wchan == sq->sq_wchan);
 		MPASS(lock == sq->sq_lock);
 		MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
 		/* Park this thread's own queue on the active queue's free list. */
 		LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
 	}
 	/*
 	 * Append the thread to the chosen blocked list and record where
 	 * it sleeps.  The thread no longer owns a sleep queue until
 	 * sleepq_resume_thread() hands one back.
 	 */
 	thread_lock(td);
 	TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
 	sq->sq_blockedcnt[queue]++;
 	td->td_sleepqueue = NULL;
 	td->td_sqqueue = queue;
 	td->td_wchan = wchan;
 	td->td_wmesg = wmesg;
 	if (flags & SLEEPQ_INTERRUPTIBLE) {
 		/* Mark the sleep interruptible and drop any stale abort flag. */
 		td->td_flags |= TDF_SINTR;
 		td->td_flags &= ~TDF_SLEEPABORT;
 		if (flags & SLEEPQ_STOP_ON_BDRY)
 			td->td_flags |= TDF_SBDRY;
 	}
 	thread_unlock(td);
 }
 
 /*
  * Arm the current thread's sleep callout so that sleepq_timeout() runs
  * for it after 'timo' ticks.  The thread must already be on a sleep
  * queue and the chain lock for 'wchan' must be held (both asserted).
  */
 void
 sleepq_set_timeout(void *wchan, int timo)
 {
 	struct sleepqueue_chain *chain;
 	struct thread *self;
 
 	self = curthread;
 	chain = SC_LOOKUP(wchan);
 	mtx_assert(&chain->sc_lock, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(self));
 	MPASS(self->td_sleepqueue == NULL);
 	MPASS(wchan != NULL);
 	callout_reset_curcpu(&self->td_slpcallout, timo, sleepq_timeout, self);
 }
 
 /*
  * Report how many threads are blocked on the given queue of a wait
  * channel; 0 when no sleep queue is attached to the channel.
  */
 u_int
 sleepq_sleepcnt(void *wchan, int queue)
 {
 	struct sleepqueue *found;
 
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	found = sleepq_lookup(wchan);
 	return (found != NULL ? found->sq_blockedcnt[queue] : 0);
 }
 
 /*
  * Marks the pending sleep of the current thread as interruptible and
  * makes an initial check for pending signals before putting a thread
  * to sleep. Enters and exits with the thread lock held.  Thread lock
  * may have transitioned from the sleepq lock to a run lock.
  *
  * Called with the chain lock for 'wchan' held (asserted).  Returns 0
  * after the thread went through sleepq_switch(), or EINTR/ERESTART when
  * the sleep was not entered; in the latter case the thread has been
  * taken off the sleep queue (if it was still on it) and the chain lock
  * has been released.
  */
 static int
 sleepq_catch_signals(void *wchan, int pri)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	struct proc *p;
 	struct sigacts *ps;
 	int sig, ret, stop_allowed;
 
 	td = curthread;
 	p = curproc;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(wchan != NULL);
 	/* A pending TDP_WAKEUP is consumed and reported as EINTR. */
 	if ((td->td_pflags & TDP_WAKEUP) != 0) {
 		td->td_pflags &= ~TDP_WAKEUP;
 		ret = EINTR;
 		thread_lock(td);
 		goto out;
 	}
 
 	/*
 	 * See if there are any pending signals for this thread.  If not
 	 * we can switch immediately.  Otherwise do the signal processing
 	 * directly.
 	 */
 	thread_lock(td);
 	if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) {
 		sleepq_switch(wchan, pri);
 		return (0);
 	}
 	stop_allowed = (td->td_flags & TDF_SBDRY) ? SIG_STOP_NOT_ALLOWED :
 	    SIG_STOP_ALLOWED;
 	/* Drop both spin locks before taking the process lock. */
 	thread_unlock(td);
 	mtx_unlock_spin(&sc->sc_lock);
 	CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
 		(void *)td, (long)p->p_pid, td->td_name);
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	sig = cursig(td, stop_allowed);
 	if (sig == 0) {
 		/* No signal to deliver; check for a suspension request. */
 		mtx_unlock(&ps->ps_mtx);
 		ret = thread_suspend_check(1);
 		MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
 	} else {
 		/* ps_sigintr membership selects EINTR over ERESTART. */
 		if (SIGISMEMBER(ps->ps_sigintr, sig))
 			ret = EINTR;
 		else
 			ret = ERESTART;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	/*
 	 * Lock the per-process spinlock prior to dropping the PROC_LOCK
 	 * to avoid a signal delivery race.  PROC_LOCK, PROC_SLOCK, and
 	 * thread_lock() are currently held in tdsendsignal().
 	 */
 	PROC_SLOCK(p);
 	mtx_lock_spin(&sc->sc_lock);
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	PROC_SUNLOCK(p);
 	if (ret == 0) {
 		sleepq_switch(wchan, pri);
 		return (0);
 	}
 out:
 	/*
 	 * There were pending signals and this thread is still
 	 * on the sleep queue, remove it from the sleep queue.
 	 */
 	if (TD_ON_SLEEPQ(td)) {
 		sq = sleepq_lookup(wchan);
 		if (sleepq_resume_thread(sq, td, 0)) {
 #ifdef INVARIANTS
 			/*
 			 * This thread hasn't gone to sleep yet, so it
 			 * should not be swapped out.
 			 */
 			panic("not waking up swapper");
 #endif
 		}
 	}
 	mtx_unlock_spin(&sc->sc_lock);
 	MPASS(td->td_lock != &sc->sc_lock);
 	return (ret);
 }
 
 /*
  * Switches to another thread if we are still asleep on a sleep queue.
  * Returns with thread lock.
  *
  * Called with both the chain lock for 'wchan' and the thread lock held
  * (asserted).  The two early-return paths release the chain lock
  * explicitly.  'pri' is passed on to sched_sleep().
  */
 static void
 sleepq_switch(void *wchan, int pri)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* 
 	 * If we have a sleep queue, then we've already been woken up, so
 	 * just return.
 	 */
 	if (td->td_sleepqueue != NULL) {
 		mtx_unlock_spin(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * If TDF_TIMEOUT is set, then our sleep has been timed out
 	 * already but we are still on the sleep queue, so dequeue the
 	 * thread and return.
 	 */
 	if (td->td_flags & TDF_TIMEOUT) {
 		MPASS(TD_ON_SLEEPQ(td));
 		sq = sleepq_lookup(wchan);
 		if (sleepq_resume_thread(sq, td, 0)) {
 #ifdef INVARIANTS
 			/*
 			 * This thread hasn't gone to sleep yet, so it
 			 * should not be swapped out.
 			 */
 			panic("not waking up swapper");
 #endif
 		}
 		mtx_unlock_spin(&sc->sc_lock);
 		return;		
 	}
 #ifdef SLEEPQUEUE_PROFILING
 	/* Account this sleep against the thread's wait message. */
 	if (prof_enabled)
 		sleepq_profile(td->td_wmesg);
 #endif
 	MPASS(td->td_sleepqueue == NULL);
 	sched_sleep(td, pri);
 	/* Hand the thread lock over to the chain lock, then switch away. */
 	thread_lock_set(td, &sc->sc_lock);
 	TD_SET_SLEEPING(td);
 	mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
 	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
 	CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 }
 
 /*
  * Check to see if we timed out.
  *
  * Called with the current thread's lock held (asserted).  Returns
  * EWOULDBLOCK when TDF_TIMEOUT was set (the flag is cleared), and 0
  * otherwise.
  */
 static int
 sleepq_check_timeout(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If TDF_TIMEOUT is set, we timed out.
 	 */
 	if (td->td_flags & TDF_TIMEOUT) {
 		td->td_flags &= ~TDF_TIMEOUT;
 		return (EWOULDBLOCK);
 	}
 
 	/*
 	 * If TDF_TIMOFAIL is set, the timeout ran after we had
 	 * already been woken up.
 	 */
 	if (td->td_flags & TDF_TIMOFAIL)
 		td->td_flags &= ~TDF_TIMOFAIL;
 
 	/*
 	 * If callout_stop() fails, then the timeout is running on
 	 * another CPU, so synchronize with it to avoid having it
 	 * accidentally wake up a subsequent sleep.
 	 *
 	 * Setting TDF_TIMEOUT here is what lets sleepq_timeout()
 	 * recognize this case and make the thread runnable again.
 	 */
 	else if (callout_stop(&td->td_slpcallout) == 0) {
 		td->td_flags |= TDF_TIMEOUT;
 		TD_SET_SLEEPING(td);
 		mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
 	}
 	return (0);
 }
 
 /*
  * Determine whether the sleep was ended by sleepq_abort().  Called with
  * the current thread's lock held (asserted).  The interruptible-sleep
  * markers are cleared; when TDF_SLEEPABORT is set it is cleared too and
  * the recorded td_intrval is returned, otherwise 0.
  */
 static int
 sleepq_check_signals(void)
 {
 	struct thread *self;
 
 	self = curthread;
 	THREAD_LOCK_ASSERT(self, MA_OWNED);
 
 	/* The interruptible sleep is over. */
 	if ((self->td_flags & TDF_SINTR) != 0)
 		self->td_flags &= ~(TDF_SINTR | TDF_SBDRY);
 
 	if ((self->td_flags & TDF_SLEEPABORT) == 0)
 		return (0);
 
 	self->td_flags &= ~TDF_SLEEPABORT;
 	return (self->td_intrval);
 }
 
 /*
  * Put the current thread to sleep until it is woken from its sleep
  * queue.  Must not be used for an interruptible sleep (asserted).
  */
 void
 sleepq_wait(void *wchan, int pri)
 {
 	struct thread *self;
 
 	self = curthread;
 	MPASS(!(self->td_flags & TDF_SINTR));
 	thread_lock(self);
 	sleepq_switch(wchan, pri);
 	thread_unlock(self);
 }
 
 /*
  * Put the current thread to sleep until it is woken from its sleep
  * queue or a signal interrupts it.  A non-zero result from the initial
  * signal check takes precedence over the post-sleep check.
  */
 int
 sleepq_wait_sig(void *wchan, int pri)
 {
 	int caught, aborted;
 
 	caught = sleepq_catch_signals(wchan, pri);
 	aborted = sleepq_check_signals();
 	thread_unlock(curthread);
 	return (caught != 0 ? caught : aborted);
 }
 
 /*
  * Put the current thread to sleep until it is woken from its sleep
  * queue or its timeout fires.  Returns the result of
  * sleepq_check_timeout().  Must not be used for an interruptible sleep
  * (asserted).
  */
 int
 sleepq_timedwait(void *wchan, int pri)
 {
 	struct thread *self;
 	int timed_out;
 
 	self = curthread;
 	MPASS(!(self->td_flags & TDF_SINTR));
 	thread_lock(self);
 	sleepq_switch(wchan, pri);
 	timed_out = sleepq_check_timeout();
 	thread_unlock(self);
 
 	return (timed_out);
 }
 
 /*
  * Put the current thread to sleep until it is woken from its sleep
  * queue, a signal interrupts it, or its timeout fires.  The result is
  * picked in this order of precedence: the initial signal check, the
  * post-sleep signal check, the timeout check.
  */
 int
 sleepq_timedwait_sig(void *wchan, int pri)
 {
 	int caught, timed_out, aborted;
 
 	caught = sleepq_catch_signals(wchan, pri);
 	timed_out = sleepq_check_timeout();
 	aborted = sleepq_check_signals();
 	thread_unlock(curthread);
 
 	if (caught != 0)
 		return (caught);
 	return (aborted != 0 ? aborted : timed_out);
 }
 
 /*
  * Look up the type of the sleep queue attached to a wait channel.
  * Takes and drops the chain lock itself.  Returns -1 when no sleep
  * queue is attached to the channel.
  */
 int
 sleepq_type(void *wchan)
 {
 	struct sleepqueue *found;
 	int type;
 
 	MPASS(wchan != NULL);
 
 	sleepq_lock(wchan);
 	found = sleepq_lookup(wchan);
 	type = (found != NULL) ? found->sq_type : -1;
 	sleepq_release(wchan);
 	return (type);
 }
 
 /*
  * Removes a thread from a sleep queue and makes it
  * runnable.
  *
  * Called with the thread lock and the chain lock of the queue's wait
  * channel held (both asserted).  A non-zero 'pri' raises the thread to
  * that priority if its current priority value is numerically larger.
  * Returns the result of setrunnable() when the thread was sleeping and
  * 0 otherwise; callers in this file treat a non-zero result as a request
  * to wake up the swapper.
  */
 static int
 sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
 {
 	struct sleepqueue_chain *sc;
 
 	MPASS(td != NULL);
 	MPASS(sq->sq_wchan != NULL);
 	MPASS(td->td_wchan == sq->sq_wchan);
 	MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sc = SC_LOOKUP(sq->sq_wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 
 	/* Remove the thread from the queue. */
 	sq->sq_blockedcnt[td->td_sqqueue]--;
 	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
 
 	/*
 	 * Get a sleep queue for this thread.  If this is the last waiter,
 	 * use the queue itself and take it out of the chain, otherwise,
 	 * remove a queue from the free list.
 	 */
 	if (LIST_EMPTY(&sq->sq_free)) {
 		td->td_sleepqueue = sq;
 #ifdef INVARIANTS
 		sq->sq_wchan = NULL;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		sc->sc_depth--;
 #endif
 	} else
 		td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
 	/* Unlinks from the chain or from the free list, whichever applies. */
 	LIST_REMOVE(td->td_sleepqueue, sq_hash);
 
 	td->td_wmesg = NULL;
 	td->td_wchan = NULL;
 	td->td_flags &= ~(TDF_SINTR | TDF_SBDRY);
 
 	CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, td->td_name);
 
 	/* Adjust priority if requested. */
 	MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
 	if (pri != 0 && td->td_priority > pri)
 		sched_prio(td, pri);
 
 	/*
 	 * Note that thread td might not be sleeping if it is running
 	 * sleepq_catch_signals() on another CPU or is blocked on its
 	 * proc lock to check signals.  There's no need to mark the
 	 * thread runnable in that case.
 	 */
 	if (TD_IS_SLEEPING(td)) {
 		TD_CLR_SLEEPING(td);
 		return (setrunnable(td));
 	}
 	return (0);
 }
 
 #ifdef INVARIANTS
 /*
  * UMA zone item destructor (INVARIANTS only).  Verifies that a sleep
  * queue being released has no blocked threads left on any of its queues.
  */
 static void
 sleepq_dtor(void *mem, int size, void *arg)
 {
 	struct sleepqueue *queue;
 	int q;
 
 	queue = mem;
 	for (q = 0; q < NR_SLEEPQS; q++) {
 		MPASS(TAILQ_EMPTY(&queue->sq_blocked[q]));
 		MPASS(queue->sq_blockedcnt[q] == 0);
 	}
 }
 #endif
 
 /*
  * UMA zone item initializer.  Zeroes the item, then sets up the empty
  * blocked lists (with zero counts) and the empty free list.  Always
  * returns 0.
  */
 static int
 sleepq_init(void *mem, int size, int flags)
 {
 	struct sleepqueue *queue;
 	int q;
 
 	bzero(mem, size);
 	queue = mem;
 	for (q = 0; q < NR_SLEEPQS; q++) {
 		TAILQ_INIT(&queue->sq_blocked[q]);
 		queue->sq_blockedcnt[q] = 0;
 	}
 	LIST_INIT(&queue->sq_free);
 	return (0);
 }
 
 /*
  * Resume the highest priority thread sleeping on the given queue of a
  * wait channel.  Returns 0 when no sleep queue is attached to the
  * channel, otherwise the result of sleepq_resume_thread().
  */
 int
 sleepq_signal(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
 	struct thread *cand, *winner;
 	int wakeup_swapper;
 
 	CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL)
 		return (0);
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/*
 	 * Scan for the thread with the numerically lowest td_priority.
 	 * Only a strictly lower value replaces the current pick, so on a
 	 * tie the thread nearest the head (the longest sleeper, since
 	 * threads are appended at the tail) is kept.
 	 */
 	winner = NULL;
 	TAILQ_FOREACH(cand, &sq->sq_blocked[queue], td_slpq) {
 		if (winner != NULL &&
 		    cand->td_priority >= winner->td_priority)
 			continue;
 		winner = cand;
 	}
 	MPASS(winner != NULL);
 	thread_lock(winner);
 	wakeup_swapper = sleepq_resume_thread(sq, winner, pri);
 	thread_unlock(winner);
 	return (wakeup_swapper);
 }
 
 /*
  * Resume every thread sleeping on the given queue of a wait channel.
  * Returns 1 if any sleepq_resume_thread() call returned non-zero, and
  * 0 otherwise (including when no sleep queue is attached).
  */
 int
 sleepq_broadcast(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
 	struct thread *cur, *next;
 	int need_swapper;
 
 	CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL)
 		return (0);
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/* The _SAFE iterator is needed as each thread is unlinked in turn. */
 	need_swapper = 0;
 	TAILQ_FOREACH_SAFE(cur, &sq->sq_blocked[queue], td_slpq, next) {
 		thread_lock(cur);
 		if (sleepq_resume_thread(sq, cur, pri) != 0)
 			need_swapper = 1;
 		thread_unlock(cur);
 	}
 	return (need_swapper);
 }
 
 /*
  * Time sleeping threads out.  When the timeout expires, the thread is
  * removed from the sleep queue and made runnable if it is still asleep.
  *
  * This is the callout handler armed by sleepq_set_timeout(); 'arg' is
  * the thread whose sleep is timing out.
  */
 static void
 sleepq_timeout(void *arg)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	void *wchan;
 	int wakeup_swapper;
 
 	td = arg;
 	wakeup_swapper = 0;
 	CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 
 	/*
 	 * First, see if the thread is asleep and get the wait channel if
 	 * it is.
 	 */
 	thread_lock(td);
 	if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
 		wchan = td->td_wchan;
 		sc = SC_LOOKUP(wchan);
 		/* A sleeping thread's lock is expected to be its chain lock. */
 		THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
 		sq = sleepq_lookup(wchan);
 		MPASS(sq != NULL);
 		td->td_flags |= TDF_TIMEOUT;
 		wakeup_swapper = sleepq_resume_thread(sq, td, 0);
 		thread_unlock(td);
 		if (wakeup_swapper)
 			kick_proc0();
 		return;
 	}
 
 	/*
 	 * If the thread is on the SLEEPQ but isn't sleeping yet, it
 	 * can either be on another CPU in between sleepq_add() and
 	 * one of the sleepq_*wait*() routines or it can be in
 	 * sleepq_catch_signals().
 	 */
 	if (TD_ON_SLEEPQ(td)) {
 		td->td_flags |= TDF_TIMEOUT;
 		thread_unlock(td);
 		return;
 	}
 
 	/*
 	 * Now check for the edge cases.  First, if TDF_TIMEOUT is set,
 	 * then the other thread has already yielded to us, so clear
 	 * the flag and resume it.  If TDF_TIMEOUT is not set, then
 	 * we know that the other thread is not on a sleep queue, but it
 	 * hasn't resumed execution yet.  In that case, set TDF_TIMOFAIL
 	 * to let it know that the timeout has already run and doesn't
 	 * need to be canceled.
 	 */
 	if (td->td_flags & TDF_TIMEOUT) {
 		MPASS(TD_IS_SLEEPING(td));
 		td->td_flags &= ~TDF_TIMEOUT;
 		TD_CLR_SLEEPING(td);
 		wakeup_swapper = setrunnable(td);
 	} else
 		td->td_flags |= TDF_TIMOFAIL;
 	thread_unlock(td);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Resumes a specific thread from the sleep queue associated with a specific
  * wait channel if it is on that queue.
  *
  * Takes and drops the chain lock for 'wchan' itself.  Nothing is done
  * when the thread is not on a sleep queue or is sleeping on a different
  * wait channel.
  */
 void
 sleepq_remove(struct thread *td, void *wchan)
 {
 	struct sleepqueue *sq;
 	int wakeup_swapper;
 
 	/*
 	 * Look up the sleep queue for this wait channel, then re-check
 	 * that the thread is asleep on that channel, if it is not, then
 	 * bail.
 	 */
 	MPASS(wchan != NULL);
 	sleepq_lock(wchan);
 	sq = sleepq_lookup(wchan);
 	/*
 	 * We can not lock the thread here as it may be sleeping on a
 	 * different sleepq.  However, holding the sleepq lock for this
 	 * wchan can guarantee that we do not miss a wakeup for this
 	 * channel.  The asserts below will catch any false positives.
 	 */
 	if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
 		sleepq_release(wchan);
 		return;
 	}
 	/* Thread is asleep on sleep queue sq, so wake it up. */
 	thread_lock(td);
 	MPASS(sq != NULL);
 	MPASS(td->td_wchan == wchan);
 	wakeup_swapper = sleepq_resume_thread(sq, td, 0);
 	thread_unlock(td);
 	sleepq_release(wchan);
 	/* Kick the swapper only after all locks have been dropped. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Abort a thread as if an interrupt had occurred.  Only abort
  * interruptible waits (unfortunately it isn't safe to abort others).
  *
  * Called with the thread lock held; the thread must be on a sleep queue
  * in an interruptible sleep and 'intrval' must be EINTR or ERESTART
  * (all asserted).  'intrval' is recorded in the thread and later
  * returned by sleepq_check_signals().  Returns the result of
  * sleepq_resume_thread() when the thread was actually sleeping, and 0
  * otherwise.
  */
 int
 sleepq_abort(struct thread *td, int intrval)
 {
 	struct sleepqueue *sq;
 	void *wchan;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(td));
 	MPASS(td->td_flags & TDF_SINTR);
 	MPASS(intrval == EINTR || intrval == ERESTART);
 
 	/*
 	 * If the TDF_TIMEOUT flag is set, just leave. A
 	 * timeout is scheduled anyhow.
 	 */
 	if (td->td_flags & TDF_TIMEOUT)
 		return (0);
 
 	CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 	td->td_intrval = intrval;
 	td->td_flags |= TDF_SLEEPABORT;
 	/*
 	 * If the thread has not slept yet it will find the signal in
 	 * sleepq_catch_signals() and call sleepq_resume_thread.  Otherwise
 	 * we have to do it here.
 	 */
 	if (!TD_IS_SLEEPING(td))
 		return (0);
 	wchan = td->td_wchan;
 	MPASS(wchan != NULL);
 	sq = sleepq_lookup(wchan);
 	MPASS(sq != NULL);
 
 	/* Thread is asleep on sleep queue sq, so wake it up. */
 	return (sleepq_resume_thread(sq, td, 0));
 }
 
 #ifdef SLEEPQUEUE_PROFILING
 #define	SLEEPQ_PROF_LOCATIONS	1024
-#define	SLEEPQ_SBUFSIZE		(40 * 512)
+#define	SLEEPQ_SBUFSIZE		512
 /*
  * One profiling record: how many times sleepq_profile() was called for a
  * given wait message.  Records are keyed by the wmesg pointer itself
  * (compared by address, not by string contents).
  */
 struct sleepq_prof {
 	LIST_ENTRY(sleepq_prof) sp_link;	/* Free list or hash bucket. */
 	const char	*sp_wmesg;		/* Wait message (key). */
 	long		sp_count;		/* Number of hits. */
 };
 
 LIST_HEAD(sqphead, sleepq_prof);
 
 /* Records not yet assigned to a wait message. */
 struct sqphead sleepq_prof_free;
 /* In-use records, bucketed by SC_HASH() of the wmesg pointer. */
 struct sqphead sleepq_hash[SC_TABLESIZE];
 /* Backing storage for all records; threaded onto the lists above. */
 static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
 /* Spin mutex taken around updates of the lists and of prof_enabled. */
 static struct mtx sleepq_prof_lock;
 MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
 
 static void
 sleepq_profile(const char *wmesg)
 {
 	struct sleepq_prof *sp;
 
 	mtx_lock_spin(&sleepq_prof_lock);
 	if (prof_enabled == 0)
 		goto unlock;
 	LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
 		if (sp->sp_wmesg == wmesg)
 			goto done;
 	sp = LIST_FIRST(&sleepq_prof_free);
 	if (sp == NULL)
 		goto unlock;
 	sp->sp_wmesg = wmesg;
 	LIST_REMOVE(sp, sp_link);
 	LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
 done:
 	sp->sp_count++;
 unlock:
 	mtx_unlock_spin(&sleepq_prof_lock);
 	return;
 }
 
 static void
 sleepq_prof_reset(void)
 {
 	struct sleepq_prof *sp;
 	int enabled;
 	int i;
 
 	mtx_lock_spin(&sleepq_prof_lock);
 	enabled = prof_enabled;
 	prof_enabled = 0;
 	for (i = 0; i < SC_TABLESIZE; i++)
 		LIST_INIT(&sleepq_hash[i]);
 	LIST_INIT(&sleepq_prof_free);
 	for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
 		sp = &sleepq_profent[i];
 		sp->sp_wmesg = NULL;
 		sp->sp_count = 0;
 		LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
 	}
 	prof_enabled = enabled;
 	mtx_unlock_spin(&sleepq_prof_lock);
 }
 
 static int
 enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	v = prof_enabled;
 	error = sysctl_handle_int(oidp, &v, v, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == prof_enabled)
 		return (0);
 	if (v == 1)
 		sleepq_prof_reset();
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = !!v;
 	mtx_unlock_spin(&sleepq_prof_lock);
 
 	return (0);
 }
 
 static int
 reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	v = 0;
 	error = sysctl_handle_int(oidp, &v, 0, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == 0)
 		return (0);
 	sleepq_prof_reset();
 
 	return (0);
 }
 
 /*
  * Sysctl handler that reports the profiling statistics as text: a
  * "wmesg<TAB>count" header followed by one line per record found in the
  * sleepq_hash buckets.  Profiling is switched off while the buckets are
  * walked and its previous setting is restored afterwards.
  */
 static int
 dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
 {
-	static int multiplier = 1;
 	struct sleepq_prof *sp;
 	struct sbuf *sb;
 	int enabled;
 	int error;
 	int i;
 
-retry_sbufops:
-	sb = sbuf_new(NULL, NULL, SLEEPQ_SBUFSIZE * multiplier, SBUF_FIXEDLEN);
+	sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
 	sbuf_printf(sb, "\nwmesg\tcount\n");
 	enabled = prof_enabled;
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = 0;
 	mtx_unlock_spin(&sleepq_prof_lock);
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
 			sbuf_printf(sb, "%s\t%ld\n",
 			    sp->sp_wmesg, sp->sp_count);
-			if (sbuf_overflowed(sb)) {
-				sbuf_delete(sb);
-				multiplier++;
-				goto retry_sbufops;
-			}
 		}
 	}
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = enabled;
 	mtx_unlock_spin(&sleepq_prof_lock);
 
-	sbuf_finish(sb);
-	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Profiling controls under the _debug_sleepq sysctl node: a read-only
  * string with the statistics, and read-write integers to reset the
  * statistics and to switch profiling on or off.
  */
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, reset_sleepq_prof_stats, "I",
     "Reset sleepqueue profiling statistics");
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
 #endif
 
 #ifdef DDB
 /*
  * DDB "show sleepq" command.  The address argument is tried first as a
  * wait channel and then as the address of a sleep queue that is on one
  * of the chains.  For the queue found, the wait channel, the queue type,
  * the associated lock (INVARIANTS only) and the threads blocked on each
  * of the NR_SLEEPQS queues are printed, followed by the recorded count
  * for that queue.
  */
 DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 #ifdef INVARIANTS
 	struct lock_object *lock;
 #endif
 	struct thread *td;
 	void *wchan;
 	int i;
 
 	if (!have_addr)
 		return;
 
 	/*
 	 * First, see if there is an active sleep queue for the wait channel
 	 * indicated by the address.
 	 */
 	wchan = (void *)addr;
 	sc = SC_LOOKUP(wchan);
 	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
 		if (sq->sq_wchan == wchan)
 			goto found;
 
 	/*
 	 * Second, see if there is an active sleep queue at the address
 	 * indicated.
 	 */
 	for (i = 0; i < SC_TABLESIZE; i++)
 		LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
 			if (sq == (struct sleepqueue *)addr)
 				goto found;
 		}
 
 	db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
 	return;
 found:
 	db_printf("Wait channel: %p\n", sq->sq_wchan);
 	db_printf("Queue type: %d\n", sq->sq_type);
 #ifdef INVARIANTS
 	if (sq->sq_lock) {
 		lock = sq->sq_lock;
 		db_printf("Associated Interlock: %p - (%s) %s\n", lock,
 		    LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	}
 #endif
 	db_printf("Blocked threads:\n");
 	for (i = 0; i < NR_SLEEPQS; i++) {
 		db_printf("\nQueue[%d]:\n", i);
 		if (TAILQ_EMPTY(&sq->sq_blocked[i]))
 			db_printf("\tempty\n");
 		else
 			/* Walk queue i itself, not always queue 0. */
 			TAILQ_FOREACH(td, &sq->sq_blocked[i],
 				      td_slpq) {
 				db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
 					  td->td_tid, td->td_proc->p_pid,
 					  td->td_name);
 			}
 		db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
 	}
 }
 
 /* Alias 'show sleepqueue' to 'show sleepq'. */
 DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
 #endif
Index: head/sys/kern/subr_witness.c
===================================================================
--- head/sys/kern/subr_witness.c	(revision 212369)
+++ head/sys/kern/subr_witness.c	(revision 212370)
@@ -1,2810 +1,2799 @@
 /*-
  * Copyright (c) 2008 Isilon Systems, Inc.
  * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
  * Copyright (c) 1998 Berkeley Software Design, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Implementation of the `witness' lock verifier.  Originally implemented for
  * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
  * classes in FreeBSD.
  */
 
 /*
  *	Main Entry: witness
  *	Pronunciation: 'wit-n&s
  *	Function: noun
  *	Etymology: Middle English witnesse, from Old English witnes knowledge,
  *	    testimony, witness, from 2wit
  *	Date: before 12th century
  *	1 : attestation of a fact or event : TESTIMONY
  *	2 : one that gives evidence; specifically : one who testifies in
  *	    a cause or before a judicial tribunal
  *	3 : one asked to be present at a transaction so as to be able to
  *	    testify to its having taken place
  *	4 : one who has personal knowledge of something
  *	5 a : something serving as evidence or proof : SIGN
  *	  b : public affirmation by word or example of usually
  *	      religious faith or conviction <the heroic witness to divine
  *	      life -- Pilot>
  *	6 capitalized : a member of the Jehovah's Witnesses 
  */
 
 /*
  * Special rules concerning Giant and lock orders:
  *
  * 1) Giant must be acquired before any other mutexes.  Stated another way,
  *    no other mutex may be held when Giant is acquired.
  *
  * 2) Giant must be released when blocking on a sleepable lock.
  *
  * This rule is less obvious, but is a result of Giant providing the same
  * semantics as spl().  Basically, when a thread sleeps, it must release
  * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
  * 2).
  *
  * 3) Giant may be acquired before or after sleepable locks.
  *
  * This rule is also not quite as obvious.  Giant may be acquired after
  * a sleepable lock because it is a non-sleepable lock and non-sleepable
  * locks may always be acquired while holding a sleepable lock.  The second
  * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
  * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
  * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
  * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
  * execute.  Thus, acquiring Giant both before and after a sleepable lock
  * will not result in a lock order reversal.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_stack.h"
 #include "opt_witness.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #if !defined(DDB) && !defined(STACK)
 #error "DDB or STACK options are required for WITNESS"
 #endif
 
 /* Note that these traces do not work with KTR_ALQ. */
 #if 0
 #define	KTR_WITNESS	KTR_SUBSYS
 #else
 #define	KTR_WITNESS	0
 #endif
 
 #define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
 #define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
 #define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */
 
 /* Define this to check for blessed mutexes */
 #undef BLESSING
 
 #define	WITNESS_COUNT 		1024
 #define	WITNESS_CHILDCOUNT 	(WITNESS_COUNT * 4)
 #define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
 #define	WITNESS_PENDLIST	768
 
 /* Allocate 256 KB of stack data space */
 #define	WITNESS_LO_DATA_COUNT	2048
 
 /* Prime, gives load factor of ~2 at full load */
 #define	WITNESS_LO_HASH_SIZE	1021
 
 /*
  * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
  * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
  * probably be safe for the most part, but it's still a SWAG.
  */
 #define	LOCK_NCHILDREN	5
 #define	LOCK_CHILDCOUNT	2048
 
 #define	MAX_W_NAME	64
 
 #define	BADSTACK_SBUF_SIZE	(256 * WITNESS_COUNT)
-#define	CYCLEGRAPH_SBUF_SIZE	8192
-#define	FULLGRAPH_SBUF_SIZE	32768
+#define	FULLGRAPH_SBUF_SIZE	512
 
 /*
  * These flags go in the witness relationship matrix and describe the
  * relationship between any two struct witness objects.
  */
 #define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
 #define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
 #define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
 #define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
 #define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
 #define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
 #define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
 #define	WITNESS_RELATED_MASK						\
 	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
 #define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
 					  * observed. */
 #define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
 #define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
 #define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */
 
 /* Descendant to ancestor flags */
 #define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)
 
 /* Ancestor to descendant flags */
 #define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
 
 #define	WITNESS_INDEX_ASSERT(i)						\
 	MPASS((i) > 0 && (i) <= w_max_used_index && (i) < WITNESS_COUNT)
 
 MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
 
 /*
  * Lock instances.  A lock instance is the data associated with a lock while
  * it is held by witness.  For example, a lock instance will hold the
  * recursion count of a lock.  Lock instances are held in lists.  Spin locks
  * are held in a per-cpu list while sleep locks are held in per-thread list.
  */
 struct lock_instance {
 	/* The lock object this instance describes. */
 	struct lock_object	*li_lock;
 	/* NOTE(review): presumably the source file of the acquisition. */
 	const char		*li_file;
 	/* NOTE(review): presumably the source line of the acquisition. */
 	int			li_line;
 	/*
 	 * NOTE(review): presumably the LI_* bits defined above
 	 * (LI_RECURSEMASK recursion depth, LI_EXCLUSIVE, LI_NORELEASE);
 	 * confirm against the code that sets them.
 	 */
 	u_int			li_flags;
 };
 
 /*
  * A simple list type used to build the list of locks held by a thread
  * or CPU.  We can't simply embed the list in struct lock_object since a
  * lock may be held by more than one thread if it is a shared lock.  Locks
  * are added to the head of the list, so we fill up each list entry from
  * "the back" logically.  To ease some of the arithmetic, we actually fill
  * in each list entry the normal way (children[0] then children[1], etc.) but
  * when we traverse the list we read children[count-1] as the first entry
  * down to children[0] as the final entry.
  */
 struct lock_list_entry {
 	/* Next entry in the singly-linked list of held locks. */
 	struct lock_list_entry	*ll_next;
 	/* Instance slots; filled from index 0, read back from count-1. */
 	struct lock_instance	ll_children[LOCK_NCHILDREN];
 	/* Number of ll_children slots in use; 0 means the entry is empty. */
 	u_int			ll_count;
 };
 
 /*
  * The main witness structure. One of these per named lock type in the system
  * (for example, "vnode interlock").
  */
 struct witness {
 	/* Name of the lock type; printed by the DDB display routines. */
 	char  			w_name[MAX_W_NAME];
 	uint32_t 		w_index;  /* Index in the relationship matrix */
 	/* Lock class of this lock type (its lc_name is shown in DDB). */
 	struct lock_class	*w_class;
 	STAILQ_ENTRY(witness) 	w_list;		/* List of all witnesses. */
 	STAILQ_ENTRY(witness) 	w_typelist;	/* Witnesses of a type. */
 	struct witness		*w_hash_next; /* Linked list in hash buckets. */
 	const char		*w_file; /* File where last acquired */
 	uint32_t 		w_line; /* Line where last acquired */
 	/*
 	 * Dropped by witness_destroy(); when it reaches zero the witness
 	 * is handed to depart().
 	 */
 	uint32_t 		w_refcount;
 	uint16_t 		w_num_ancestors; /* direct/indirect
 						  * ancestor count */
 	uint16_t 		w_num_descendants; /* direct/indirect
 						    * descendant count */
 	/*
 	 * Depth computed by witness_ddb_compute_levels(): -1 when no level
 	 * has been assigned, 0 for a witness without ancestors.
 	 */
 	int16_t 		w_ddb_level;
 	/* Set once printed so a DDB dump shows each subtree only once. */
 	unsigned		w_displayed:1;
 	/* NOTE(review): presumably marks an observed reversal; confirm. */
 	unsigned		w_reversed:1;
 };
 
 STAILQ_HEAD(witness_list, witness);
 
 /*
  * The witness hash table. Keys are witness names (const char *), elements are
  * witness objects (struct witness *).
  */
 struct witness_hash {
 	/* Bucket heads; a bucket's chain is linked through w_hash_next. */
 	struct witness	*wh_array[WITNESS_HASH_SIZE];
 	/* NOTE(review): presumably the bucket count; confirm in the init. */
 	uint32_t	wh_size;
 	/* NOTE(review): presumably the number of stored witnesses. */
 	uint32_t	wh_count;
 };
 
 /*
  * Key type for the lock order data hash table.
  */
 struct witness_lock_order_key {
 	/*
 	 * A pair of witness indices.  Both being zero is what
 	 * witness_lock_order_key_empty() treats as the empty key.
 	 */
 	uint16_t	from;
 	uint16_t	to;
 };
 
 struct witness_lock_order_data {
 	/* NOTE(review): presumably the stack saved for this lock order. */
 	struct stack			wlod_stack;
 	/* Witness index pair identifying this record. */
 	struct witness_lock_order_key	wlod_key;
 	/* NOTE(review): presumably the hash-bucket / free-list link. */
 	struct witness_lock_order_data	*wlod_next;
 };
 
 /*
  * The witness lock order data hash table. Keys are witness index tuples
  * (struct witness_lock_order_key), elements are lock order data objects
  * (struct witness_lock_order_data). 
  */
 struct witness_lock_order_hash {
 	/* Bucket heads of the lock order data table. */
 	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
 	/* NOTE(review): presumably the bucket count; confirm in the init. */
 	u_int	wloh_size;
 	/* NOTE(review): presumably the number of stored records. */
 	u_int	wloh_count;
 };
 
 #ifdef BLESSING
 /* One pair of lock names for the blessed list (BLESSING builds only). */
 struct witness_blessed {
 	const char	*b_lock1;
 	const char	*b_lock2;
 };
 #endif
 
 /*
  * A lock initialized while witness is still cold.  witness_init() records
  * it here and witness_initialize() enrolls it later.
  */
 struct witness_pendhelp {
 	/* Type string passed to witness_init(), later given to enroll(). */
 	const char		*wh_type;
 	/* The deferred lock; its lo_witness is filled in on enrollment. */
 	struct lock_object	*wh_lock;
 };
 
 /*
  * One row of the static order_lists table.  A row whose w_name is NULL
  * terminates a list.
  */
 struct witness_order_list_entry {
 	/* Lock type name handed to enroll(). */
 	const char		*w_name;
 	/* Lock class handed to enroll() together with the name. */
 	struct lock_class	*w_class;
 };
 
 /*
  * Compare the sleep/spin classification of two witnesses.
  *
  * Returns 1 when both lock classes carry the same LC_SLEEPLOCK and
  * LC_SPINLOCK bits, and 0 when they differ (e.g. one is a spin lock and
  * the other is not).
  */
 static __inline int
 witness_lock_type_equal(struct witness *w1, struct witness *w2)
 {
 
 	/* XOR leaves a bit set exactly where the two flag words disagree. */
 	return (((w1->w_class->lc_flags ^ w2->w_class->lc_flags) &
 	    (LC_SLEEPLOCK | LC_SPINLOCK)) == 0);
 }
 
 /*
  * Returns 1 when both indices of the key are zero (the empty key),
  * 0 otherwise.
  */
 static __inline int
 witness_lock_order_key_empty(const struct witness_lock_order_key *key)
 {
 
 	/* The OR of the two indices is zero only when both are zero. */
 	return ((key->from | key->to) == 0);
 }
 
 /*
  * Returns 1 when the two keys hold the same (from, to) index pair,
  * 0 otherwise.
  */
 static __inline int
 witness_lock_order_key_equal(const struct witness_lock_order_key *a,
     const struct witness_lock_order_key *b)
 {
 
 	if (a->from != b->from)
 		return (0);
 	return (a->to == b->to);
 }
 
 static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
 		    const char *fname);
 #ifdef KDB
 static void	_witness_debugger(int cond, const char *msg);
 #endif
 static void	adopt(struct witness *parent, struct witness *child);
 #ifdef BLESSING
 static int	blessed(struct witness *, struct witness *);
 #endif
 static void	depart(struct witness *w);
 static struct witness	*enroll(const char *description,
 			    struct lock_class *lock_class);
 static struct lock_instance	*find_instance(struct lock_list_entry *list,
 				    struct lock_object *lock);
 static int	isitmychild(struct witness *parent, struct witness *child);
 static int	isitmydescendant(struct witness *parent, struct witness *child);
 static void	itismychild(struct witness *parent, struct witness *child);
 static int	sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
 static void	witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
 #ifdef DDB
 static void	witness_ddb_compute_levels(void);
 static void	witness_ddb_display(int(*)(const char *fmt, ...));
 static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
 		    struct witness *, int indent);
 static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
 		    struct witness_list *list);
 static void	witness_ddb_level_descendants(struct witness *parent, int l);
 static void	witness_ddb_list(struct thread *td);
 #endif
 static void	witness_free(struct witness *m);
 static struct witness	*witness_get(void);
 static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
 static struct witness	*witness_hash_get(const char *key);
 static void	witness_hash_put(struct witness *w);
 static void	witness_init_hash_tables(void);
 static void	witness_increment_graph_generation(void);
 static void	witness_lock_list_free(struct lock_list_entry *lle);
 static struct lock_list_entry	*witness_lock_list_get(void);
 static int	witness_lock_order_add(struct witness *parent,
 		    struct witness *child);
 static int	witness_lock_order_check(struct witness *parent,
 		    struct witness *child);
 static struct witness_lock_order_data	*witness_lock_order_get(
 					    struct witness *parent,
 					    struct witness *child);
 static void	witness_list_lock(struct lock_instance *instance,
 		    int (*prnt)(const char *fmt, ...));
 static void	witness_setflag(struct lock_object *lock, int flag, int set);
 
 #ifdef KDB
 #define	witness_debugger(c)	_witness_debugger(c, __func__)
 #else
 #define	witness_debugger(c)
 #endif
 
 SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking");
 
 /*
  * If set to 0, lock order checking is disabled.  If set to -1,
  * witness is completely disabled.  Otherwise witness performs full
  * lock order checking for all locks.  At runtime, lock order checking
  * may be toggled.  However, witness cannot be reenabled once it is
  * completely disabled.
  */
 static int witness_watch = 1;
 TUNABLE_INT("debug.witness.watch", &witness_watch);
 SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RW | CTLTYPE_INT, NULL, 0,
     sysctl_debug_witness_watch, "I", "witness is watching lock operations");
 
 #ifdef KDB
 /*
  * When KDB is enabled and witness_kdb is 1, it will cause the system
  * to drop into kdebug() when:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 #ifdef WITNESS_KDB
 int	witness_kdb = 1;
 #else
 int	witness_kdb = 0;
 #endif
 TUNABLE_INT("debug.witness.kdb", &witness_kdb);
 SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RW, &witness_kdb, 0, "");
 
 /*
  * When KDB is enabled and witness_trace is 1, it will cause the system
  * to print a stack trace when:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 int	witness_trace = 1;
 TUNABLE_INT("debug.witness.trace", &witness_trace);
 SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RW, &witness_trace, 0, "");
 #endif /* KDB */
 
 #ifdef WITNESS_SKIPSPIN
 int	witness_skipspin = 1;
 #else
 int	witness_skipspin = 0;
 #endif
 TUNABLE_INT("debug.witness.skipspin", &witness_skipspin);
 SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin,
     0, "");
 
 /*
  * Call this to print out the relations between locks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
 
 /*
  * Call this to print out the witness faulty stacks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
 
 static struct mtx w_mtx;
 
 /* w_list */
 static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
 static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
 
 /* w_typelist */
 static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
 static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
 
 /* lock list */
 static struct lock_list_entry *w_lock_list_free = NULL;
 static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
 static u_int pending_cnt;
 
 static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
 SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
     "");
 
 static struct witness *w_data;
 static uint8_t w_rmatrix[WITNESS_COUNT+1][WITNESS_COUNT+1];
 static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
 static struct witness_hash w_hash;	/* The witness hash table. */
 
 /* The lock order data hash */
 static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
 static struct witness_lock_order_data *w_lofree = NULL;
 static struct witness_lock_order_hash w_lohash;
 static int w_max_used_index = 0;
 static unsigned int w_generation = 0;
 static const char w_notrunning[] = "Witness not running\n";
 static const char w_stillcold[] = "Witness is still cold\n";
 
 
 static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * sx locks
 	 */
 	{ "proctree", &lock_class_sx },
 	{ "allproc", &lock_class_sx },
 	{ "allprison", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * Various mutexes
 	 */
 	{ "Giant", &lock_class_mtx_sleep },
 	{ "pipe mutex", &lock_class_mtx_sleep },
 	{ "sigio lock", &lock_class_mtx_sleep },
 	{ "process group", &lock_class_mtx_sleep },
 	{ "process lock", &lock_class_mtx_sleep },
 	{ "session", &lock_class_mtx_sleep },
 	{ "uidinfo hash", &lock_class_rw },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-sleep", &lock_class_mtx_sleep },
 #endif
 	{ "time lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Sockets
 	 */
 	{ "accept", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "sellck", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "radix node head", &lock_class_rw },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv4 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "udpinp", &lock_class_rw },
 	{ "in_multi_mtx", &lock_class_mtx_sleep },
 	{ "igmp_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_mtx", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv6 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "udpinp", &lock_class_rw },
 	{ "in6_multi_mtx", &lock_class_mtx_sleep },
 	{ "mld_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_mtx", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UNIX Domain Sockets
 	 */
 	{ "unp_global_rwlock", &lock_class_rw },
 	{ "unp_list_lock", &lock_class_mtx_sleep },
 	{ "unp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UDP/IP
 	 */
 	{ "udp", &lock_class_rw },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
 	{ "tcp", &lock_class_rw },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * netatalk
 	 */
 	{ "ddp_list_mtx", &lock_class_mtx_sleep },
 	{ "ddp_mtx", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * BPF
 	 */
 	{ "bpf global lock", &lock_class_mtx_sleep },
 	{ "bpf interface lock", &lock_class_mtx_sleep },
 	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
 	 */
 	{ "nfsd_mtx", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 
 	/*
 	 * IEEE 802.11
 	 */
 	{ "802.11 com lock", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 	/*
 	 * Network drivers
 	 */
 	{ "network driver", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 
 	/*
 	 * Netgraph
 	 */
 	{ "ng_node", &lock_class_mtx_sleep },
 	{ "ng_worklist", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * CDEV
 	 */
 	{ "system map", &lock_class_mtx_sleep },
 	{ "vm page queue mutex", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
 	 * 
 	 */
 	{ "vm object", &lock_class_mtx_sleep },
 	{ "page lock", &lock_class_mtx_sleep },
 	{ "vm page queue mutex", &lock_class_mtx_sleep },
 	{ "pmap", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
 	 */
 	{ "kqueue", &lock_class_mtx_sleep },
 	{ "struct mount mtx", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },
 	{ "dr->dt.di.dr_mtx", &lock_class_sx },
 	{ "db->db_mtx", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * spin locks
 	 */
 #ifdef SMP
 	{ "ap boot", &lock_class_mtx_spin },
 #endif
 	{ "rm.mutex_mtx", &lock_class_mtx_spin },
 	{ "sio", &lock_class_mtx_spin },
 	{ "scrlock", &lock_class_mtx_spin },
 #ifdef __i386__
 	{ "cy", &lock_class_mtx_spin },
 #endif
 #ifdef __sparc64__
 	{ "pcib_mtx", &lock_class_mtx_spin },
 	{ "rtc_mtx", &lock_class_mtx_spin },
 #endif
 	{ "scc_hwmtx", &lock_class_mtx_spin },
 	{ "uart_hwmtx", &lock_class_mtx_spin },
 	{ "fast_taskqueue", &lock_class_mtx_spin },
 	{ "intr table", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-per-proc", &lock_class_mtx_spin },
 #endif
 	{ "process slock", &lock_class_mtx_spin },
 	{ "sleepq chain", &lock_class_mtx_spin },
 	{ "umtx lock", &lock_class_mtx_spin },
 	{ "rm_spinlock", &lock_class_mtx_spin },
 	{ "turnstile chain", &lock_class_mtx_spin },
 	{ "turnstile lock", &lock_class_mtx_spin },
 	{ "sched lock", &lock_class_mtx_spin },
 	{ "td_contested", &lock_class_mtx_spin },
 	{ "callout", &lock_class_mtx_spin },
 	{ "entropy harvest mutex", &lock_class_mtx_spin },
 	{ "syscons video lock", &lock_class_mtx_spin },
 #ifdef SMP
 	{ "smp rendezvous", &lock_class_mtx_spin },
 #endif
 #ifdef __powerpc__
 	{ "tlb0", &lock_class_mtx_spin },
 #endif
 	/*
 	 * leaf locks
 	 */
 	{ "intrcnt", &lock_class_mtx_spin },
 	{ "icu", &lock_class_mtx_spin },
 #if defined(SMP) && defined(__sparc64__)
 	{ "ipi", &lock_class_mtx_spin },
 #endif
 #ifdef __i386__
 	{ "allpmaps", &lock_class_mtx_spin },
 	{ "descriptor tables", &lock_class_mtx_spin },
 #endif
 	{ "clk", &lock_class_mtx_spin },
 	{ "cpuset", &lock_class_mtx_spin },
 	{ "mprof lock", &lock_class_mtx_spin },
 	{ "zombie lock", &lock_class_mtx_spin },
 	{ "ALD Queue", &lock_class_mtx_spin },
 #ifdef __ia64__
 	{ "MCA spin lock", &lock_class_mtx_spin },
 #endif
 #if defined(__i386__) || defined(__amd64__)
 	{ "pcicfg", &lock_class_mtx_spin },
 	{ "NDIS thread lock", &lock_class_mtx_spin },
 #endif
 	{ "tw_osl_io_lock", &lock_class_mtx_spin },
 	{ "tw_osl_q_lock", &lock_class_mtx_spin },
 	{ "tw_cl_io_lock", &lock_class_mtx_spin },
 	{ "tw_cl_intr_lock", &lock_class_mtx_spin },
 	{ "tw_cl_gen_lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-leaf", &lock_class_mtx_spin },
 #endif
 	{ "blocked lock", &lock_class_mtx_spin },
 	{ NULL, NULL },
 	{ NULL, NULL }
 };
 
 #ifdef BLESSING
 /*
  * Pairs of locks which have been blessed
  * Don't complain about order problems with blessed locks
  */
 static struct witness_blessed blessed_list[] = {
 };
 static int blessed_count =
 	sizeof(blessed_list) / sizeof(struct witness_blessed);
 #endif
 
 /*
  * This global is set to 0 once it becomes safe to use the witness code.
  */
 static int witness_cold = 1;
 
 /*
  * This global is set to 1 once the static lock orders have been enrolled
  * so that a warning can be issued for any spin locks enrolled later.
  */
 static int witness_spin_warn = 0;
 
 /*
  * The WITNESS-enabled diagnostic code.  Note that the witness code does
  * assume that the early boot is single-threaded at least until after this
  * routine is completed.
  *
  * Steps performed, in order:
  *  - allocate the witness array and put every entry on the free list,
  *    then retire the entry with index 0;
  *  - clear the relationship matrix, free all lock list entries and set
  *    up the hash tables;
  *  - enroll the statically defined lock orders from order_lists[];
  *  - enroll every lock that witness_init() queued on pending_locks[];
  *  - clear witness_cold so that the rest of the witness code runs.
  *
  * Giant is dropped while this runs and reacquired before returning.
  */
 static void
 witness_initialize(void *dummy __unused)
 {
 	struct lock_object *lock;
 	struct witness_order_list_entry *order;
 	struct witness *w, *w1;
 	int i;
 
 	w_data = malloc(sizeof (struct witness) * WITNESS_COUNT, M_WITNESS,
 	    M_NOWAIT | M_ZERO);
 	/*
 	 * An M_NOWAIT allocation may fail.  Stop with a clear message
 	 * rather than faulting on the first w_data[] access below.
 	 */
 	if (w_data == NULL)
 		panic("%s: unable to allocate witness data", __func__);
 
 	/*
 	 * We have to release Giant before initializing its witness
 	 * structure so that WITNESS doesn't get confused.
 	 */
 	mtx_unlock(&Giant);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
 	mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
 	    MTX_NOWITNESS | MTX_NOPROFILE);
 	/* Free in descending order; the assertion below expects index 0 first. */
 	for (i = WITNESS_COUNT - 1; i >= 0; i--) {
 		w = &w_data[i];
 		memset(w, 0, sizeof(*w));
 		w_data[i].w_index = i;	/* Witness index never changes. */
 		witness_free(w);
 	}
 	KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
 	    ("%s: Invalid list of free witness objects", __func__));
 
 	/* Witness with index 0 is not used to aid in debugging. */
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 
 	memset(w_rmatrix, 0,
 	    (sizeof(**w_rmatrix) * (WITNESS_COUNT+1) * (WITNESS_COUNT+1)));
 
 	for (i = 0; i < LOCK_CHILDCOUNT; i++)
 		witness_lock_list_free(&w_locklistdata[i]);
 	witness_init_hash_tables();
 
 	/*
 	 * First add in all the specified order lists.  Each list ends with
 	 * a NULL entry and the table ends with two of them: the inner loop
 	 * stops on a list's terminator, the outer increment steps over it.
 	 */
 	for (order = order_lists; order->w_name != NULL; order++) {
 		w = enroll(order->w_name, order->w_class);
 		if (w == NULL)
 			continue;
 		w->w_file = "order list";
 		for (order++; order->w_name != NULL; order++) {
 			w1 = enroll(order->w_name, order->w_class);
 			if (w1 == NULL)
 				continue;
 			w1->w_file = "order list";
 			itismychild(w, w1);
 			w = w1;
 		}
 	}
 	witness_spin_warn = 1;
 
 	/*
 	 * Iterate through all locks and add them to witness.  The walk is
 	 * bounded by the number of queued entries as well as by the NULL
 	 * sentinel, because a completely full pending_locks[] has no
 	 * sentinel slot left.
 	 */
 	for (i = 0; i < WITNESS_PENDLIST && (u_int)i < pending_cnt &&
 	    pending_locks[i].wh_lock != NULL; i++) {
 		lock = pending_locks[i].wh_lock;
 		KASSERT(lock->lo_flags & LO_WITNESS,
 		    ("%s: lock %s is on pending list but not LO_WITNESS",
 		    __func__, lock->lo_name));
 		lock->lo_witness = enroll(pending_locks[i].wh_type,
 		    LOCK_CLASS(lock));
 	}
 
 	/* Mark the witness code as being ready for use. */
 	witness_cold = 0;
 
 	mtx_lock(&Giant);
 }
 SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
     NULL);
 
 /*
  * Attach witness state to a newly initialized lock.
  *
  * lock: the lock object being initialized.
  * type: the lock type name under which the lock is enrolled.
  *
  * Panics if the lock requests the recursable, sleepable or upgradable
  * property while its lock class does not support it, or if the list of
  * locks deferred during early boot is already full.
  */
 void
 witness_init(struct lock_object *lock, const char *type)
 {
 	struct lock_class *class;
 
 	/* Various sanity checks. */
 	class = LOCK_CLASS(lock);
 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
 	    (class->lc_flags & LC_RECURSABLE) == 0)
 		panic("%s: lock (%s) %s can not be recursable", __func__,
 		    class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 	    (class->lc_flags & LC_SLEEPABLE) == 0)
 		panic("%s: lock (%s) %s can not be sleepable", __func__,
 		    class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
 	    (class->lc_flags & LC_UPGRADABLE) == 0)
 		panic("%s: lock (%s) %s can not be upgradable", __func__,
 		    class->lc_name, lock->lo_name);
 
 	/*
 	 * If we shouldn't watch this lock, then just clear lo_witness.
 	 * Otherwise, if witness_cold is set, then it is too early to
 	 * enroll this lock, so defer it to witness_initialize() by adding
 	 * it to the pending_locks list.  If it is not too early, then enroll
 	 * the lock now.
 	 */
 	if (witness_watch < 1 || panicstr != NULL ||
 	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
 	else if (witness_cold) {
 		/*
 		 * Check the capacity before storing, so that an overflowing
 		 * entry is never written past the end of pending_locks[].
 		 */
 		if (pending_cnt >= WITNESS_PENDLIST)
 			panic("%s: pending locks list is too small, bump it\n",
 			    __func__);
 		pending_locks[pending_cnt].wh_lock = lock;
 		pending_locks[pending_cnt++].wh_type = type;
 	} else
 		lock->lo_witness = enroll(type, class);
 }
 
 void
 witness_destroy(struct lock_object *lock)
 {
 	struct lock_class *class;
 	struct witness *w;
 
 	class = LOCK_CLASS(lock);
 
 	if (witness_cold)
 		panic("lock (%s) %s destroyed while witness_cold",
 		    class->lc_name, lock->lo_name);
 
 	/* XXX: need to verify that no one holds the lock */
 	if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
 		return;
 	w = lock->lo_witness;
 
 	mtx_lock_spin(&w_mtx);
 	MPASS(w->w_refcount > 0);
 	w->w_refcount--;
 
 	if (w->w_refcount == 0)
 		depart(w);
 	mtx_unlock_spin(&w_mtx);
 }
 
 #ifdef DDB
 /*
  * Recompute w_ddb_level for every witness on the w_all list: reset all
  * levels to -1, then start a descendant walk at level 0 from every
  * witness that has no ancestors.
  */
 static void
 witness_ddb_compute_levels(void)
 {
 	struct witness *cur;
 
 	/* Pass 1: forget any previously computed level. */
 	STAILQ_FOREACH(cur, &w_all, w_list) {
 		cur->w_ddb_level = -1;
 	}
 
 	/* Pass 2: only roots (witnesses without ancestors) seed a walk. */
 	STAILQ_FOREACH(cur, &w_all, w_list) {
 		if (cur->w_num_ancestors == 0)
 			witness_ddb_level_descendants(cur, 0);
 	}
 }
 
 /*
  * Assign level l to witness w and, recursively, l + 1 to each witness
  * for which the relationship matrix marks w as a direct parent.  A
  * witness whose level is already at least l is left alone, so each one
  * ends up with the deepest level at which it was reached.
  */
 static void
 witness_ddb_level_descendants(struct witness *w, int l)
 {
 	int child;
 
 	if (l <= w->w_ddb_level)
 		return;
 	w->w_ddb_level = l;
 
 	for (child = 1; child <= w_max_used_index; child++) {
 		if ((w_rmatrix[w->w_index][child] & WITNESS_PARENT) == 0)
 			continue;
 		witness_ddb_level_descendants(&w_data[child], l + 1);
 	}
 }
 
 /*
  * Print witness w, indented by `indent' spaces, followed by the subtree
  * of its direct children (one extra space per level).
  *
  * prnt:   printf-like output routine.
  * w:      witness to print.
  * indent: number of leading spaces for this line.
  *
  * A witness that was already printed during this dump is flagged as such
  * and its subtree is not expanded again.
  */
 static void
 witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
     struct witness *w, int indent)
 {
 	int n;
 
 	for (n = indent; n > 0; n--)
 		prnt(" ");
 	prnt("%s (type: %s, depth: %d, active refs: %d)",
 	    w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount);
 
 	if (w->w_displayed) {
 		prnt(" -- (already displayed)\n");
 		return;
 	}
 	w->w_displayed = 1;
 
 	if (w->w_file == NULL || w->w_line == 0)
 		prnt(" -- never acquired\n");
 	else
 		prnt(" -- last acquired @ %s:%d\n", w->w_file, w->w_line);
 
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (n = 1; n <= w_max_used_index; n++) {
 		if ((w_rmatrix[w->w_index][n] & WITNESS_PARENT) == 0)
 			continue;
 		witness_ddb_display_descendants(prnt, &w_data[n],
 		    indent + 1);
 	}
 }
 
 /*
  * Print the tree rooted at every witness on `list' (linked through
  * w_typelist) that has a recorded acquisition file and a level no
  * greater than zero.
  */
 static void
 witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
     struct witness_list *list)
 {
 	struct witness *root;
 
 	STAILQ_FOREACH(root, list, w_typelist) {
 		/* Witnesses below level 0 are reached through their parents. */
 		if (root->w_file != NULL && root->w_ddb_level <= 0)
 			witness_ddb_display_descendants(prnt, root, 0);
 	}
 }
 	
 /*
  * Dump the whole witness graph through the printf-like routine `prnt':
  * sleep lock trees, then spin lock trees, then the witnesses that are
  * still referenced but have never been acquired.
  */
 static void
 witness_ddb_display(int(*prnt)(const char *fmt, ...))
 {
 	struct witness *cur;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	witness_ddb_compute_levels();
 
 	/* Start with every witness marked as not yet printed. */
 	STAILQ_FOREACH(cur, &w_all, w_list) {
 		cur->w_displayed = 0;
 	}
 
 	/* Sleep locks which have been acquired at least once. */
 	prnt("Sleep locks:\n");
 	witness_ddb_display_list(prnt, &w_sleep);
 
 	/* Spin locks which have been acquired at least once. */
 	prnt("\nSpin locks:\n");
 	witness_ddb_display_list(prnt, &w_spin);
 
 	/* Whatever is still referenced but has no acquisition recorded. */
 	prnt("\nLocks which were never acquired:\n");
 	STAILQ_FOREACH(cur, &w_all, w_list) {
 		if (cur->w_file == NULL && cur->w_refcount != 0)
 			prnt("%s (type: %s, depth: %d)\n", cur->w_name,
 			    cur->w_class->lc_name, cur->w_ddb_level);
 	}
 }
 #endif /* DDB */
 
 /*
  * Strip every leading "../" component from a file name.
  *
  * Returns a pointer into the same string, past the stripped prefix, or
  * NULL when `file' is NULL.  The string itself is not modified.
  */
 static const char *
 fixup_filename(const char *file)
 {
 	static const char updir[] = "../";
 	const char *trimmed;
 
 	trimmed = file;
 	if (trimmed != NULL) {
 		while (strncmp(trimmed, updir, sizeof(updir) - 1) == 0)
 			trimmed += sizeof(updir) - 1;
 	}
 	return (trimmed);
 }
 
 int
 witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
 {
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (0);
 
 	/* Require locks that witness knows about. */
 	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
 	    lock2->lo_witness == NULL)
 		return (EINVAL);
 
 	mtx_assert(&w_mtx, MA_NOTOWNED);
 	mtx_lock_spin(&w_mtx);
 
 	/*
 	 * If we already have either an explicit or implied lock order that
 	 * is the other way around, then return an error.
 	 */
 	if (witness_watch &&
 	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
 		mtx_unlock_spin(&w_mtx);
 		return (EDOOFUS);
 	}
 	
 	/* Try to add the new order. */
 	CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 	    lock2->lo_witness->w_name, lock1->lo_witness->w_name);
 	itismychild(lock1->lo_witness, lock2->lo_witness);
 	mtx_unlock_spin(&w_mtx);
 	return (0);
 }
 
 /*
  * Check the acquisition of 'lock' against the locks already recorded for
  * the current thread (sleep locks) or the current CPU (spin locks).
  *
  * lock       the lock about to be acquired
  * flags      LOP_* flags (LOP_EXCLUSIVE, LOP_DUPOK and LOP_NEWORDER are
  *            examined here)
  * file/line  acquisition site, used only in diagnostics
  * interlock  optional lock that is ignored when looking for the previously
  *            acquired lock and the first time it is met in the list walk
  *
  * The function returns silently when no problem is found.  Problems are
  * reported with printf() followed by witness_debugger(1), or with panic()
  * for a sleep lock taken inside a critical section and for shared/exclusive
  * recursion mismatches.
  */
 void
 witness_checkorder(struct lock_object *lock, int flags, const char *file,
     int line, struct lock_object *interlock)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1, *lock2, *plock;
 	struct lock_class *class;
 	struct witness *w, *w1;
 	struct thread *td;
 	int i, j;
 
 	/* Nothing to do unless order checking is active for this lock. */
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 
 	w = lock->lo_witness;
 	class = LOCK_CLASS(lock);
 	td = curthread;
 	file = fixup_filename(file);
 
 	if (class->lc_flags & LC_SLEEPLOCK) {
 
 		/*
 		 * Since spin locks include a critical section, this check
 		 * implicitly enforces a lock order of all sleep locks before
 		 * all spin locks.
 		 */
 		if (td->td_critnest != 0 && !kdb_active)
 			panic("blockable sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name, file, line);
 
 		/*
 		 * If this is the first lock acquired then just return as
 		 * no order checking is needed.
 		 */
 		lock_list = td->td_sleeplocks;
 		if (lock_list == NULL || lock_list->ll_count == 0)
 			return;
 	} else {
 
 		/*
 		 * If this is the first lock, just return as no order
 		 * checking is needed.  Avoid problems with thread
 		 * migration pinning the thread while checking if
 		 * spinlocks are held.  If at least one spinlock is held
 		 * the thread is in a safe path and it is allowed to
 		 * unpin it.
 		 */
 		sched_pin();
 		lock_list = PCPU_GET(spinlocks);
 		if (lock_list == NULL || lock_list->ll_count == 0) {
 			sched_unpin();
 			return;
 		}
 		sched_unpin();
 	}
 
 	/*
 	 * Check to see if we are recursing on a lock we already own.  If
 	 * so, make sure that we don't mismatch exclusive and shared lock
 	 * acquires.
 	 */
 	lock1 = find_instance(lock_list, lock);
 	if (lock1 != NULL) {
 		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
 		    (flags & LOP_EXCLUSIVE) == 0) {
 			printf("shared lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name, file, line);
 			printf("while exclusively locked from %s:%d\n",
 			    lock1->li_file, lock1->li_line);
 			panic("share->excl");
 		}
 		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
 		    (flags & LOP_EXCLUSIVE) != 0) {
 			printf("exclusive lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name, file, line);
 			printf("while share locked from %s:%d\n",
 			    lock1->li_file, lock1->li_line);
 			panic("excl->share");
 		}
 		/* A matching recursion needs no order checking. */
 		return;
 	}
 
 	/*
 	 * Find the previously acquired lock, but ignore interlocks.
 	 */
 	plock = &lock_list->ll_children[lock_list->ll_count - 1];
 	if (interlock != NULL && plock->li_lock == interlock) {
 		if (lock_list->ll_count > 1)
 			plock =
 			    &lock_list->ll_children[lock_list->ll_count - 2];
 		else {
 			lle = lock_list->ll_next;
 
 			/*
 			 * The interlock is the only lock we hold, so
 			 * simply return.
 			 */
 			if (lle == NULL)
 				return;
 			plock = &lle->ll_children[lle->ll_count - 1];
 		}
 	}
 	
 	/*
 	 * Try to perform most checks without a lock.  If this succeeds we
 	 * can skip acquiring the lock and return success.
 	 */
 	w1 = plock->li_lock->lo_witness;
 	if (witness_lock_order_check(w1, w))
 		return;
 
 	/*
 	 * Check for duplicate locks of the same type.  Note that we only
 	 * have to check for this on the last lock we just acquired.  Any
 	 * other cases will be caught as lock order violations.
 	 */
 	mtx_lock_spin(&w_mtx);
 	witness_lock_order_add(w1, w);
 	if (w1 == w) {
 		i = w->w_index;
 		/*
 		 * Report a duplicate only once per witness: the diagonal
 		 * matrix entry remembers that it has already been reported.
 		 */
 		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
 		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
 		    w_rmatrix[i][i] |= WITNESS_REVERSAL;
 			w->w_reversed = 1;
 			mtx_unlock_spin(&w_mtx);
 			printf(
 			    "acquiring duplicate lock of same type: \"%s\"\n", 
 			    w->w_name);
 			printf(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
 			       plock->li_file, plock->li_line);
 			printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line);
 			witness_debugger(1);
 		    } else
 			    mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	mtx_assert(&w_mtx, MA_OWNED);
 
 	/*
 	 * If we know that the lock we are acquiring comes after
 	 * the lock we most recently acquired in the lock order tree,
 	 * then there is no need for any further checks.
 	 */
 	if (isitmychild(w1, w))
 		goto out;
 
 	/*
 	 * Walk every held lock, newest first; 'j' counts the locks visited
 	 * across all list entries.
 	 */
 	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
 		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
 
 			MPASS(j < WITNESS_COUNT);
 			lock1 = &lle->ll_children[i];
 
 			/*
 			 * Ignore the interlock the first time we see it.
 			 */
 			if (interlock != NULL && interlock == lock1->li_lock) {
 				interlock = NULL;
 				continue;
 			}
 
 			/*
 			 * If this lock doesn't undergo witness checking,
 			 * then skip it.
 			 */
 			w1 = lock1->li_lock->lo_witness;
 			if (w1 == NULL) {
 				KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
 				    ("lock missing witness structure"));
 				continue;
 			}
 
 			/*
 			 * If we are locking Giant and this is a sleepable
 			 * lock, then skip it.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * is Giant, then skip it.
 			 */
 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * isn't sleepable, we want to treat it as a lock
 			 * order violation to enforce a general lock order of
 			 * sleepable locks before non-sleepable locks.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				goto reversal;
 
 			/*
 			 * If we are locking Giant and this is a non-sleepable
 			 * lock, then treat it as a reversal.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
 			    lock == &Giant.lock_object)
 				goto reversal;
 
 			/*
 			 * Check the lock order hierarchy for a reversal.
 			 */
 			if (!isitmydescendant(w, w1))
 				continue;
 		reversal:
 
 			/*
 			 * We have a lock order violation, check to see if it
 			 * is allowed or has already been yelled about.
 			 */
 #ifdef BLESSING
 
 			/*
 			 * If the lock order is blessed, just bail.  We don't
 			 * look for other lock order violations though, which
 			 * may be a bug.
 			 */
 			if (blessed(w, w1))
 				goto out;
 #endif
 
 			/* Bail if this violation is known */
 			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
 				goto out;
 
 			/* Record this as a violation */
 			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
 			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
 			w->w_reversed = w1->w_reversed = 1;
 			witness_increment_graph_generation();
 			/* Drop w_mtx before printing the report. */
 			mtx_unlock_spin(&w_mtx);
 			
 			/*
 			 * Ok, yell about it.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				printf(
 		"lock order reversal: (sleepable after non-sleepable)\n");
 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
 			    && lock == &Giant.lock_object)
 				printf(
 		"lock order reversal: (Giant after non-sleepable)\n");
 			else
 				printf("lock order reversal:\n");
 
 			/*
 			 * Try to locate an earlier lock with
 			 * witness w in our list.  The search continues from
 			 * the current position (lle, i) towards older
 			 * entries; i ends up negative when none is found.
 			 */
 			do {
 				lock2 = &lle->ll_children[i];
 				MPASS(lock2->li_lock != NULL);
 				if (lock2->li_lock->lo_witness == w)
 					break;
 				if (i == 0 && lle->ll_next != NULL) {
 					lle = lle->ll_next;
 					i = lle->ll_count - 1;
 					MPASS(i >= 0 && i < LOCK_NCHILDREN);
 				} else
 					i--;
 			} while (i >= 0);
 			if (i < 0) {
 				/* No earlier lock of this type: two-line report. */
 				printf(" 1st %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, lock1->li_file, lock1->li_line);
 				printf(" 2nd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name, file, line);
 			} else {
 				/* Include the earlier lock of the same type. */
 				printf(" 1st %p %s (%s) @ %s:%d\n",
 				    lock2->li_lock, lock2->li_lock->lo_name,
 				    lock2->li_lock->lo_witness->w_name,
 				    lock2->li_file, lock2->li_line);
 				printf(" 2nd %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, lock1->li_file, lock1->li_line);
 				printf(" 3rd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name, file, line);
 			}
 			witness_debugger(1);
 			return;
 		}
 	}
 
 	/*
 	 * If requested, build a new lock order.  However, don't build a new
 	 * relationship between a sleepable lock and Giant if it is in the
 	 * wrong direction.  The correct lock order is that sleepable locks
 	 * always come before Giant.
 	 */
 	if (flags & LOP_NEWORDER &&
 	    !(plock->li_lock == &Giant.lock_object &&
 	    (lock->lo_flags & LO_SLEEPABLE) != 0)) {
 		CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 		    w->w_name, plock->li_lock->lo_witness->w_name);
 		itismychild(plock->li_lock->lo_witness, w);
 	}
 out:
 	/* Common exit for every path that still holds w_mtx. */
 	mtx_unlock_spin(&w_mtx);
 }
 
 void
 witness_lock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct witness *w;
 	struct thread *td;
 
 	if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 	w = lock->lo_witness;
 	td = curthread;
 	file = fixup_filename(file);
 
 	/* Determine lock list for this lock. */
 	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 
 	/* Check to see if we are recursing on a lock we already own. */
 	instance = find_instance(*lock_list, lock);
 	if (instance != NULL) {
 		instance->li_flags++;
 		CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, lock->lo_name,
 		    instance->li_flags & LI_RECURSEMASK);
 		instance->li_file = file;
 		instance->li_line = line;
 		return;
 	}
 
 	/* Update per-witness last file and line acquire. */
 	w->w_file = file;
 	w->w_line = line;
 
 	/* Find the next open lock instance in the list and fill it. */
 	lle = *lock_list;
 	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
 		lle = witness_lock_list_get();
 		if (lle == NULL)
 			return;
 		lle->ll_next = *lock_list;
 		CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		*lock_list = lle;
 	}
 	instance = &lle->ll_children[lle->ll_count++];
 	instance->li_lock = lock;
 	instance->li_line = line;
 	instance->li_file = file;
 	if ((flags & LOP_EXCLUSIVE) != 0)
 		instance->li_flags = LI_EXCLUSIVE;
 	else
 		instance->li_flags = 0;
 	CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
 	    td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
 }
 
 /*
  * Mark the current thread's instance of 'lock' as exclusively held.
  * Panics when the lock is not recorded as held by the thread and, while
  * witness_watch is non-zero, when the lock is not upgradable, is not a
  * sleep lock, is already exclusive, or is recursed.
  */
 void
 witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_instance *held;
 	struct lock_class *lc;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	lc = LOCK_CLASS(lock);
 	file = fixup_filename(file);
 
 	/* Static properties of the lock. */
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0) {
 			panic("upgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    lc->lc_name, lock->lo_name, file, line);
 		}
 		if ((lc->lc_flags & LC_SLEEPLOCK) == 0) {
 			panic("upgrade of non-sleep lock (%s) %s @ %s:%d",
 			    lc->lc_name, lock->lo_name, file, line);
 		}
 	}
 
 	held = find_instance(curthread->td_sleeplocks, lock);
 	if (held == NULL) {
 		panic("upgrade of unlocked lock (%s) %s @ %s:%d",
 		    lc->lc_name, lock->lo_name, file, line);
 	}
 
 	/* State of the held instance. */
 	if (witness_watch) {
 		if ((held->li_flags & LI_EXCLUSIVE) != 0) {
 			panic("upgrade of exclusive lock (%s) %s @ %s:%d",
 			    lc->lc_name, lock->lo_name, file, line);
 		}
 		if ((held->li_flags & LI_RECURSEMASK) != 0) {
 			panic("upgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    lc->lc_name, lock->lo_name,
 			    held->li_flags & LI_RECURSEMASK, file, line);
 		}
 	}
 	held->li_flags |= LI_EXCLUSIVE;
 }
 
 /*
  * Mark the current thread's instance of 'lock' as shared.  Panics when the
  * lock is not recorded as held by the thread and, while witness_watch is
  * non-zero, when the lock is not upgradable, is not a sleep lock, is not
  * held exclusively, or is recursed.
  */
 void
 witness_downgrade(struct lock_object *lock, int flags, const char *file,
     int line)
 {
 	struct lock_instance *held;
 	struct lock_class *lc;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	lc = LOCK_CLASS(lock);
 	file = fixup_filename(file);
 
 	/* Static properties of the lock. */
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0) {
 			panic("downgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    lc->lc_name, lock->lo_name, file, line);
 		}
 		if ((lc->lc_flags & LC_SLEEPLOCK) == 0) {
 			panic("downgrade of non-sleep lock (%s) %s @ %s:%d",
 			    lc->lc_name, lock->lo_name, file, line);
 		}
 	}
 
 	held = find_instance(curthread->td_sleeplocks, lock);
 	if (held == NULL) {
 		panic("downgrade of unlocked lock (%s) %s @ %s:%d",
 		    lc->lc_name, lock->lo_name, file, line);
 	}
 
 	/* State of the held instance. */
 	if (witness_watch) {
 		if ((held->li_flags & LI_EXCLUSIVE) == 0) {
 			panic("downgrade of shared lock (%s) %s @ %s:%d",
 			    lc->lc_name, lock->lo_name, file, line);
 		}
 		if ((held->li_flags & LI_RECURSEMASK) != 0) {
 			panic("downgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    lc->lc_name, lock->lo_name,
 			    held->li_flags & LI_RECURSEMASK, file, line);
 		}
 	}
 	held->li_flags &= ~LI_EXCLUSIVE;
 }
 
 /*
  * Record the release of 'lock' at file:line.
  *
  * The matching lock instance is looked up in the current thread's sleep
  * lock list or the current CPU's spin lock list.  A recursed instance only
  * has its recursion count decremented; otherwise the instance is removed
  * from its list entry and an emptied entry is returned to the free pool,
  * except for a lone head entry which is kept.
  *
  * While witness_watch > 0 this panics when the lock is not found, when the
  * shared/exclusive mode of the unlock does not match the recorded mode, or
  * when the instance is marked LI_NORELEASE.
  */
 void
 witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct lock_class *class;
 	struct thread *td;
 	register_t s;
 	int i, j;
 
 	if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
 		return;
 	td = curthread;
 	class = LOCK_CLASS(lock);
 	file = fixup_filename(file);
 
 	/* Find lock instance associated with this lock. */
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 	/* Remember the head entry; lock_list itself advances in the search. */
 	lle = *lock_list;
 	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
 		for (i = 0; i < (*lock_list)->ll_count; i++) {
 			instance = &(*lock_list)->ll_children[i];
 			if (instance->li_lock == lock)
 				goto found;
 		}
 
 	/*
 	 * When disabling WITNESS through witness_watch we could end up in
 	 * having registered locks in the td_sleeplocks queue.
 	 * We have to make sure we flush these queues, so just search for
 	 * eventual register locks and remove them.
 	 */
 	if (witness_watch > 0)
 		panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
 		    lock->lo_name, file, line);
 	else
 		return;
 found:
 
 	/* First, check for shared/exclusive mismatches. */
 	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) == 0) {
 		printf("shared unlock of (%s) %s @ %s:%d\n", class->lc_name,
 		    lock->lo_name, file, line);
 		printf("while exclusively locked from %s:%d\n",
 		    instance->li_file, instance->li_line);
 		panic("excl->ushare");
 	}
 	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) != 0) {
 		printf("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name,
 		    lock->lo_name, file, line);
 		printf("while share locked from %s:%d\n", instance->li_file,
 		    instance->li_line);
 		panic("share->uexcl");
 	}
 	/* If we are recursed, unrecurse. */
 	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
 		CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, instance->li_lock->lo_name,
 		    instance->li_flags);
 		instance->li_flags--;
 		return;
 	}
 	/* The lock is now being dropped, check for NORELEASE flag */
 	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
 		printf("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name,
 		    lock->lo_name, file, line);
 		panic("lock marked norelease");
 	}
 
 	/*
 	 * Otherwise, remove this item from the list.  Later instances in the
 	 * same entry are shifted down one slot; the compaction runs with
 	 * interrupts disabled.
 	 */
 	s = intr_disable();
 	CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
 	    td->td_proc->p_pid, instance->li_lock->lo_name,
 	    (*lock_list)->ll_count - 1);
 	for (j = i; j < (*lock_list)->ll_count - 1; j++)
 		(*lock_list)->ll_children[j] =
 		    (*lock_list)->ll_children[j + 1];
 	(*lock_list)->ll_count--;
 	intr_restore(s);
 
 	/*
 	 * In order to reduce contention on w_mtx, we always want to keep a
 	 * head object in the lists so that frequent allocation from the
 	 * free witness pool (and subsequent locking) is avoided.
 	 * In order to keep the current code simple, when the head
 	 * object is totally unloaded it means also that we do not have
 	 * further objects in the list, so the list ownership needs to be
 	 * handed over to another object if the current head needs to be freed.
 	 */
 	if ((*lock_list)->ll_count == 0) {
 		if (*lock_list == lle) {
 			/* The emptied entry is the head: keep it if alone. */
 			if (lle->ll_next == NULL)
 				return;
 		} else
 			lle = *lock_list;
 		/* Unlink the emptied entry and return it to the free list. */
 		*lock_list = lle->ll_next;
 		CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		witness_lock_list_free(lle);
 	}
 }
 
 /*
  * Called for an exiting thread.  If the thread's sleep lock list still
  * records held locks they are printed and the system panics; otherwise the
  * (empty) head list entry is returned to the free pool.
  */
 void
 witness_thread_exit(struct thread *td)
 {
 	struct lock_list_entry *head, *lle;
 	int banner_done, i;
 
 	head = td->td_sleeplocks;
 	if (head == NULL || panicstr != NULL)
 		return;
 
 	/* Normal case: nothing held, just recycle the head entry. */
 	if (head->ll_count == 0) {
 		witness_lock_list_free(head);
 		return;
 	}
 
 	/* List every lock still held, newest first within each entry. */
 	banner_done = 0;
 	for (lle = head; lle != NULL; lle = lle->ll_next) {
 		i = lle->ll_count;
 		while (i-- > 0) {
 			if (!banner_done) {
 				printf(
 		"Thread %p exiting with the following locks held:\n", td);
 				banner_done = 1;
 			}
 			witness_list_lock(&lle->ll_children[i], printf);
 		}
 	}
 	panic("Thread %p cannot exit while holding sleeplocks\n", td);
 }
 
 /*
  * Complain if the current thread holds any lock other than 'lock'.
  * WARN_GIANTOK exempts Giant and WARN_SLEEPOK exempts sleepable locks from
  * the sleep lock check.  When an offending lock is found, the caller's
  * printf-style message is printed followed by the list of locks.  Returns
  * the number of offending locks; with WARN_PANIC set, a non-zero count
  * results in a panic instead.
  */
 int
 witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
 {
 	struct lock_list_entry *spin_list, *lle;
 	struct lock_instance *held;
 	struct thread *td;
 	va_list ap;
 	int have_spin, i, n;
 
 	if (witness_cold || witness_watch < 1 || panicstr != NULL)
 		return (0);
 	td = curthread;
 	n = 0;
 
 	/* Pass one: sleep locks recorded on the current thread. */
 	for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) {
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			held = &lle->ll_children[i];
 			if (held->li_lock == lock)
 				continue;
 			if ((flags & WARN_GIANTOK) != 0 &&
 			    held->li_lock == &Giant.lock_object)
 				continue;
 			if ((flags & WARN_SLEEPOK) != 0 &&
 			    (held->li_lock->lo_flags & LO_SLEEPABLE) != 0)
 				continue;
 
 			/* The caller's message precedes the first offender. */
 			if (n == 0) {
 				va_start(ap, fmt);
 				vprintf(fmt, ap);
 				va_end(ap);
 				printf(" with the following");
 				if (flags & WARN_SLEEPOK)
 					printf(" non-sleepable");
 				printf(" locks held:\n");
 			}
 			n++;
 			witness_list_lock(held, printf);
 		}
 	}
 
 	/*
 	 * Pass two: spin locks recorded on the current CPU.  The thread is
 	 * pinned only while the per-CPU list is fetched and tested, to avoid
 	 * problems with thread migration.
 	 */
 	sched_pin();
 	spin_list = PCPU_GET(spinlocks);
 	have_spin = (spin_list != NULL && spin_list->ll_count != 0);
 	sched_unpin();
 
 	if (have_spin) {
 		/*
 		 * A single spin lock that is exactly 'lock', with no sleep
 		 * lock offenders, is the one the caller is expected to hold.
 		 */
 		held = &spin_list->ll_children[spin_list->ll_count - 1];
 		if (spin_list->ll_count == 1 && spin_list->ll_next == NULL &&
 		    held->li_lock == lock && n == 0)
 			return (0);
 
 		va_start(ap, fmt);
 		vprintf(fmt, ap);
 		va_end(ap);
 		printf(" with the following");
 		if (flags & WARN_SLEEPOK)
 			printf(" non-sleepable");
 		printf(" locks held:\n");
 		n += witness_list_locks(&spin_list, printf);
 	}
 
 	if ((flags & WARN_PANIC) != 0 && n != 0)
 		panic("%s", __func__);
 	else
 		witness_debugger(n);
 	return (n);
 }
 
 /*
  * Return the file name last recorded on the lock's witness, or "?" when
  * witness is cold, not watching, or the lock has no witness.
  */
 const char *
 witness_file(struct lock_object *lock)
 {
 
 	if (!witness_cold && witness_watch >= 1 && lock->lo_witness != NULL)
 		return (lock->lo_witness->w_file);
 	return ("?");
 }
 
 /*
  * Return the line number last recorded on the lock's witness, or 0 when
  * witness is cold, not watching, or the lock has no witness.
  */
 int
 witness_line(struct lock_object *lock)
 {
 
 	if (!witness_cold && witness_watch >= 1 && lock->lo_witness != NULL)
 		return (lock->lo_witness->w_line);
 	return (0);
 }
 
 /*
  * Look up, or create, the witness named 'description' for a lock of class
  * 'lock_class' and take a reference on it.
  *
  * Returns the witness, or NULL when witness is disabled, the system has
  * panicked, the lock is a spin lock and witness_skipspin is set, or no free
  * witness is available.  Panics if the class is neither a spin nor a sleep
  * class, or if an existing witness of that name belongs to another class.
  */
 static struct witness *
 enroll(const char *description, struct lock_class *lock_class)
 {
 	struct witness *w;
 	struct witness_list *typelist;
 
 	MPASS(description != NULL);
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (NULL);
 	if ((lock_class->lc_flags & LC_SPINLOCK)) {
 		if (witness_skipspin)
 			return (NULL);
 		typelist = &w_spin;
 	} else if ((lock_class->lc_flags & LC_SLEEPLOCK))
 		typelist = &w_sleep;
 	else
 		panic("lock class %s is not sleep or spin",
 		    lock_class->lc_name);
 
 	mtx_lock_spin(&w_mtx);
 	w = witness_hash_get(description);
 	if (w)
 		goto found;
 	/* witness_get() drops w_mtx itself when it fails. */
 	if ((w = witness_get()) == NULL)
 		return (NULL);
 	MPASS(strlen(description) < MAX_W_NAME);
 	strcpy(w->w_name, description);
 	w->w_class = lock_class;
 	w->w_refcount = 1;
 	STAILQ_INSERT_HEAD(&w_all, w, w_list);
 
 	/* typelist was selected from the lock class above. */
 	STAILQ_INSERT_HEAD(typelist, w, w_typelist);
 	if (typelist == &w_spin)
 		w_spin_cnt++;
 	else
 		w_sleep_cnt++;
 
 	/* Insert new witness into the hash */
 	witness_hash_put(w);
 	witness_increment_graph_generation();
 	mtx_unlock_spin(&w_mtx);
 	return (w);
 found:
 	w->w_refcount++;
 	mtx_unlock_spin(&w_mtx);
 	/*
 	 * Report as "(class) name", matching the other diagnostics in this
 	 * file.
 	 */
 	if (lock_class != w->w_class)
 		panic(
 			"lock (%s) %s does not match earlier (%s) lock",
 			lock_class->lc_name, description,
 			w->w_class->lc_name);
 	return (w);
 }
 
 /*
  * Retire a witness whose reference count has dropped to zero: adjust the
  * per-type counter, forget its last acquisition site and bump the graph
  * generation.
  */
 static void
 depart(struct witness *w)
 {
 
 	MPASS(w->w_refcount == 0);
 	if (w->w_class->lc_flags & LC_SLEEPLOCK)
 		w_sleep_cnt--;
 	else
 		w_spin_cnt--;
 
 	/*
 	 * Set file to NULL as it may point into a loadable module.
 	 */
 	w->w_file = NULL;
 	w->w_line = 0;
 	witness_increment_graph_generation();
 }
 
 
 /*
  * Record 'child' as a direct child of 'parent' in w_rmatrix and propagate
  * the resulting indirect ancestor/descendant relationships.
  *
  * Does nothing when the direct relationship is already recorded.  If the
  * propagation marks a pair as both ancestor and descendant, the paradox is
  * reported and witness is disabled by setting witness_watch to -1.
  */
 static void
 adopt(struct witness *parent, struct witness *child)
 {
 	int pi, ci, i, j;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	/* If the relationship is already known, there's no work to be done. */
 	if (isitmychild(parent, child))
 		return;
 
 	/* When the structure of the graph changes, bump up the generation. */
 	witness_increment_graph_generation();
 
 	/*
 	 * The hard part ... create the direct relationship, then propagate all
 	 * indirect relationships.
 	 */
 	pi = parent->w_index;
 	ci = child->w_index;
 	WITNESS_INDEX_ASSERT(pi);
 	WITNESS_INDEX_ASSERT(ci);
 	MPASS(pi != ci);
 	w_rmatrix[pi][ci] |= WITNESS_PARENT;
 	w_rmatrix[ci][pi] |= WITNESS_CHILD;
 
 	/*
 	 * If parent was not already an ancestor of child,
 	 * then we increment the descendant and ancestor counters.
 	 */
 	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
 		parent->w_num_descendants++;
 		child->w_num_ancestors++;
 	}
 
 	/*
 	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
 	 * an ancestor of 'pi' during this loop.
 	 */
 	for (i = 1; i <= w_max_used_index; i++) {
 		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && 
 		    (i != pi))
 			continue;
 
 		/* Find each descendant of 'i' and mark it as a descendant. */
 		for (j = 1; j <= w_max_used_index; j++) {
 
 			/*
 			 * Skip children that are already marked as
 			 * descendants of 'i'.
 			 */
 			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
 				continue;
 
 			/*
 			 * We are only interested in descendants of 'ci'. Note
 			 * that 'ci' itself is counted as a descendant of 'ci'.
 			 */
 			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && 
 			    (j != ci))
 				continue;
 			/* Record the new indirect relationship both ways. */
 			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
 			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
 			w_data[i].w_num_descendants++;
 			w_data[j].w_num_ancestors++;
 
 			/*
 			 * Make sure we aren't marking a node as both an
 			 * ancestor and descendant. We should have caught
 			 * this as a lock order reversal earlier.
 			 */
 			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    i, j, w_rmatrix[i][j]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 			/* Same consistency check on the mirrored entry. */
 			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    j, i, w_rmatrix[j][i]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 		}
 	}
 }
 
 /*
  * Make 'child' a child of 'parent' via adopt().  Both witnesses must have
  * the same lock type; a mismatch releases w_mtx (when witness is no longer
  * cold) and panics.
  */
 static void
 itismychild(struct witness *parent, struct witness *child)
 {
 
 	MPASS(child != NULL && parent != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (witness_lock_type_equal(parent, child)) {
 		adopt(parent, child);
 		return;
 	}
 
 	/* Mismatched lock types are fatal. */
 	if (witness_cold == 0)
 		mtx_unlock_spin(&w_mtx);
 	panic("%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
 	    "the same lock type", __func__, parent->w_name,
 	    parent->w_class->lc_name, child->w_name,
 	    child->w_class->lc_name);
 }
 
 /*
  * Shared implementation of the isitmy*() helpers.  'rmask' selects the
  * relationship bits of w1 towards w2 that the caller is asking about and
  * 'fname' names the caller for diagnostics.  The return value is non-zero
  * when any of those bits is set.
  *
  * As a sanity check, the two mirrored matrix entries must be inverses of
  * each other; if not, the mismatch is reported and witness is disabled.
  */
 static int
 _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
 {
 	unsigned char rel12, rel21;
 	int idx1, idx2, consistent;
 
 	idx1 = w1->w_index;
 	idx2 = w2->w_index;
 	WITNESS_INDEX_ASSERT(idx1);
 	WITNESS_INDEX_ASSERT(idx2);
 	rel12 = w_rmatrix[idx1][idx2] & WITNESS_RELATED_MASK;
 	rel21 = w_rmatrix[idx2][idx1] & WITNESS_RELATED_MASK;
 
 	/* The flags on one better be the inverse of the flags on the other */
 	consistent =
 	    (WITNESS_ATOD(rel12) == rel21 && WITNESS_DTOA(rel21) == rel12) ||
 	    (WITNESS_DTOA(rel12) == rel21 && WITNESS_ATOD(rel21) == rel12);
 	if (!consistent) {
 		printf("%s: rmatrix mismatch between %s (index %d) and %s "
 		    "(index %d): w_rmatrix[%d][%d] == %hhx but "
 		    "w_rmatrix[%d][%d] == %hhx\n",
 		    fname, w1->w_name, idx1, w2->w_name, idx2, idx1, idx2,
 		    rel12, idx2, idx1, rel21);
 		kdb_backtrace();
 		printf("Witness disabled.\n");
 		witness_watch = -1;
 	}
 	return (rel12 & rmask);
 }
 
 /*
  * Returns non-zero when @child is recorded as a direct child of @parent.
  */
 static int
 isitmychild(struct witness *parent, struct witness *child)
 {
 	int related;
 
 	related = _isitmyx(parent, child, WITNESS_PARENT, __func__);
 	return (related);
 }
 
 /*
  * Returns non-zero when @descendant is a direct or indirect descendant of
  * @ancestor.
  */
 static int
 isitmydescendant(struct witness *ancestor, struct witness *descendant)
 {
 	int related;
 
 	related = _isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
 	    __func__);
 	return (related);
 }
 
 #ifdef BLESSING
 /*
  * Returns 1 when the pair of witness names (w1, w2) appears in blessed_list
  * in either order, 0 otherwise.
  */
 static int
 blessed(struct witness *w1, struct witness *w2)
 {
 	struct witness_blessed *pair;
 	int idx;
 
 	for (idx = 0; idx < blessed_count; idx++) {
 		pair = &blessed_list[idx];
 
 		/* Listed as (w1, w2). */
 		if (strcmp(w1->w_name, pair->b_lock1) == 0 &&
 		    strcmp(w2->w_name, pair->b_lock2) == 0)
 			return (1);
 
 		/* Listed as (w2, w1). */
 		if (strcmp(w1->w_name, pair->b_lock2) == 0 &&
 		    strcmp(w2->w_name, pair->b_lock1) == 0)
 			return (1);
 	}
 	return (0);
 }
 #endif
 
 /*
  * Take a witness from the free list, zero it while keeping its index, and
  * return it.  On failure (witness disabled or free list empty) w_mtx is
  * released and NULL is returned; an empty free list also disables witness.
  */
 static struct witness *
 witness_get(void)
 {
 	struct witness *fresh;
 	int slot;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (witness_watch == -1) {
 		mtx_unlock_spin(&w_mtx);
 		return (NULL);
 	}
 
 	fresh = STAILQ_FIRST(&w_free);
 	if (fresh == NULL) {
 		/* Out of witnesses: turn witness off for good. */
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("WITNESS: unable to allocate a new witness object\n");
 		return (NULL);
 	}
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 
 	/* The index survives the wipe of the structure. */
 	slot = fresh->w_index;
 	MPASS(slot > 0 && slot == w_max_used_index+1 &&
 	    slot < WITNESS_COUNT);
 	bzero(fresh, sizeof(*fresh));
 	fresh->w_index = slot;
 	if (w_max_used_index < slot)
 		w_max_used_index = slot;
 	return (fresh);
 }
 
 /*
  * Put a witness back at the head of the free list and count it as free.
  */
 static void
 witness_free(struct witness *w)
 {
 
 	w_free_cnt++;
 	STAILQ_INSERT_HEAD(&w_free, w, w_list);
 }
 
 /*
  * Pop a lock list entry off the free list (under w_mtx) and return it
  * zeroed.  Returns NULL when witness is disabled; an empty free list
  * disables witness, prints a message and also returns NULL.
  */
 static struct lock_list_entry *
 witness_lock_list_get(void)
 {
 	struct lock_list_entry *entry;
 
 	if (witness_watch == -1)
 		return (NULL);
 
 	mtx_lock_spin(&w_mtx);
 	entry = w_lock_list_free;
 	if (entry != NULL) {
 		w_lock_list_free = entry->ll_next;
 		mtx_unlock_spin(&w_mtx);
 		bzero(entry, sizeof(*entry));
 		return (entry);
 	}
 
 	/* Nothing left: turn witness off. */
 	witness_watch = -1;
 	mtx_unlock_spin(&w_mtx);
 	printf("%s: witness exhausted\n", __func__);
 	return (NULL);
 }
 		
 /*
  * Push a lock list entry onto the head of the free list, under w_mtx.
  */
 static void
 witness_lock_list_free(struct lock_list_entry *lle)
 {
 
 	mtx_lock_spin(&w_mtx);
 	{
 		lle->ll_next = w_lock_list_free;
 		w_lock_list_free = lle;
 	}
 	mtx_unlock_spin(&w_mtx);
 }
 
 /*
  * Search a chain of lock list entries for the instance that refers to
  * 'lock'.  Each entry is scanned from its last slot down to its first.
  * Returns the instance, or NULL when the lock is not in the chain.
  */
 static struct lock_instance *
 find_instance(struct lock_list_entry *list, struct lock_object *lock)
 {
 	struct lock_list_entry *entry;
 	int slot;
 
 	for (entry = list; entry != NULL; entry = entry->ll_next) {
 		slot = entry->ll_count;
 		while (--slot >= 0) {
 			if (entry->ll_children[slot].li_lock == lock)
 				return (&entry->ll_children[slot]);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Print a one-line description of a held lock instance through 'prnt':
  * shared/exclusive mode, lock class, lock name, the witness name when it
  * differs from the lock name, recursion count, lock address and the
  * recorded acquisition site.
  */
 static void
 witness_list_lock(struct lock_instance *instance,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_object *lock;
 
 	lock = instance->li_lock;
 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	/*
 	 * The witness keeps its own copy of the name, so the names must be
 	 * compared as strings; a pointer comparison would always differ.
 	 */
 	if (strcmp(lock->lo_witness->w_name, lock->lo_name) != 0)
 		prnt(" (%s)", lock->lo_witness->w_name);
 	prnt(" r = %d (%p) locked @ %s:%d\n",
 	    instance->li_flags & LI_RECURSEMASK, lock, instance->li_file,
 	    instance->li_line);
 }
 
 #ifdef DDB
 /*
  * Returns 1 when the thread's sleep lock list exists and its head entry
  * records at least one lock, 0 otherwise.
  */
 static int
 witness_thread_has_locks(struct thread *td)
 {
 
 	return (td->td_sleeplocks != NULL &&
 	    td->td_sleeplocks->ll_count != 0);
 }
 
 /*
  * Returns 1 when any thread of the process has recorded sleep locks,
  * 0 otherwise.
  */
 static int
 witness_proc_has_locks(struct proc *p)
 {
 	struct thread *td;
 	int found;
 
 	found = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (witness_thread_has_locks(td)) {
 			found = 1;
 			break;
 		}
 	}
 	return (found);
 }
 #endif
 
 /*
  * Print every lock instance in the chain starting at *lock_list through
  * 'prnt', scanning each entry from its last slot to its first.  Returns the
  * number of instances printed.
  */
 int
 witness_list_locks(struct lock_list_entry **lock_list,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_list_entry *entry;
 	int slot, count;
 
 	count = 0;
 	for (entry = *lock_list; entry != NULL; entry = entry->ll_next) {
 		slot = entry->ll_count;
 		while (slot-- > 0) {
 			witness_list_lock(&entry->ll_children[slot], prnt);
 			count++;
 		}
 	}
 	return (count);
 }
 
 /*
  * This is a bit risky at best.  It is meant for the case where acquiring a
  * spin lock has timed out and another CPU is assumed to be stuck holding
  * it.  The owner's per-CPU spin lock list is searched for the instance of
  * 'lock' so that its last acquisition can be printed.  Nothing is printed
  * when the owner is not in a critical section, is not on a CPU, or the
  * lock is not found.
  */
 void
 witness_display_spinlock(struct lock_object *lock, struct thread *owner,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_instance *held;
 	struct pcpu *owner_pc;
 
 	if (owner->td_critnest != 0 && owner->td_oncpu != NOCPU) {
 		owner_pc = pcpu_find(owner->td_oncpu);
 		held = find_instance(owner_pc->pc_spinlocks, lock);
 		if (held != NULL)
 			witness_list_lock(held, prnt);
 	}
 }
 
 /*
  * Copy the acquisition site recorded for the held instance of 'lock' into
  * *filep and *linep.  Nothing is stored when witness is disabled, the lock
  * has no witness, the system has panicked, or the lock is a spin lock and
  * witness_skipspin is set.  Panics if the lock is not recorded as held.
  */
 void
 witness_save(struct lock_object *lock, const char **filep, int *linep)
 {
 	struct lock_list_entry *held_list;
 	struct lock_instance *held;
 	struct lock_class *lc;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 
 	/* Pick the list the lock would have been recorded in. */
 	lc = LOCK_CLASS(lock);
 	if ((lc->lc_flags & LC_SLEEPLOCK) != 0)
 		held_list = curthread->td_sleeplocks;
 	else if (witness_skipspin)
 		return;
 	else
 		held_list = PCPU_GET(spinlocks);
 
 	held = find_instance(held_list, lock);
 	if (held == NULL) {
 		panic("%s: lock (%s) %s not locked", __func__,
 		    lc->lc_name, lock->lo_name);
 	}
 	*filep = held->li_file;
 	*linep = held->li_line;
 }
 
 /*
  * Write "file"/"line" into both the lock's witness and the held instance
  * of "lock".  The instance is looked up the same way as in witness_save():
  * curthread's sleep-lock list for sleep locks, this CPU's spin lock list
  * otherwise.  Returns early under the same conditions and panics if the
  * lock is not found on the relevant list.
  */
 void
 witness_restore(struct lock_object *lock, const char *file, int line)
 {
 	struct lock_instance *held;
 	struct lock_class *lc;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	lc = LOCK_CLASS(lock);
 	if ((lc->lc_flags & LC_SLEEPLOCK) != 0)
 		held = find_instance(curthread->td_sleeplocks, lock);
 	else if (witness_skipspin)
 		return;
 	else
 		held = find_instance(PCPU_GET(spinlocks), lock);
 	if (held == NULL)
 		panic("%s: lock (%s) %s not locked", __func__,
 		    lc->lc_name, lock->lo_name);
 
 	/* Update the witness first, then the per-acquisition record. */
 	lock->lo_witness->w_file = file;
 	lock->lo_witness->w_line = line;
 	held->li_file = file;
 	held->li_line = line;
 }
 
 /*
  * Check that the recorded state of "lock" matches the LA_* assertion in
  * "flags" and panic, naming the caller's file/line, when it does not.
  * Sleep locks are looked up on curthread's sleep-lock list and spin locks
  * on this CPU's spin lock list.  The whole body is compiled only when
  * INVARIANT_SUPPORT is defined; otherwise the function does nothing.
  */
 void
 witness_assert(struct lock_object *lock, int flags, const char *file, int line)
 {
 #ifdef INVARIANT_SUPPORT
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * Nothing to check for a lock without a witness, when witness_watch
 	 * is below 1, or once a panic is in progress.
 	 */
 	if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
 		instance = find_instance(curthread->td_sleeplocks, lock);
 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
 		instance = find_instance(PCPU_GET(spinlocks), lock);
 	else {
 		panic("Lock (%s) %s is not sleep or spin!",
 		    class->lc_name, lock->lo_name);
 	}
 	file = fixup_filename(file);
 	switch (flags) {
 	case LA_UNLOCKED:
 		/* The lock must not appear on the list at all. */
 		if (instance != NULL)
 			panic("Lock (%s) %s locked @ %s:%d.",
 			    class->lc_name, lock->lo_name, file, line);
 		break;
 	case LA_LOCKED:
 	case LA_LOCKED | LA_RECURSED:
 	case LA_LOCKED | LA_NOTRECURSED:
 	case LA_SLOCKED:
 	case LA_SLOCKED | LA_RECURSED:
 	case LA_SLOCKED | LA_NOTRECURSED:
 	case LA_XLOCKED:
 	case LA_XLOCKED | LA_RECURSED:
 	case LA_XLOCKED | LA_NOTRECURSED:
 		/*
 		 * The lock must be held; each additional flag bit is then
 		 * checked against the instance's li_flags.
 		 */
 		if (instance == NULL) {
 			panic("Lock (%s) %s not locked @ %s:%d.",
 			    class->lc_name, lock->lo_name, file, line);
 			break;
 		}
 		if ((flags & LA_XLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
 			panic("Lock (%s) %s not exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name, file, line);
 		if ((flags & LA_SLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
 			panic("Lock (%s) %s exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name, file, line);
 		if ((flags & LA_RECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) == 0)
 			panic("Lock (%s) %s not recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name, file, line);
 		if ((flags & LA_NOTRECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) != 0)
 			panic("Lock (%s) %s recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name, file, line);
 		break;
 	default:
 		/* Any other flag combination is rejected outright. */
 		panic("Invalid lock assertion at %s:%d.", file, line);
 
 	}
 #endif	/* INVARIANT_SUPPORT */
 }
 
 /*
  * Set (set != 0) or clear (set == 0) the bits in "flag" on the held
  * instance of "lock".  The instance is taken from curthread's sleep-lock
  * list for sleep locks and from this CPU's spin lock list otherwise.
  * Does nothing when the lock has no witness, witness_watch is -1, the
  * system is panicking, or spin locks are being skipped; panics if the
  * lock is not found on the relevant list.
  */
 static void
 witness_setflag(struct lock_object *lock, int flag, int set)
 {
 	struct lock_instance *held;
 	struct lock_class *lc;
 
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	lc = LOCK_CLASS(lock);
 	if ((lc->lc_flags & LC_SLEEPLOCK) != 0)
 		held = find_instance(curthread->td_sleeplocks, lock);
 	else if (witness_skipspin)
 		return;
 	else
 		held = find_instance(PCPU_GET(spinlocks), lock);
 	if (held == NULL)
 		panic("%s: lock (%s) %s not locked", __func__,
 		    lc->lc_name, lock->lo_name);
 
 	if (set != 0)
 		held->li_flags |= flag;
 	else
 		held->li_flags &= ~flag;
 }
 
 /*
  * Set the LI_NORELEASE flag on the held instance of "lock" (see
  * witness_setflag() for the lookup rules and early-return conditions).
  */
 void
 witness_norelease(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 1);
 }
 
 /*
  * Clear the LI_NORELEASE flag on the held instance of "lock" (see
  * witness_setflag() for the lookup rules and early-return conditions).
  */
 void
 witness_releaseok(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 0);
 }
 
 #ifdef DDB
 /*
  * Debugger helper: print the locks recorded for thread "td" with
  * db_printf.  Asserts that WITNESS is no longer cold and that the
  * debugger is active, and prints nothing unless witness_watch is at
  * least 1.  Sleep locks are always listed; spin locks only when td is
  * curthread (see the comment below).
  */
 static void
 witness_ddb_list(struct thread *td)
 {
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	KASSERT(kdb_active, ("%s: not in the debugger", __func__));
 
 	if (witness_watch < 1)
 		return;
 
 	witness_list_locks(&td->td_sleeplocks, db_printf);
 
 	/*
 	 * We only handle spinlocks if td == curthread.  This is somewhat broken
 	 * if td is currently executing on some other CPU and holds spin locks
 	 * as we won't display those locks.  If we had a MI way of getting
 	 * the per-cpu data for a given cpu then we could use
 	 * td->td_oncpu to get the list of spinlocks for this thread
 	 * and "fix" this.
 	 *
 	 * That still wouldn't really fix this unless we locked the scheduler
 	 * lock or stopped the other CPU to make sure it wasn't changing the
 	 * list out from under us.  It is probably best to just not try to
 	 * handle threads on other CPU's for now.
 	 *
 	 * NOTE(review): witness_display_spinlock() already uses pcpu_find()
 	 * for exactly this kind of lookup, so the first paragraph may be
 	 * out of date -- confirm.
 	 */
 	if (td == curthread && PCPU_GET(spinlocks) != NULL)
 		witness_list_locks(PCPU_PTR(spinlocks), db_printf);
 }
 
 /*
  * Debugger "show" command registered under the name "locks": list the
  * locks of one thread.  When an address argument was given the thread is
  * resolved with db_lookup_thread(); otherwise kdb_thread is used.
  * (have_addr and addr are not declared here; presumably they are supplied
  * by the DB_SHOW_COMMAND macro -- confirm.)
  */
 DB_SHOW_COMMAND(locks, db_witness_list)
 {
 	struct thread *td;
 
 	if (have_addr)
 		td = db_lookup_thread(addr, TRUE);
 	else
 		td = kdb_thread;
 	witness_ddb_list(td);
 }
 
 /*
  * Debugger "show all" command registered under the name "locks" (with
  * the "show" alias "alllocks"): walk every process and thread in the
  * system and list the locks of each thread that has any.
  */
 DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
 {
 	struct thread *td;
 	struct proc *p;
 
 	/*
 	 * Only processes and threads for which witness_proc_has_locks() /
 	 * witness_thread_has_locks() report recorded sleep locks are
 	 * listed; everything else is skipped.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!witness_proc_has_locks(p))
 			continue;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			if (!witness_thread_has_locks(td))
 				continue;
 			db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
 			    p->p_comm, td, td->td_tid);
 			witness_ddb_list(td);
 		}
 	}
 }
 DB_SHOW_ALIAS(alllocks, db_witness_list_all)
 
 /*
  * Debugger "show" command registered under the name "witness": hand
  * db_printf to witness_ddb_display() so it can print its report.
  */
 DB_SHOW_COMMAND(witness, db_witness_display)
 {
 
 	witness_ddb_display(db_printf);
 }
 #endif
 
 /*
  * Sysctl handler that reports every recorded lock order reversal together
  * with the stacks saved for each direction of the order.
  *
  * If witness_watch is below 1 or WITNESS is still cold, a fixed message
  * (w_notrunning / w_stillcold) is returned instead.  Otherwise the report
  * is built in an auto-extending sbuf and handed to SYSCTL_OUT at the end.
  *
  * w_mtx is held only while reading the shared witness data; what is needed
  * for printing is first copied into M_TEMP scratch buffers so that the
  * sbuf calls run with the spin lock released.  A snapshot of w_generation
  * is compared every time the lock is retaken, and the report is thrown
  * away and rebuilt from "restart" whenever the generation has changed.
  *
  * Returns 0 or the error from SYSCTL_OUT, or ENOMEM if the sbuf cannot be
  * created.
  */
 static int
 sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
 {
 	struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
 	struct witness *tmp_w1, *tmp_w2, *w1, *w2;
 	struct sbuf *sb;
 	u_int w_rmatrix1, w_rmatrix2;
 	int error, generation, i, j;
 
 	tmp_data1 = NULL;
 	tmp_data2 = NULL;
 	tmp_w1 = NULL;
 	tmp_w2 = NULL;
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 	sb = sbuf_new(NULL, NULL, BADSTACK_SBUF_SIZE, SBUF_AUTOEXTEND);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/* Allocate and init temporary storage space. */
 	tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	stack_zero(&tmp_data1->wlod_stack);
 	stack_zero(&tmp_data2->wlod_stack);
 
 restart:
 	/* Snapshot the graph generation this pass is based on. */
 	mtx_lock_spin(&w_mtx);
 	generation = w_generation;
 	mtx_unlock_spin(&w_mtx);
 	sbuf_printf(sb, "Number of known direct relationships is %d\n",
 	    w_lohash.wloh_count);
 	for (i = 1; i < w_max_used_index; i++) {
 		mtx_lock_spin(&w_mtx);
 		if (generation != w_generation) {
 			mtx_unlock_spin(&w_mtx);
 
 			/* The graph has changed, try again. */
 			req->oldidx = 0;
 			sbuf_clear(sb);
 			goto restart;
 		}
 
 		/* Skip witnesses that are not marked as reversed. */
 		w1 = &w_data[i];
 		if (w1->w_reversed == 0) {
 			mtx_unlock_spin(&w_mtx);
 			continue;
 		}
 
 		/* Copy w1 locally so we can release the spin lock. */
 		*tmp_w1 = *w1;
 		mtx_unlock_spin(&w_mtx);
 
 		if (tmp_w1->w_reversed == 0)
 			continue;
 		for (j = 1; j < w_max_used_index; j++) {
 			/*
 			 * Only pairs flagged WITNESS_REVERSAL with i <= j are
 			 * reported.
 			 */
 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
 				continue;
 
 			mtx_lock_spin(&w_mtx);
 			if (generation != w_generation) {
 				mtx_unlock_spin(&w_mtx);
 
 				/* The graph has changed, try again. */
 				req->oldidx = 0;
 				sbuf_clear(sb);
 				goto restart;
 			}
 
 			w2 = &w_data[j];
 			data1 = witness_lock_order_get(w1, w2);
 			data2 = witness_lock_order_get(w2, w1);
 
 			/*
 			 * Copy information locally so we can release the
 			 * spin lock.
 			 */
 			*tmp_w2 = *w2;
 			/* Only consumed by the "#if 0" debug output below. */
 			w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
 			w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
 
 			if (data1) {
 				stack_zero(&tmp_data1->wlod_stack);
 				stack_copy(&data1->wlod_stack,
 				    &tmp_data1->wlod_stack);
 			}
 			if (data2 && data2 != data1) {
 				stack_zero(&tmp_data2->wlod_stack);
 				stack_copy(&data2->wlod_stack,
 				    &tmp_data2->wlod_stack);
 			}
 			mtx_unlock_spin(&w_mtx);
 
 			/*
 			 * From here on data1/data2 are only tested for being
 			 * non-NULL; the printed stacks come from the copies.
 			 */
 			sbuf_printf(sb,
 	    "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
 			    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 			    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 #if 0
  			sbuf_printf(sb,
 			"w_rmatrix[%s][%s] == %x, w_rmatrix[%s][%s] == %x\n",
  			    tmp_w1->name, tmp_w2->w_name, w_rmatrix1,
  			    tmp_w2->name, tmp_w1->w_name, w_rmatrix2);
 #endif
 			if (data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data1->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 			if (data2 && data2 != data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name, 
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data2->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 		}
 	}
 	/* Final check that the graph stayed stable for the whole pass. */
 	mtx_lock_spin(&w_mtx);
 	if (generation != w_generation) {
 		mtx_unlock_spin(&w_mtx);
 
 		/*
 		 * The graph changed while we were printing stack data,
 		 * try again.
 		 */
 		req->oldidx = 0;
 		sbuf_clear(sb);
 		goto restart;
 	}
 	mtx_unlock_spin(&w_mtx);
 
 	/* Free temporary storage space. */
 	free(tmp_data1, M_TEMP);
 	free(tmp_data2, M_TEMP);
 	free(tmp_w1, M_TEMP);
 	free(tmp_w2, M_TEMP);
 
 	/* The length passed includes the terminating NUL (sbuf_len + 1). */
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 /*
  * Sysctl handler that dumps the witness graph as lines of the form
  * "parent","child" (produced by witness_add_fullgraph()).  If
  * witness_watch is below 1 or WITNESS is still cold, a fixed message
  * (w_notrunning / w_stillcold) is returned instead.  Every witness has
  * its w_displayed mark cleared before the walk, and both passes over
  * w_all run with w_mtx held.
  *
  * NOTE(review): witness_add_fullgraph() calls sbuf_printf() while the
  * w_mtx spin lock is held.  If the sbuf in use can drain to the sysctl
  * request from inside sbuf_printf(), that drain would run under the spin
  * lock -- confirm this is safe.
  */
 static int
 sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
 {
 	struct witness *w;
 	struct sbuf *sb;
 	int error;
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
-	sb = sbuf_new(NULL, NULL, FULLGRAPH_SBUF_SIZE, SBUF_FIXEDLEN);
+	sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 	sbuf_printf(sb, "\n");
 
 	mtx_lock_spin(&w_mtx);
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 	STAILQ_FOREACH(w, &w_all, w_list)
 		witness_add_fullgraph(sb, w);
 	mtx_unlock_spin(&w_mtx);
 
 	/*
-	 * While using SBUF_FIXEDLEN, check if the sbuf overflowed.
-	 */
-	if (sbuf_overflowed(sb)) {
-		sbuf_delete(sb);
-		panic("%s: sbuf overflowed, bump FULLGRAPH_SBUF_SIZE value\n",
-		    __func__);
-	}
-
-	/*
 	 * Close the sbuf and return to userland.
 	 */
-	sbuf_finish(sb);
-	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = witness_watch;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (value > 1 || value < -1 ||
 	    (witness_watch == -1 && value != witness_watch))
 		return (EINVAL);
 	witness_watch = value;
 	return (0);
 }
 
 /*
  * Append to "sb" one "parent","child" line for every index i whose
  * w_rmatrix entry against w carries WITNESS_PARENT, recursing into each
  * such w_data[i].  A witness that is already marked displayed, or that has
  * neither a file nor a line recorded, is skipped; w_displayed is set
  * before descending.
  */
 static void
 witness_add_fullgraph(struct sbuf *sb, struct witness *w)
 {
 	struct witness *other;
 	int idx;
 
 	if (w->w_displayed != 0)
 		return;
 	if (w->w_file == NULL && w->w_line == 0)
 		return;
 	w->w_displayed = 1;
 
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (idx = 1; idx <= w_max_used_index; idx++) {
 		if ((w_rmatrix[w->w_index][idx] & WITNESS_PARENT) == 0)
 			continue;
 		other = &w_data[idx];
 		sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
 		    other->w_name);
 		witness_add_fullgraph(sb, other);
 	}
 }
 
 /*
  * A simple hash function (hash = hash * 33 + byte, seeded with 5381).
  * Takes a key pointer and a key size.  If size == 0, interprets the key
  * as a string and reads until the null terminator.  Otherwise, reads the
  * first size bytes.  Returns an unsigned 32-bit hash value computed from
  * the key.
  *
  * The accumulator and the index are both uint32_t: the accumulator so
  * that its width matches the return type, the index so that it is
  * compared against "size" without a signed/unsigned mismatch.
  */
 static uint32_t
 witness_hash_djb2(const uint8_t *key, uint32_t size)
 {
 	uint32_t hash = 5381;
 	uint32_t i;
 
 	/* hash = hash * 33 + key[i] */
 	if (size != 0) {
 		for (i = 0; i < size; i++)
 			hash = ((hash << 5) + hash) + key[i];
 	} else {
 		for (i = 0; key[i] != 0; i++)
 			hash = ((hash << 5) + hash) + key[i];
 	}
 
 	return (hash);
 }
 
 
 /*
  * Initializes the two witness hash tables. Called exactly once from
  * witness_initialize().
  */
 static void
 witness_init_hash_tables(void)
 {
 	int i;
 
 	MPASS(witness_cold);
 
 	/* Initialize the hash tables. */
 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
 		w_hash.wh_array[i] = NULL;
 
 	w_hash.wh_size = WITNESS_HASH_SIZE;
 	w_hash.wh_count = 0;
 
 	/* Initialize the lock order data hash. */
 	w_lofree = NULL;
 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
 		w_lodata[i].wlod_next = w_lofree;
 		w_lofree = &w_lodata[i];
 	}
 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
 	w_lohash.wloh_count = 0;
 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
 		w_lohash.wloh_array[i] = NULL;
 }
 
 static struct witness *
 witness_hash_get(const char *key)
 {
 	struct witness *w;
 	uint32_t hash;
 	
 	MPASS(key != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
 	w = w_hash.wh_array[hash];
 	while (w != NULL) {
 		if (strcmp(w->w_name, key) == 0)
 			goto out;
 		w = w->w_hash_next;
 	}
 
 out:
 	return (w);
 }
 
 /*
  * Insert witness "w" at the head of its bucket chain in the witness hash
  * and bump the entry count.  The witness must have a name, must not be
  * chained already, and no entry with the same name may exist.  Once
  * WITNESS is no longer cold, w_mtx must be owned by the caller.
  */
 static void
 witness_hash_put(struct witness *w)
 {
 	struct witness **head;
 	uint32_t hash;
 
 	MPASS(w != NULL);
 	MPASS(w->w_name != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	KASSERT(witness_hash_get(w->w_name) == NULL,
 	    ("%s: trying to add a hash entry that already exists!", __func__));
 	KASSERT(w->w_hash_next == NULL,
 	    ("%s: w->w_hash_next != NULL", __func__));
 
 	hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
 	head = &w_hash.wh_array[hash];
 	w->w_hash_next = *head;
 	*head = w;
 	w_hash.wh_count++;
 }
 
 
 /*
  * Return the lock order record stored for the ordered pair
  * (parent, child), or NULL when the pair is not flagged
  * WITNESS_LOCK_ORDER_KNOWN in w_rmatrix or no record with a matching key
  * is found in the lock order hash.
  */
 static struct witness_lock_order_data *
 witness_lock_order_get(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *rec;
 	struct witness_lock_order_key key;
 	unsigned int bucket;
 
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if ((w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
 		return (NULL);
 
 	bucket = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	for (rec = w_lohash.wloh_array[bucket]; rec != NULL;
 	    rec = rec->wlod_next) {
 		if (witness_lock_order_key_equal(&rec->wlod_key, &key))
 			return (rec);
 	}
 	return (NULL);
 }
 
 /*
  * Return 1 only when parent and child are distinct witnesses, their
  * order is flagged WITNESS_LOCK_ORDER_KNOWN, and isitmychild() confirms
  * that child is a child of parent; return 0 otherwise.  No w_mtx is taken
  * here, to avoid contention in the common case.
  */
 static int
 witness_lock_order_check(struct witness *parent, struct witness *child)
 {
 
 	if (parent == child)
 		return (0);
 	if ((w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
 		return (0);
 	if (!isitmychild(parent, child))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Record the lock order (parent -> child) together with the current
  * stack.  Returns 1 if the order was already flagged as known or a new
  * record was stored, and 0 if the free list of records is empty.
  *
  * Note the ordering below: WITNESS_LOCK_ORDER_KNOWN is set in w_rmatrix
  * before the free list is consulted.  When the free list is empty this
  * call returns 0, but the bit stays set, so later calls for the same pair
  * return 1 without a record ever being stored.
  */
 static int
 witness_lock_order_add(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 	
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if (w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN)
 		return (1);
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
 	/* Take a record off the free list, if any is left. */
 	data = w_lofree;
 	if (data == NULL)
 		return (0);
 	w_lofree = data->wlod_next;
 	/* Link the record at the head of its hash bucket. */
 	data->wlod_next = w_lohash.wloh_array[hash];
 	data->wlod_key = key;
 	w_lohash.wloh_array[hash] = data;
 	w_lohash.wloh_count++;
 	stack_zero(&data->wlod_stack);
 	stack_save(&data->wlod_stack);
 	return (1);
 }
 
 /*
  * Call this whenever the structure of the witness graph changes.  It
  * bumps w_generation, which sysctl_debug_witness_badstacks() compares
  * against its snapshot to detect a changed graph.  Once WITNESS is no
  * longer cold, w_mtx must be owned by the caller.
  */
 static void
 witness_increment_graph_generation(void)
 {
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	w_generation++;
 }
 
 #ifdef KDB
 /*
  * When "cond" is true, print a backtrace if witness_trace is set and
  * enter the debugger with "msg" if witness_kdb is set.  Does nothing when
  * "cond" is false.
  */
 static void
 _witness_debugger(int cond, const char *msg)
 {
 
 	if (!cond)
 		return;
 	if (witness_trace)
 		kdb_backtrace();
 	if (witness_kdb)
 		kdb_enter(KDB_WHY_WITNESS, msg);
 }
 #endif
Index: head/sys/sys/sysctl.h
===================================================================
--- head/sys/sys/sysctl.h	(revision 212369)
+++ head/sys/sys/sysctl.h	(revision 212370)
@@ -1,723 +1,726 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Karels at Berkeley Software Design, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sysctl.h	8.1 (Berkeley) 6/2/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_SYSCTL_H_
 #define	_SYS_SYSCTL_H_
 
 #include <sys/queue.h>
 
 struct thread;
 /*
  * Definitions for sysctl call.  The sysctl call uses a hierarchical name
  * for objects that can be examined or modified.  The name is expressed as
  * a sequence of integers.  Like a file path name, the meaning of each
  * component depends on its place in the hierarchy.  The top-level and kern
  * identifiers are defined here, and other identifiers are defined in the
  * respective subsystem header files.
  */
 
 #define CTL_MAXNAME	24	/* largest number of components supported */
 
 /*
  * Each subsystem defined by sysctl defines a list of variables
  * for that subsystem. Each name is either a node with further
  * levels defined below it, or it is a leaf of some particular
  * type given below. Each sysctl level defines a set of name/type
  * pairs to be used by sysctl(8) in manipulating the subsystem.
  */
 struct ctlname {
 	char	*ctl_name;	/* subsystem name */
 	int	ctl_type;	/* type of name */
 };
 
 #define CTLTYPE		0xf	/* Mask for the type */
 #define	CTLTYPE_NODE	1	/* name is a node */
 #define	CTLTYPE_INT	2	/* name describes an integer */
 #define	CTLTYPE_STRING	3	/* name describes a string */
 #define	CTLTYPE_QUAD	4	/* name describes a 64-bit number */
 #define	CTLTYPE_OPAQUE	5	/* name describes a structure */
 #define	CTLTYPE_STRUCT	CTLTYPE_OPAQUE	/* name describes a structure */
 #define	CTLTYPE_UINT	6	/* name describes an unsigned integer */
 #define	CTLTYPE_LONG	7	/* name describes a long */
 #define	CTLTYPE_ULONG	8	/* name describes an unsigned long */
 
 #define CTLFLAG_RD	0x80000000	/* Allow reads of variable */
 #define CTLFLAG_WR	0x40000000	/* Allow writes to the variable */
 #define CTLFLAG_RW	(CTLFLAG_RD|CTLFLAG_WR)
 #define CTLFLAG_NOLOCK	0x20000000	/* XXX Don't Lock */
 #define CTLFLAG_ANYBODY	0x10000000	/* All users can set this var */
 #define CTLFLAG_SECURE	0x08000000	/* Permit set only if securelevel<=0 */
 #define CTLFLAG_PRISON	0x04000000	/* Prisoned roots can fiddle */
 #define CTLFLAG_DYN	0x02000000	/* Dynamic oid - can be freed */
 #define CTLFLAG_SKIP	0x01000000	/* Skip this sysctl when listing */
 #define CTLMASK_SECURE	0x00F00000	/* Secure level */
 #define CTLFLAG_TUN	0x00080000	/* Tunable variable */
 #define CTLFLAG_MPSAFE	0x00040000	/* Handler is MP safe */
 #define CTLFLAG_VNET	0x00020000	/* Prisons with vnet can fiddle */
 #define CTLFLAG_RDTUN	(CTLFLAG_RD|CTLFLAG_TUN)
 
 /*
  * Secure level.   Note that CTLFLAG_SECURE == CTLFLAG_SECURE1.  
  *
  * Secure when the securelevel is raised to at least N.
  */
 #define CTLSHIFT_SECURE	20
 #define CTLFLAG_SECURE1	(CTLFLAG_SECURE | (0 << CTLSHIFT_SECURE))
 #define CTLFLAG_SECURE2	(CTLFLAG_SECURE | (1 << CTLSHIFT_SECURE))
 #define CTLFLAG_SECURE3	(CTLFLAG_SECURE | (2 << CTLSHIFT_SECURE))
 
 /*
  * USE THIS instead of a hardwired number from the categories below
  * to get dynamically assigned sysctl entries using the linker-set
  * technology. This is the way nearly all new sysctl variables should
  * be implemented.
  * e.g. SYSCTL_INT(_parent, OID_AUTO, name, CTLFLAG_RW, &variable, 0, "");
  */ 
 #define OID_AUTO	(-1)
 
 /*
  * The starting number for dynamically-assigned entries.  WARNING!
  * ALL static sysctl entries should have numbers LESS than this!
  */
 #define CTL_AUTO_START	0x100
 
 #ifdef _KERNEL
 #define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, int arg2, \
 	struct sysctl_req *req
 
 /* definitions for sysctl_req 'lock' member */
 #define REQ_UNLOCKED	0	/* not locked and not wired */
 #define REQ_LOCKED	1	/* locked and not wired */
 #define REQ_WIRED	2	/* locked and wired */
 
 /* definitions for sysctl_req 'flags' member */
 #if defined(__amd64__) || defined(__ia64__) || defined(__powerpc64__)
 #define	SCTL_MASK32	1	/* 32 bit emulation */
 #endif
 
 /*
  * This describes the access space for a sysctl request.  This is needed
  * so that we can use the interface from the kernel or from user-space.
  *
  * The "old" members describe the value being read out and the "new"
  * members the value being written in; handlers reach the two callbacks
  * through the SYSCTL_OUT and SYSCTL_IN macros.
  */
 struct sysctl_req {
 	struct thread	*td;		/* used for access checking */
 	int		lock;		/* locking/wiring state */
 	void		*oldptr;	/* presumably the output buffer -- confirm */
 	size_t		oldlen;		/* presumably its size -- confirm */
 	size_t		oldidx;		/* presumably the output offset so far -- confirm */
 	int		(*oldfunc)(struct sysctl_req *, const void *, size_t);	/* called by SYSCTL_OUT */
 	void		*newptr;	/* presumably the input buffer -- confirm */
 	size_t		newlen;		/* presumably its size -- confirm */
 	size_t		newidx;		/* presumably the input offset so far -- confirm */
 	int		(*newfunc)(struct sysctl_req *, void *, size_t);	/* called by SYSCTL_IN */
 	size_t		validlen;
 	int		flags;		/* request flags, e.g. SCTL_MASK32 */
 };
 
 SLIST_HEAD(sysctl_oid_list, sysctl_oid);
 
 /*
  * This describes one "oid" in the MIB tree.  Potentially more nodes can
  * be hidden behind it, expanded by the handler.
  *
  * The member order matters: SYSCTL_OID() fills the structure with a
  * positional initializer (parent list, link, number, kind, arg1, arg2,
  * name, handler, format, reference count, description).
  */
 struct sysctl_oid {
 	struct sysctl_oid_list *oid_parent;	/* parent's list of children */
 	SLIST_ENTRY(sysctl_oid) oid_link;	/* linkage on that list */
 	int		oid_number;	/* number, or OID_AUTO */
 	u_int		oid_kind;	/* CTLTYPE_* | CTLFLAG_* */
 	void		*oid_arg1;	/* for nodes: the child list (SYSCTL_CHILDREN) */
 	int		oid_arg2;
 	const char	*oid_name;
 	int 		(*oid_handler)(SYSCTL_HANDLER_ARGS);
 	const char	*oid_fmt;	/* format string, e.g. "I", "A", "N" */
 	int		oid_refcnt;	/* initialized to 0 by SYSCTL_OID() */
 	const char	*oid_descr;	/* description; "" under NO_SYSCTL_DESCR */
 };
 
 /*
  * Call a request's input (newfunc) or output (oldfunc) callback with the
  * request itself, a buffer pointer and a length.  Every macro parameter is
  * parenthesized so that an argument such as "&req" expands correctly.
  * Note that "r" is evaluated twice, so it must not have side effects.
  */
 #define SYSCTL_IN(r, p, l) (((r)->newfunc)((r), (p), (l)))
 #define SYSCTL_OUT(r, p, l) (((r)->oldfunc)((r), (p), (l)))
 
 int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
 int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_long(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_quad(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_intptr(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_string(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS);
 
 int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS);
 
 /*
  * These functions are used to add/remove an oid from the mib.
  */
 void sysctl_register_oid(struct sysctl_oid *oidp);
 void sysctl_unregister_oid(struct sysctl_oid *oidp);
 
 /* Declare a static oid to allow child oids to be added to it. */
 #define SYSCTL_DECL(name)					\
 	extern struct sysctl_oid_list sysctl_##name##_children
 
 /*
  * Accessors for the list of children that a node oid keeps in oid_arg1
  * (SYSCTL_NODE() stores the address of the node's child list there).
  *
  * SYSCTL_CHILDREN() is wrapped in parentheses as a whole so that the cast
  * applies before any operator the caller appends, e.g.
  * SYSCTL_CHILDREN(oidp)->slh_first.
  *
  * NOTE: SYSCTL_CHILDREN_SET() ends in a semicolon; it is left that way
  * because existing callers may rely on it.
  */
 #define	SYSCTL_CHILDREN(oid_ptr) \
 	((struct sysctl_oid_list *)(oid_ptr)->oid_arg1)
 #define	SYSCTL_CHILDREN_SET(oid_ptr, val) \
 	(oid_ptr)->oid_arg1 = (val);
 #define	SYSCTL_STATIC_CHILDREN(oid_name) \
 	(&sysctl_##oid_name##_children)
 
 /* === Structs and macros related to context handling === */
 
 /*
  * All dynamically created sysctls can be tracked in a context list.
  * Each entry on such a list (struct sysctl_ctx_list) refers to one oid.
  */
 struct sysctl_ctx_entry {
 	struct sysctl_oid *entry;	/* the tracked oid */
 	TAILQ_ENTRY(sysctl_ctx_entry) link;	/* linkage on the context list */
 };
 
 TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry);
 
 #define SYSCTL_NODE_CHILDREN(parent, name) \
 	sysctl_##parent##_##name##_children
 
 #ifndef NO_SYSCTL_DESCR
 #define __DESCR(d) d
 #else
 #define __DESCR(d) ""
 #endif
 
 /* This constructs a "raw" MIB oid. */
 #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 	static struct sysctl_oid sysctl__##parent##_##name = {		 \
 		&sysctl_##parent##_children, { NULL }, nbr, kind,	 \
 		a1, a2, #name, handler, fmt, 0, __DESCR(descr) };     \
 	DATA_SET(sysctl_set, sysctl__##parent##_##name)
 
 #define SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 	sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, __DESCR(descr))
 
 /* This constructs a node from which other oids can hang. */
 #define SYSCTL_NODE(parent, nbr, name, access, handler, descr)		    \
 	struct sysctl_oid_list SYSCTL_NODE_CHILDREN(parent, name);	    \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|(access),		    \
 	    (void*)&SYSCTL_NODE_CHILDREN(parent, name), 0, handler, "N", descr)
 
 #define SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr)	    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_NODE|(access),	    \
 	NULL, 0, handler, "N", __DESCR(descr))
 
 /* Oid for a string.  len can be 0 to indicate '\0' termination. */
 #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \
 		arg, len, sysctl_handle_string, "A", descr)
 
 #define SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr)  \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access),	    \
 	arg, len, sysctl_handle_string, "A", __DESCR(descr))
 
 /* Oid for an int.  If ptr is NULL, val is returned. */
 #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_int, "I", descr)
 
 #define SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr)	    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_INT|CTLFLAG_MPSAFE|(access),	    \
 	ptr, val, sysctl_handle_int, "I", __DESCR(descr))
 
 /* Oid for an unsigned int.  If ptr is NULL, val is returned. */
 #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_UINT|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_int, "IU", descr)
 
 #define SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr)    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_UINT|CTLFLAG_MPSAFE|(access),	    \
 	ptr, val, sysctl_handle_int, "IU", __DESCR(descr))
 
 #define SYSCTL_XINT(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_UINT|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_int, "IX", descr)
 
 #define SYSCTL_ADD_XINT(ctx, parent, nbr, name, access, ptr, val, descr)    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_UINT|CTLFLAG_MPSAFE|(access),	    \
 	ptr, val, sysctl_handle_int, "IX", __DESCR(descr))
 
 /* Oid for a long.  The pointer must be non NULL. */
 #define SYSCTL_LONG(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_LONG|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_long, "L", descr)
 
 #define SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr)	    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_LONG|CTLFLAG_MPSAFE|(access),	    \
 	ptr, 0, sysctl_handle_long, "L", __DESCR(descr))
 
 /*
  * Oid for an unsigned long.  The pointer must be non NULL.
  *
  * As with SYSCTL_INT() and SYSCTL_LONG(), the static variants pass
  * "descr" through unchanged: SYSCTL_OID() applies __DESCR() itself.
  */
 #define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_ULONG|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_long, "LU", descr)
 
 #define SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr)	    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_ULONG|CTLFLAG_MPSAFE|(access),	    \
 	ptr, 0, sysctl_handle_long, "LU", __DESCR(descr))
 
 /* Same as SYSCTL_ULONG(), but with the "LX" format string. */
 #define SYSCTL_XLONG(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_ULONG|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_long, "LX", descr)
 
 #define SYSCTL_ADD_XLONG(ctx, parent, nbr, name, access, ptr, descr)	    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_ULONG|CTLFLAG_MPSAFE|(access),	    \
 	ptr, 0, sysctl_handle_long, "LX", __DESCR(descr))
 
 /* Oid for a quad.  The pointer must be non NULL. */
 #define SYSCTL_QUAD(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_QUAD|CTLFLAG_MPSAFE|(access), \
 		ptr, val, sysctl_handle_quad, "Q", descr)
 
 #define SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr)	    \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_QUAD|CTLFLAG_MPSAFE|(access),	    \
 	ptr, 0, sysctl_handle_quad, "Q", __DESCR(descr))
 
 /* Oid for an opaque object.  Specified by a pointer and a length. */
 #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \
 		ptr, len, sysctl_handle_opaque, fmt, descr)
 
 #define SYSCTL_ADD_OPAQUE(ctx, parent, nbr, name, access, ptr, len, fmt, descr)\
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access),	    \
 	ptr, len, sysctl_handle_opaque, fmt, __DESCR(descr))
 
 /* Oid for a struct.  Specified by a pointer and a type. */
 #define SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) \
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \
 		ptr, sizeof(struct type), sysctl_handle_opaque, \
 		"S," #type, descr)
 
 #define SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access),	    \
 	ptr, sizeof(struct type), sysctl_handle_opaque, "S," #type, __DESCR(descr))
 
 /* Oid for a procedure.  Specified by a pointer and an arg. */
 #define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
 	SYSCTL_OID(parent, nbr, name, (access), \
 		ptr, arg, handler, fmt, descr)
 
 #define SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
 	sysctl_add_oid(ctx, parent, nbr, name, (access),			    \
 	ptr, arg, handler, fmt, __DESCR(descr))
 
 /*
  * A macro to generate a read-only sysctl to indicate the presence of optional
  * kernel features.
  */
 #define	FEATURE(name, desc)						\
 	SYSCTL_INT(_kern_features, OID_AUTO, name, CTLFLAG_RD, 0, 1, desc)
 
 #endif /* _KERNEL */
 
 /*
  * Top-level identifiers
  */
 #define	CTL_UNSPEC	0		/* unused */
 #define	CTL_KERN	1		/* "high kernel": proc, limits */
 #define	CTL_VM		2		/* virtual memory */
 #define	CTL_VFS		3		/* filesystem, mount type is next */
 #define	CTL_NET		4		/* network, see socket.h */
 #define	CTL_DEBUG	5		/* debugging parameters */
 #define	CTL_HW		6		/* generic cpu/io */
 #define	CTL_MACHDEP	7		/* machine dependent */
 #define	CTL_USER	8		/* user-level */
 #define	CTL_P1003_1B	9		/* POSIX 1003.1B */
 #define	CTL_MAXID	10		/* number of valid top-level ids */
 
 #define CTL_NAMES { \
 	{ 0, 0 }, \
 	{ "kern", CTLTYPE_NODE }, \
 	{ "vm", CTLTYPE_NODE }, \
 	{ "vfs", CTLTYPE_NODE }, \
 	{ "net", CTLTYPE_NODE }, \
 	{ "debug", CTLTYPE_NODE }, \
 	{ "hw", CTLTYPE_NODE }, \
 	{ "machdep", CTLTYPE_NODE }, \
 	{ "user", CTLTYPE_NODE }, \
 	{ "p1003_1b", CTLTYPE_NODE }, \
 }
 
 /*
  * CTL_KERN identifiers
  */
 #define	KERN_OSTYPE	 	 1	/* string: system version */
 #define	KERN_OSRELEASE	 	 2	/* string: system release */
 #define	KERN_OSREV	 	 3	/* int: system revision */
 #define	KERN_VERSION	 	 4	/* string: compile time info */
 #define	KERN_MAXVNODES	 	 5	/* int: max vnodes */
 #define	KERN_MAXPROC	 	 6	/* int: max processes */
 #define	KERN_MAXFILES	 	 7	/* int: max open files */
 #define	KERN_ARGMAX	 	 8	/* int: max arguments to exec */
 #define	KERN_SECURELVL	 	 9	/* int: system security level */
 #define	KERN_HOSTNAME		10	/* string: hostname */
 #define	KERN_HOSTID		11	/* int: host identifier */
 #define	KERN_CLOCKRATE		12	/* struct: struct clockrate */
 #define	KERN_VNODE		13	/* struct: vnode structures */
 #define	KERN_PROC		14	/* struct: process entries */
 #define	KERN_FILE		15	/* struct: file entries */
 #define	KERN_PROF		16	/* node: kernel profiling info */
 #define	KERN_POSIX1		17	/* int: POSIX.1 version */
 #define	KERN_NGROUPS		18	/* int: # of supplemental group ids */
 #define	KERN_JOB_CONTROL	19	/* int: is job control available */
 #define	KERN_SAVED_IDS		20	/* int: saved set-user/group-ID */
 #define	KERN_BOOTTIME		21	/* struct: time kernel was booted */
 #define KERN_NISDOMAINNAME	22	/* string: YP domain name */
 #define KERN_UPDATEINTERVAL	23	/* int: update process sleep time */
 #define KERN_OSRELDATE		24	/* int: kernel release date */
 #define KERN_NTP_PLL		25	/* node: NTP PLL control */
 #define	KERN_BOOTFILE		26	/* string: name of booted kernel */
 #define	KERN_MAXFILESPERPROC	27	/* int: max open files per proc */
 #define	KERN_MAXPROCPERUID 	28	/* int: max processes per uid */
 #define KERN_DUMPDEV		29	/* struct cdev *: device to dump on */
 #define	KERN_IPC		30	/* node: anything related to IPC */
 #define	KERN_DUMMY		31	/* unused */
 #define	KERN_PS_STRINGS		32	/* int: address of PS_STRINGS */
 #define	KERN_USRSTACK		33	/* int: address of USRSTACK */
 #define	KERN_LOGSIGEXIT		34	/* int: do we log sigexit procs? */
 #define	KERN_IOV_MAX		35	/* int: value of UIO_MAXIOV */
 #define	KERN_HOSTUUID		36	/* string: host UUID identifier */
 #define	KERN_ARND		37	/* int: from arc4rand() */
 #define	KERN_MAXID		38	/* number of valid kern ids */
 
 /*
  * Name/type table for the CTL_KERN identifiers.  Entry N describes the
  * identifier whose value is N, so the table must have exactly KERN_MAXID
  * entries, in the same order as the KERN_* definitions above.
  *
  * The "dumpdev" slot (KERN_DUMPDEV, 29) and the "arandom" slot (KERN_ARND,
  * 37) must be present; leaving out "dumpdev" shifts every later name down
  * by one so that, for example, KERN_IPC would be reported as "dummy".
  * NOTE(review): "arandom" is typed as opaque here on the assumption that
  * it returns an arbitrary number of random bytes -- confirm against the
  * handler that registers KERN_ARND.
  */
 #define CTL_KERN_NAMES { \
 	{ 0, 0 }, \
 	{ "ostype", CTLTYPE_STRING }, \
 	{ "osrelease", CTLTYPE_STRING }, \
 	{ "osrevision", CTLTYPE_INT }, \
 	{ "version", CTLTYPE_STRING }, \
 	{ "maxvnodes", CTLTYPE_INT }, \
 	{ "maxproc", CTLTYPE_INT }, \
 	{ "maxfiles", CTLTYPE_INT }, \
 	{ "argmax", CTLTYPE_INT }, \
 	{ "securelevel", CTLTYPE_INT }, \
 	{ "hostname", CTLTYPE_STRING }, \
 	{ "hostid", CTLTYPE_UINT }, \
 	{ "clockrate", CTLTYPE_STRUCT }, \
 	{ "vnode", CTLTYPE_STRUCT }, \
 	{ "proc", CTLTYPE_STRUCT }, \
 	{ "file", CTLTYPE_STRUCT }, \
 	{ "profiling", CTLTYPE_NODE }, \
 	{ "posix1version", CTLTYPE_INT }, \
 	{ "ngroups", CTLTYPE_INT }, \
 	{ "job_control", CTLTYPE_INT }, \
 	{ "saved_ids", CTLTYPE_INT }, \
 	{ "boottime", CTLTYPE_STRUCT }, \
 	{ "nisdomainname", CTLTYPE_STRING }, \
 	{ "update", CTLTYPE_INT }, \
 	{ "osreldate", CTLTYPE_INT }, \
 	{ "ntp_pll", CTLTYPE_NODE }, \
 	{ "bootfile", CTLTYPE_STRING }, \
 	{ "maxfilesperproc", CTLTYPE_INT }, \
 	{ "maxprocperuid", CTLTYPE_INT }, \
 	{ "dumpdev", CTLTYPE_STRUCT }, \
 	{ "ipc", CTLTYPE_NODE }, \
 	{ "dummy", CTLTYPE_INT }, \
 	{ "ps_strings", CTLTYPE_INT }, \
 	{ "usrstack", CTLTYPE_INT }, \
 	{ "logsigexit", CTLTYPE_INT }, \
 	{ "iov_max", CTLTYPE_INT }, \
 	{ "hostuuid", CTLTYPE_STRING }, \
 	{ "arandom", CTLTYPE_OPAQUE }, \
 }
 
 /*
  * CTL_VFS identifiers
  */
 #define CTL_VFS_NAMES { \
 	{ "vfsconf", CTLTYPE_STRUCT }, \
 }
 
 /*
  * KERN_PROC subtypes
  */
 #define KERN_PROC_ALL		0	/* everything */
 #define	KERN_PROC_PID		1	/* by process id */
 #define	KERN_PROC_PGRP		2	/* by process group id */
 #define	KERN_PROC_SESSION	3	/* by session of pid */
 #define	KERN_PROC_TTY		4	/* by controlling tty */
 #define	KERN_PROC_UID		5	/* by effective uid */
 #define	KERN_PROC_RUID		6	/* by real uid */
 #define	KERN_PROC_ARGS		7	/* get/set arguments/proctitle */
 #define	KERN_PROC_PROC		8	/* only return procs */
 #define	KERN_PROC_SV_NAME	9	/* get syscall vector name */
 #define	KERN_PROC_RGID		10	/* by real group id */
 #define	KERN_PROC_GID		11	/* by effective group id */
 #define	KERN_PROC_PATHNAME	12	/* path to executable */
 #define	KERN_PROC_OVMMAP	13	/* Old VM map entries for process */
 #define	KERN_PROC_OFILEDESC	14	/* Old file descriptors for process */
 #define	KERN_PROC_KSTACK	15	/* Kernel stacks for process */
 #define	KERN_PROC_INC_THREAD	0x10	/*
 					 * modifier for pid, pgrp, tty,
 					 * uid, ruid, gid, rgid and proc
 					 * This effectively uses 16-31
 					 */
 #define	KERN_PROC_VMMAP		32	/* VM map entries for process */
 #define	KERN_PROC_FILEDESC	33	/* File descriptors for process */
 #define	KERN_PROC_GROUPS	34	/* process groups */
 
 /*
  * KERN_IPC identifiers
  */
 #define KIPC_MAXSOCKBUF		1	/* int: max size of a socket buffer */
 #define	KIPC_SOCKBUF_WASTE	2	/* int: wastage factor in sockbuf */
 #define	KIPC_SOMAXCONN		3	/* int: max length of connection q */
 #define	KIPC_MAX_LINKHDR	4	/* int: max length of link header */
 #define	KIPC_MAX_PROTOHDR	5	/* int: max length of network header */
 #define	KIPC_MAX_HDR		6	/* int: max total length of headers */
 #define	KIPC_MAX_DATALEN	7	/* int: max length of data? */
 
 /*
  * CTL_HW identifiers
  */
 #define	HW_MACHINE	 1		/* string: machine class */
 #define	HW_MODEL	 2		/* string: specific machine model */
 #define	HW_NCPU		 3		/* int: number of cpus */
 #define	HW_BYTEORDER	 4		/* int: machine byte order */
 #define	HW_PHYSMEM	 5		/* int: total memory */
 #define	HW_USERMEM	 6		/* int: non-kernel memory */
 #define	HW_PAGESIZE	 7		/* int: software page size */
 #define	HW_DISKNAMES	 8		/* strings: disk drive names */
 #define	HW_DISKSTATS	 9		/* struct: diskstats[] */
 #define HW_FLOATINGPT	10		/* int: has HW floating point? */
 #define HW_MACHINE_ARCH	11		/* string: machine architecture */
 #define	HW_REALMEM	12		/* int: 'real' memory */
 #define	HW_MAXID	13		/* number of valid hw ids */
 
 #define CTL_HW_NAMES { \
 	{ 0, 0 }, \
 	{ "machine", CTLTYPE_STRING }, \
 	{ "model", CTLTYPE_STRING }, \
 	{ "ncpu", CTLTYPE_INT }, \
 	{ "byteorder", CTLTYPE_INT }, \
 	{ "physmem", CTLTYPE_ULONG }, \
 	{ "usermem", CTLTYPE_ULONG }, \
 	{ "pagesize", CTLTYPE_INT }, \
 	{ "disknames", CTLTYPE_STRUCT }, \
 	{ "diskstats", CTLTYPE_STRUCT }, \
 	{ "floatingpoint", CTLTYPE_INT }, \
 	{ "machine_arch", CTLTYPE_STRING }, \
 	{ "realmem", CTLTYPE_ULONG }, \
 }
 
 /*
  * CTL_USER definitions
  */
 #define	USER_CS_PATH		 1	/* string: _CS_PATH */
 #define	USER_BC_BASE_MAX	 2	/* int: BC_BASE_MAX */
 #define	USER_BC_DIM_MAX		 3	/* int: BC_DIM_MAX */
 #define	USER_BC_SCALE_MAX	 4	/* int: BC_SCALE_MAX */
 #define	USER_BC_STRING_MAX	 5	/* int: BC_STRING_MAX */
 #define	USER_COLL_WEIGHTS_MAX	 6	/* int: COLL_WEIGHTS_MAX */
 #define	USER_EXPR_NEST_MAX	 7	/* int: EXPR_NEST_MAX */
 #define	USER_LINE_MAX		 8	/* int: LINE_MAX */
 #define	USER_RE_DUP_MAX		 9	/* int: RE_DUP_MAX */
 #define	USER_POSIX2_VERSION	10	/* int: POSIX2_VERSION */
 #define	USER_POSIX2_C_BIND	11	/* int: POSIX2_C_BIND */
 #define	USER_POSIX2_C_DEV	12	/* int: POSIX2_C_DEV */
 #define	USER_POSIX2_CHAR_TERM	13	/* int: POSIX2_CHAR_TERM */
 #define	USER_POSIX2_FORT_DEV	14	/* int: POSIX2_FORT_DEV */
 #define	USER_POSIX2_FORT_RUN	15	/* int: POSIX2_FORT_RUN */
 #define	USER_POSIX2_LOCALEDEF	16	/* int: POSIX2_LOCALEDEF */
 #define	USER_POSIX2_SW_DEV	17	/* int: POSIX2_SW_DEV */
 #define	USER_POSIX2_UPE		18	/* int: POSIX2_UPE */
 #define	USER_STREAM_MAX		19	/* int: POSIX2_STREAM_MAX */
 #define	USER_TZNAME_MAX		20	/* int: POSIX2_TZNAME_MAX */
 #define	USER_MAXID		21	/* number of valid user ids */
 
 #define	CTL_USER_NAMES { \
 	{ 0, 0 }, \
 	{ "cs_path", CTLTYPE_STRING }, \
 	{ "bc_base_max", CTLTYPE_INT }, \
 	{ "bc_dim_max", CTLTYPE_INT }, \
 	{ "bc_scale_max", CTLTYPE_INT }, \
 	{ "bc_string_max", CTLTYPE_INT }, \
 	{ "coll_weights_max", CTLTYPE_INT }, \
 	{ "expr_nest_max", CTLTYPE_INT }, \
 	{ "line_max", CTLTYPE_INT }, \
 	{ "re_dup_max", CTLTYPE_INT }, \
 	{ "posix2_version", CTLTYPE_INT }, \
 	{ "posix2_c_bind", CTLTYPE_INT }, \
 	{ "posix2_c_dev", CTLTYPE_INT }, \
 	{ "posix2_char_term", CTLTYPE_INT }, \
 	{ "posix2_fort_dev", CTLTYPE_INT }, \
 	{ "posix2_fort_run", CTLTYPE_INT }, \
 	{ "posix2_localedef", CTLTYPE_INT }, \
 	{ "posix2_sw_dev", CTLTYPE_INT }, \
 	{ "posix2_upe", CTLTYPE_INT }, \
 	{ "stream_max", CTLTYPE_INT }, \
 	{ "tzname_max", CTLTYPE_INT }, \
 }
 
 #define CTL_P1003_1B_ASYNCHRONOUS_IO		1	/* boolean */
 #define CTL_P1003_1B_MAPPED_FILES		2	/* boolean */
 #define CTL_P1003_1B_MEMLOCK			3	/* boolean */
 #define CTL_P1003_1B_MEMLOCK_RANGE		4	/* boolean */
 #define CTL_P1003_1B_MEMORY_PROTECTION		5	/* boolean */
 #define CTL_P1003_1B_MESSAGE_PASSING		6	/* boolean */
 #define CTL_P1003_1B_PRIORITIZED_IO		7	/* boolean */
 #define CTL_P1003_1B_PRIORITY_SCHEDULING	8	/* boolean */
 #define CTL_P1003_1B_REALTIME_SIGNALS		9	/* boolean */
 #define CTL_P1003_1B_SEMAPHORES			10	/* boolean */
 #define CTL_P1003_1B_FSYNC			11	/* boolean */
 #define CTL_P1003_1B_SHARED_MEMORY_OBJECTS	12	/* boolean */
 #define CTL_P1003_1B_SYNCHRONIZED_IO		13	/* boolean */
 #define CTL_P1003_1B_TIMERS			14	/* boolean */
 #define CTL_P1003_1B_AIO_LISTIO_MAX		15	/* int */
 #define CTL_P1003_1B_AIO_MAX			16	/* int */
 #define CTL_P1003_1B_AIO_PRIO_DELTA_MAX		17	/* int */
 #define CTL_P1003_1B_DELAYTIMER_MAX		18	/* int */
 #define CTL_P1003_1B_MQ_OPEN_MAX		19	/* int */
 #define CTL_P1003_1B_PAGESIZE			20	/* int */
 #define CTL_P1003_1B_RTSIG_MAX			21	/* int */
 #define CTL_P1003_1B_SEM_NSEMS_MAX		22	/* int */
 #define CTL_P1003_1B_SEM_VALUE_MAX		23	/* int */
 #define CTL_P1003_1B_SIGQUEUE_MAX		24	/* int */
 #define CTL_P1003_1B_TIMER_MAX			25	/* int */
 
 #define CTL_P1003_1B_MAXID		26
 
 #define	CTL_P1003_1B_NAMES { \
 	{ 0, 0 }, \
 	{ "asynchronous_io", CTLTYPE_INT }, \
 	{ "mapped_files", CTLTYPE_INT }, \
 	{ "memlock", CTLTYPE_INT }, \
 	{ "memlock_range", CTLTYPE_INT }, \
 	{ "memory_protection", CTLTYPE_INT }, \
 	{ "message_passing", CTLTYPE_INT }, \
 	{ "prioritized_io", CTLTYPE_INT }, \
 	{ "priority_scheduling", CTLTYPE_INT }, \
 	{ "realtime_signals", CTLTYPE_INT }, \
 	{ "semaphores", CTLTYPE_INT }, \
 	{ "fsync", CTLTYPE_INT }, \
 	{ "shared_memory_objects", CTLTYPE_INT }, \
 	{ "synchronized_io", CTLTYPE_INT }, \
 	{ "timers", CTLTYPE_INT }, \
 	{ "aio_listio_max", CTLTYPE_INT }, \
 	{ "aio_max", CTLTYPE_INT }, \
 	{ "aio_prio_delta_max", CTLTYPE_INT }, \
 	{ "delaytimer_max", CTLTYPE_INT }, \
 	{ "mq_open_max", CTLTYPE_INT }, \
 	{ "pagesize", CTLTYPE_INT }, \
 	{ "rtsig_max", CTLTYPE_INT }, \
 	{ "nsems_max", CTLTYPE_INT }, \
 	{ "sem_value_max", CTLTYPE_INT }, \
 	{ "sigqueue_max", CTLTYPE_INT }, \
 	{ "timer_max", CTLTYPE_INT }, \
 }
 
 #ifdef _KERNEL
 
 /*
  * Declare some common oids.
  */
 extern struct sysctl_oid_list sysctl__children;
 SYSCTL_DECL(_kern);
 SYSCTL_DECL(_kern_features);
 SYSCTL_DECL(_kern_ipc);
 SYSCTL_DECL(_kern_proc);
 SYSCTL_DECL(_kern_sched);
 SYSCTL_DECL(_kern_sched_stats);
 SYSCTL_DECL(_sysctl);
 SYSCTL_DECL(_vm);
 SYSCTL_DECL(_vm_stats);
 SYSCTL_DECL(_vm_stats_misc);
 SYSCTL_DECL(_vfs);
 SYSCTL_DECL(_net);
 SYSCTL_DECL(_debug);
 SYSCTL_DECL(_debug_sizeof);
 SYSCTL_DECL(_dev);
 SYSCTL_DECL(_hw);
 SYSCTL_DECL(_hw_bus);
 SYSCTL_DECL(_hw_bus_devices);
 SYSCTL_DECL(_hw_bus_info);
 SYSCTL_DECL(_machdep);
 SYSCTL_DECL(_user);
 SYSCTL_DECL(_compat);
 SYSCTL_DECL(_regression);
 SYSCTL_DECL(_security);
 SYSCTL_DECL(_security_bsd);
 
 extern char	machine[];
 extern char	osrelease[];
 extern char	ostype[];
 extern char	kern_ident[];
 
 /* Dynamic oid handling */
 struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist,
 		struct sysctl_oid_list *parent, int nbr, const char *name,
 		int kind, void *arg1, int arg2,
 		int (*handler) (SYSCTL_HANDLER_ARGS),
 		const char *fmt, const char *descr);
 void	sysctl_rename_oid(struct sysctl_oid *oidp, const char *name);
 int	sysctl_move_oid(struct sysctl_oid *oidp,
 		struct sysctl_oid_list *parent);
 int	sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse);
 int	sysctl_ctx_init(struct sysctl_ctx_list *clist);
 int	sysctl_ctx_free(struct sysctl_ctx_list *clist);
 struct	sysctl_ctx_entry *sysctl_ctx_entry_add(struct sysctl_ctx_list *clist,
 		struct sysctl_oid *oidp);
 struct	sysctl_ctx_entry *sysctl_ctx_entry_find(struct sysctl_ctx_list *clist,
 		struct sysctl_oid *oidp);
 int	sysctl_ctx_entry_del(struct sysctl_ctx_list *clist,
 		struct sysctl_oid *oidp);
 
 int	kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
 		      size_t *oldlenp, void *new, size_t newlen,
 		      size_t *retval, int flags);
 int	kernel_sysctlbyname(struct thread *td, char *name,
 		void *old, size_t *oldlenp, void *new, size_t newlen,
 		size_t *retval, int flags);
 int	userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
 			size_t *oldlenp, int inkernel, void *new, size_t newlen,
 			size_t *retval, int flags);
 int	sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
 			int *nindx, struct sysctl_req *req);
 void	sysctl_lock(void);
 void	sysctl_unlock(void);
 int	sysctl_wire_old_buffer(struct sysctl_req *req, size_t len);
 
+struct sbuf;
+struct sbuf	*sbuf_new_for_sysctl(struct sbuf *, char *, int,
+		    struct sysctl_req *);
 #else	/* !_KERNEL */
 #include <sys/cdefs.h>
 
 __BEGIN_DECLS
 int	sysctl(const int *, u_int, void *, size_t *, const void *, size_t);
 int	sysctlbyname(const char *, void *, size_t *, const void *, size_t);
 int	sysctlnametomib(const char *, int *, size_t *);
 __END_DECLS
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_SYSCTL_H_ */
Index: head/sys/vm/uma_core.c
===================================================================
--- head/sys/vm/uma_core.c	(revision 212369)
+++ head/sys/vm/uma_core.c	(revision 212370)
@@ -1,3334 +1,3298 @@
 /*-
  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * uma_core.c  Implementation of the Universal Memory allocator
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
  *
  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  * are well known.
  *
  */
 
 /*
  * TODO:
  *	- Improve memory usage for large allocations
  *	- Investigate cache size adjustments
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* I should really use ktr.. */
 /*
 #define UMA_DEBUG 1
 #define UMA_DEBUG_ALLOC 1
 #define UMA_DEBUG_ALLOC_1 1
 */
 
 #include "opt_ddb.h"
 #include "opt_param.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #include <machine/vmparam.h>
 
 #include <ddb/ddb.h>
 
 /*
  * This is the zone and keg from which all zones are spawned.  The idea is that
  * even the zone & keg heads are allocated from the allocator, so we use the
  * bss section to bootstrap us.
  */
 static struct uma_keg masterkeg;
 static struct uma_zone masterzone_k;
 static struct uma_zone masterzone_z;
 static uma_zone_t kegs = &masterzone_k;
 static uma_zone_t zones = &masterzone_z;
 
 /* This is the zone from which all of uma_slab_t's are allocated. */
 static uma_zone_t slabzone;
 static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
  * prior to malloc coming up.
  */
 static uma_zone_t hashzone;
 
 /* The boot-time adjusted value for cache line alignment. */
 static int uma_align_cache = 64 - 1;
 
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 
 /*
  * Are we allowed to allocate buckets?
  */
 static int bucketdisable = 1;
 
 /* Linked list of all kegs in the system */
 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 
 /* This mutex protects the keg list */
 static struct mtx uma_mtx;
 
 /* Linked list of boot time pages */
 static LIST_HEAD(,uma_slab) uma_boot_pages =
     LIST_HEAD_INITIALIZER(uma_boot_pages);
 
 /* This mutex protects the boot time pages list */
 static struct mtx uma_boot_pages_mtx;
 
 /* Is the VM done starting up? */
 static int booted = 0;
 
 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
 static u_int uma_max_ipers;
 static u_int uma_max_ipers_ref;
 
 /*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
 static struct callout uma_callout;
 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
 
 /*
  * This structure is passed as the zone ctor arg so that I don't have to create
  * a special allocation function just for zones.
  */
 struct uma_zctor_args {
 	char *name;
 	size_t size;
 	uma_ctor ctor;
 	uma_dtor dtor;
 	uma_init uminit;
 	uma_fini fini;
 	uma_keg_t keg;
 	int align;
 	u_int32_t flags;
 };
 
 /*
  * Argument bundle for keg construction.  The members mirror, in order, the
  * parameters of uma_kcreate(): owning zone, item size, init/fini callbacks,
  * alignment and flags.
  * NOTE(review): presumably handed to keg_ctor() as its constructor argument,
  * in the same way uma_zctor_args is used for zones -- confirm at the caller.
  */
 struct uma_kctor_args {
 	uma_zone_t zone;
 	size_t size;
 	uma_init uminit;
 	uma_fini fini;
 	int align;
 	u_int32_t flags;
 };
 
 /*
  * One size class of bucket.  bucket_init() creates a zone for each class
  * and bucket_zone_lookup() picks a class for a requested entry count.
  */
 struct uma_bucket_zone {
 	uma_zone_t	ubz_zone;	/* Zone the buckets come from. */
 	char		*ubz_name;	/* Name given to that zone. */
 	int		ubz_entries;	/* Item pointers per bucket. */
 };
 
 /* Largest number of entries any bucket class holds. */
 #define	BUCKET_MAX	128
 
 /*
  * The available bucket classes, in increasing size.  The table is
  * terminated by an entry whose ubz_entries is 0; the ubz_zone members are
  * filled in by bucket_init().
  */
 struct uma_bucket_zone bucket_zones[] = {
 	{ NULL, "16 Bucket", 16 },
 	{ NULL, "32 Bucket", 32 },
 	{ NULL, "64 Bucket", 64 },
 	{ NULL, "128 Bucket", 128 },
 	{ NULL, NULL, 0}
 };
 
 #define	BUCKET_SHIFT	4
 #define	BUCKET_ZONES	((BUCKET_MAX >> BUCKET_SHIFT) + 1)
 
 /*
  * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
  * of approximately the right size.
  */
 static uint8_t bucket_size[BUCKET_ZONES];
 
 /*
  * Flags and enumerations to be passed to internal functions.
  */
 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
 
 #define	ZFREE_STATFAIL	0x00000001	/* Update zone failure statistic. */
 #define	ZFREE_STATFREE	0x00000002	/* Update zone free statistic. */
 
 /* Prototypes.. */
 
 static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
 static void page_free(void *, int, u_int8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static int zero_init(void *, int, int);
 static void keg_small_init(uma_keg_t keg);
 static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
 static int hash_alloc(struct uma_hash *);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
     int);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(int, int);
 static void bucket_free(uma_bucket_t);
 static void bucket_zone_drain(void);
 static int zone_alloc_bucket(uma_zone_t zone, int flags);
 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
 static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, u_int32_t flags);
 static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
 static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 /*
  * Decide whether buckets may currently be allocated.  Buckets are disabled
  * while the free page count is below the free page minimum and enabled
  * otherwise.
  */
 static void
 bucket_enable(void)
 {
 
 	bucketdisable = (cnt.v_free_count < cnt.v_free_min) ? 1 : 0;
 }
 
 /*
  * Create the bucket zones and fill in the bucket_size[] lookup table.
  *
  * Each bucket_zones[] entry, up to the terminator whose ubz_entries is 0,
  * gets a zone whose item size is the struct uma_bucket header rounded up to
  * pointer size plus room for ubz_entries item pointers.  bucket_size[] has
  * one slot per (1 << BUCKET_SHIFT) entries; every slot up to and including
  * a zone's entry count that is not yet claimed is pointed at that zone.
  */
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *ubz;
 	int size;
 	int slot;
 	int zidx;
 
 	slot = 0;
 	for (zidx = 0; bucket_zones[zidx].ubz_entries != 0; zidx++) {
 		ubz = &bucket_zones[zidx];
 
 		/* Header first, then the array of item pointers. */
 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 		    UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
 
 		/* slot is never reset, so each zone continues after the last. */
 		while (slot <= ubz->ubz_entries) {
 			bucket_size[slot >> BUCKET_SHIFT] = zidx;
 			slot += (1 << BUCKET_SHIFT);
 		}
 	}
 }
 
 /*
  * Map a requested number of bucket entries to the bucket zone that should
  * satisfy it.  The request is rounded up to a whole number of
  * (1 << BUCKET_SHIFT) sized steps and that step count indexes bucket_size[].
  */
 static struct uma_bucket_zone *
 bucket_zone_lookup(int entries)
 {
 
 	return (&bucket_zones[bucket_size[howmany(entries,
 	    1 << BUCKET_SHIFT)]]);
 }
 
 /*
  * Allocate an empty bucket able to hold at least 'entries' items.
  *
  * Returns NULL when bucket allocation is disabled or when the underlying
  * zone allocation fails.  On success the bucket's count is 0 and its
  * capacity is that of the bucket zone it came from.
  */
 static uma_bucket_t
 bucket_alloc(int entries, int bflags)
 {
 	struct uma_bucket_zone *ubz;
 	uma_bucket_t bucket;
 
 	/*
 	 * This is to stop us from allocating per cpu buckets while we're
 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
 	 * boot pages.  This also prevents us from allocating buckets in
 	 * low memory situations.
 	 */
 	if (bucketdisable)
 		return (NULL);
 
 	ubz = bucket_zone_lookup(entries);
 	bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
 	if (bucket == NULL)
 		return (NULL);
 
 #ifdef INVARIANTS
 	bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 #endif
 	bucket->ub_cnt = 0;
 	bucket->ub_entries = ubz->ubz_entries;
 
 	return (bucket);
 }
 
 /*
  * Return a bucket to the bucket zone that matches its capacity.
  */
 static void
 bucket_free(uma_bucket_t bucket)
 {
 
 	zone_free_item(bucket_zone_lookup(bucket->ub_entries)->ubz_zone,
 	    bucket, NULL, SKIP_NONE, ZFREE_STATFREE);
 }
 
 static void
 bucket_zone_drain(void)
 {
 	struct uma_bucket_zone *ubz;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 		zone_drain(ubz->ubz_zone);
 }
 
 /*
  * Return the keg referenced by the first link on the zone's keg list.
  * The list is not checked for being empty.
  */
 static inline uma_keg_t
 zone_first_keg(uma_zone_t zone)
 {
 	uma_klink_t first;
 
 	first = LIST_FIRST(&zone->uz_kegs);
 	return (first->kl_keg);
 }
 
 /*
  * Invoke kegfn once for each keg linked to the zone, in list order.
  */
 static void
 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 {
 	uma_klink_t link;
 
 	for (link = LIST_FIRST(&zone->uz_kegs); link != NULL;
 	    link = LIST_NEXT(link, kl_link))
 		kegfn(link->kl_keg);
 }
 
 /*
  * Routine called by timeout which is used to fire off some time interval
  * based calculations.  (stats, hash size, etc.)
  *
  * It re-evaluates whether buckets may be allocated, runs zone_timeout() on
  * every zone, and then re-arms itself to run again UMA_TIMEOUT seconds
  * later.
  *
  * Arguments:
  *	unused  Callout argument; not used.
  *
  * Returns:
  *	Nothing
  */
 static void
 uma_timeout(void *unused)
 {
 	bucket_enable();
 	zone_foreach(zone_timeout);
 
 	/* Reschedule this event */
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 }
 
 /*
  * Routine to perform timeout driven calculations for a single keg.  In its
  * current form this only grows the keg's slab hash table when it has become
  * too small; no statistics are aggregated here.
  *
  * The keg lock is taken on entry and released before returning.
  *
  *  Returns nothing.
  */
 static void
 keg_timeout(uma_keg_t keg)
 {
 
 	KEG_LOCK(keg);
 	/*
 	 * Expand the keg hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if (keg->uk_flags & UMA_ZONE_HASH &&
 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
 
 		/*
 		 * This is so involved because allocating and freeing
 		 * while the keg lock is held will lead to deadlock.
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
 		/* Copy the current hash so hash_alloc() sees the old size. */
 		newhash = keg->uk_hash;
 		KEG_UNLOCK(keg);
 		ret = hash_alloc(&newhash);
 		KEG_LOCK(keg);
 		if (ret) {
 			/*
 			 * hash_expand() refuses if the live table is no longer
 			 * smaller than the new one; in that case the new table
 			 * is the one to discard.
 			 */
 			if (hash_expand(&keg->uk_hash, &newhash)) {
 				oldhash = keg->uk_hash;
 				keg->uk_hash = newhash;
 			} else
 				oldhash = newhash;
 
 			/* Free the losing table with the keg lock dropped. */
 			KEG_UNLOCK(keg);
 			hash_free(&oldhash);
 			KEG_LOCK(keg);
 		}
 	}
 	KEG_UNLOCK(keg);
 }
 
 /*
  * Periodic per-zone work: run keg_timeout() on each of the zone's kegs.
  */
 static void
 zone_timeout(uma_zone_t zone)
 {
 
 	zone_foreach_keg(zone, keg_timeout);
 }
 
 /*
  * Allocate and zero fill the next sized hash table from the appropriate
  * backing store.
  *
  * A hash with no previous size gets an initial table of UMA_HASH_SIZE_INIT
  * entries from hashzone; otherwise the size is doubled and the table comes
  * from malloc(9) without sleeping.
  *
  * Arguments:
  *	hash  A new hash structure with the old hash size in uh_hashsize
  *
  * Returns:
  *	1 on success and 0 on failure.
  */
 static int
 hash_alloc(struct uma_hash *hash)
 {
 	int nbytes;
 	int prevsize;
 
 	prevsize = hash->uh_hashsize;
 
 	if (prevsize == 0) {
 		/* First table for this hash. */
 		nbytes = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 		    M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	} else {
 		/* Grow by doubling the previous size. */
 		hash->uh_hashsize = prevsize * 2;
 		nbytes = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = (struct slabhead *)malloc(nbytes,
 		    M_UMAHASH, M_NOWAIT);
 	}
 
 	if (hash->uh_slab_hash == NULL)
 		return (0);
 
 	bzero(hash->uh_slab_hash, nbytes);
 	hash->uh_hashmask = hash->uh_hashsize - 1;
 	return (1);
 }
 
 /*
  * Expands the hash table for HASH zones.  This is done from zone_timeout
  * to reduce collisions.  This must not be done in the regular allocation
  * path, otherwise, we can recurse on the vm while allocating pages.
  *
  * Arguments:
  *	oldhash  The hash you want to expand
  *	newhash  The hash structure for the new table
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  */
 static int
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	uma_slab_t slab;
 	int hval;
 	int i;
 
 	if (!newhash->uh_slab_hash)
 		return (0);
 
 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 		return (0);
 
 	/*
 	 * I need to investigate hash algorithms for resizing without a
 	 * full rehash.
 	 */
 
 	for (i = 0; i < oldhash->uh_hashsize; i++)
 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 			hval = UMA_HASH(newhash, slab->us_data);
 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 			    slab, us_hlink);
 		}
 
 	return (1);
 }
 
 /*
  * Free a hash table to the appropriate backing store.
  *
  * A table of UMA_HASH_SIZE_INIT entries is returned to hashzone; any other
  * size is released with free(9).  A hash with no table is ignored.
  *
  * Arguments:
  *	hash  The hash whose table (uh_slab_hash, sized by uh_hashsize) is
  *	      being freed
  *
  * Returns:
  *	Nothing
  */
 static void
 hash_free(struct uma_hash *hash)
 {
 
 	if (hash->uh_slab_hash == NULL)
 		return;
 
 	if (hash->uh_hashsize != UMA_HASH_SIZE_INIT) {
 		free(hash->uh_slab_hash, M_UMAHASH);
 		return;
 	}
 
 	zone_free_item(hashzone,
 	    hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
 }
 
 /*
  * Frees all outstanding items in a bucket
  *
  * Items are released from the last filled slot downwards, skipping the
  * destructor, until the bucket's count reaches zero.  The bucket itself is
  * not freed.  A NULL bucket is ignored.
  *
  * Arguments:
  *	zone   The zone to free to, must be unlocked.
  *	bucket The free/alloc bucket with items, cpu queue must be locked.
  *
  * Returns:
  *	Nothing
  */
 
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
 	void *item;
 
 	if (bucket == NULL)
 		return;
 
 	while (bucket->ub_cnt > 0) {
 		item = bucket->ub_bucket[--bucket->ub_cnt];
 #ifdef INVARIANTS
 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
 		KASSERT(item != NULL,
 		    ("bucket_drain: botched ptr, item is NULL"));
 #endif
 		zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
 	}
 }
 
 /*
  * Drains the per cpu caches for a zone.
  *
  * For every CPU the items held in the alloc and free buckets are released,
  * the buckets themselves are freed and the cache pointers are cleared.
  * Afterwards the zone's own bucket lists are drained with the zone locked.
  *
  * NOTE: This may only be called while the zone is being torn down, and not
  * during normal operation.  This is necessary in order that we do not have
  * to migrate CPUs to drain the per-CPU caches.
  *
  * Arguments:
  *	zone     The zone to drain, must be unlocked.
  *
  * Returns:
  *	Nothing
  */
 static void
 cache_drain(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	int cpu;
 
 	/*
 	 * XXX: It is safe to not lock the per-CPU caches, because we're
 	 * tearing down the zone anyway.  I.e., there will be no further use
 	 * of the caches at this point.
 	 *
 	 * XXX: It would be good to be able to assert that the zone is being
 	 * torn down to prevent improper use of cache_drain().
 	 *
 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
 	 * it is used elsewhere.  Should the tear-down path be made special
 	 * there in some form?
 	 */
 	CPU_FOREACH(cpu) {
 		cache = &zone->uz_cpu[cpu];
 		/* Release the cached items first, then the buckets. */
 		bucket_drain(zone, cache->uc_allocbucket);
 		bucket_drain(zone, cache->uc_freebucket);
 		if (cache->uc_allocbucket != NULL)
 			bucket_free(cache->uc_allocbucket);
 		if (cache->uc_freebucket != NULL)
 			bucket_free(cache->uc_freebucket);
 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
 	}
 	ZONE_LOCK(zone);
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Drain the cached buckets from a zone.  Expects a locked zone on entry and
  * returns with it locked, although the lock is dropped while each full
  * bucket is being emptied.
  */
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
 	uma_bucket_t b;
 
 	/*
 	 * Full buckets: unlink under the zone lock, then drop the lock while
 	 * the items and the bucket itself are freed.
 	 */
 	for (;;) {
 		b = LIST_FIRST(&zone->uz_full_bucket);
 		if (b == NULL)
 			break;
 		LIST_REMOVE(b, ub_link);
 		ZONE_UNLOCK(zone);
 		bucket_drain(zone, b);
 		bucket_free(b);
 		ZONE_LOCK(zone);
 	}
 
 	/* Empty buckets hold no items, so they are simply released. */
 	for (;;) {
 		b = LIST_FIRST(&zone->uz_free_bucket);
 		if (b == NULL)
 			break;
 		LIST_REMOVE(b, ub_link);
 		bucket_free(b);
 	}
 }
 
 /*
  * Frees pages from a keg back to the system.  This is done on demand from
  * the pageout daemon.
  *
  * Completely free slabs are unlinked while the keg lock is held and are
  * collected on a private list; the item fini callbacks and the actual page
  * release run afterwards, without the lock.
  *
  * Returns nothing.
  */
 static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
 	uma_slab_t slab;
 	uma_slab_t next;
 	vm_object_t obj;
 	u_int8_t flags;
 	u_int8_t *mem;
 	int i;
 
 	/*
 	 * Kegs that must never give pages back, or that have no free
 	 * routine, are left alone.
 	 */
 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 		return;
 
 #ifdef UMA_DEBUG
 	printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
 #endif
 	KEG_LOCK(keg);
 	if (keg->uk_free != 0) {
 		for (slab = LIST_FIRST(&keg->uk_free_slab); slab != NULL;
 		    slab = next) {
 			/* Fetch the successor before the slab is unlinked. */
 			next = LIST_NEXT(slab, us_link);
 
 			/* Boot-time slabs have nowhere to be freed to. */
 			if (slab->us_flags & UMA_SLAB_BOOT)
 				continue;
 
 			LIST_REMOVE(slab, us_link);
 			keg->uk_pages -= keg->uk_ppera;
 			keg->uk_free -= keg->uk_ipers;
 
 			if (keg->uk_flags & UMA_ZONE_HASH)
 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
 				    slab->us_data);
 
 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 		}
 	}
 	KEG_UNLOCK(keg);
 
 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 
 		if (keg->uk_fini) {
 			for (i = 0; i < keg->uk_ipers; i++)
 				keg->uk_fini(
 				    slab->us_data + (keg->uk_rsize * i),
 				    keg->uk_size);
 		}
 
 		/*
 		 * Copy out what is still needed: an off-page header is
 		 * freed below, before the data pages are.
 		 */
 		flags = slab->us_flags;
 		mem = slab->us_data;
 
 		if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 			/* Re-point every page at the object for its flags. */
 			obj = NULL;
 			if (flags & UMA_SLAB_KMEM)
 				obj = kmem_object;
 			else if (flags & UMA_SLAB_KERNEL)
 				obj = kernel_object;
 			for (i = 0; i < keg->uk_ppera; i++)
 				vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
 				    obj);
 		}
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL,
 			    SKIP_NONE, ZFREE_STATFREE);
 #ifdef UMA_DEBUG
 		printf("%s: Returning %d bytes.\n",
 		    keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
 #endif
 		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
 	}
 }
 
 /*
  * Drain a zone's cached buckets and then every keg attached to it.
  *
  * Arguments:
  *	zone    The zone to drain, must be unlocked.
  *	waitok  M_NOWAIT to give up immediately when another drain of this
  *		zone is already running; any other value sleeps until that
  *		drain has finished.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_drain_wait(uma_zone_t zone, int waitok)
 {
 
 	/*
 	 * UMA_ZFLAG_DRAINING interlocks with zone_dtor() so the locks can
 	 * be released as we go.  Only dtor() should do a WAITOK call since
 	 * it is the only caller that knows the structure will still be
 	 * available when it wakes up.
 	 *
 	 * NOTE(review): the sleep path drops and re-takes uma_mtx, which
 	 * presumes the caller holds it -- confirm for the M_WAITOK caller.
 	 */
 	ZONE_LOCK(zone);
 	while ((zone->uz_flags & UMA_ZFLAG_DRAINING) != 0) {
 		if (waitok == M_NOWAIT) {
 			ZONE_UNLOCK(zone);
 			return;
 		}
 		mtx_unlock(&uma_mtx);
 		msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
 		mtx_lock(&uma_mtx);
 	}
 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
 
 	/*
 	 * The DRAINING flag protects us from being freed while
 	 * we're running.  Normally the uma_mtx would protect us but we
 	 * must be able to release and acquire the right lock for each keg.
 	 */
 	zone_foreach_keg(zone, &keg_drain);
 
 	/* Drop the interlock and wake anybody sleeping on it. */
 	ZONE_LOCK(zone);
 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 	wakeup(zone);
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Non-blocking drain of a zone.  This calls zone_drain_wait() with
  * M_NOWAIT, so the request is dropped without sleeping when another drain
  * of the same zone is already in progress.
  */
 void
 zone_drain(uma_zone_t zone)
 {
 
 	zone_drain_wait(zone, M_NOWAIT);
 }
 
 /*
  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
  *
  * The keg lock must be held on entry.  It is dropped while memory is
  * obtained and the items are initialized, and it is held again on every
  * return path.
  *
  * Arguments:
  *	keg   The keg that receives the slab
  *	zone  Passed through to the keg's allocator (legacy argument)
  *	wait  Shall we wait?
  *
  * Returns:
  *	The slab that was allocated or NULL if there is no memory and the
  *	caller specified M_NOWAIT.  NULL is also returned when the keg's init
  *	routine fails for one of the items.
  */
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 {
 	uma_slabrefcnt_t slabref;
 	uma_alloc allocf;
 	uma_slab_t slab;
 	vm_object_t obj;
 	u_int8_t *mem;
 	u_int8_t flags;
 	int i;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 
 #ifdef UMA_DEBUG
 	printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
 #endif
 	/* Sample the allocator while the keg is still locked. */
 	allocf = keg->uk_allocf;
 	KEG_UNLOCK(keg);
 
 	/* Off-page kegs need a separately allocated slab header. */
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
 		if (slab == NULL)
 			goto fail;
 	}
 
 	/*
 	 * This reproduces the old vm_zone behavior of zero filling pages the
 	 * first time they are added to a zone.
 	 *
 	 * Malloced items are zeroed in uma_zalloc.
 	 */
 	if (keg->uk_flags & UMA_ZONE_MALLOC)
 		wait &= ~M_ZERO;
 	else
 		wait |= M_ZERO;
 
 	/* zone is passed for legacy reasons. */
 	mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL,
 			    SKIP_NONE, ZFREE_STATFREE);
 		goto fail;
 	}
 
 	/* An in-page header lives inside the memory just obtained. */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0)
 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
 
 	if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 		for (i = 0; i < keg->uk_ppera; i++)
 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
 	}
 
 	slab->us_keg = keg;
 	slab->us_data = mem;
 	slab->us_freecount = keg->uk_ipers;
 	slab->us_firstfree = 0;
 	slab->us_flags = flags;
 
 	/* Chain the free list: entry i points at entry i + 1. */
 	if ((keg->uk_flags & UMA_ZONE_REFCNT) == 0) {
 		for (i = 0; i < keg->uk_ipers; i++)
 			slab->us_freelist[i].us_item = i+1;
 	} else {
 		slabref = (uma_slabrefcnt_t)slab;
 		for (i = 0; i < keg->uk_ipers; i++) {
 			slabref->us_freelist[i].us_refcnt = 0;
 			slabref->us_freelist[i].us_item = i+1;
 		}
 	}
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++) {
 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
 			    keg->uk_size, wait) != 0)
 				break;
 		}
 		if (i != keg->uk_ipers) {
 			/*
 			 * Item i failed to initialize: run fini on the
 			 * items before it, newest first, then undo the
 			 * whole allocation.
 			 */
 			if (keg->uk_fini != NULL) {
 				while (--i >= 0)
 					keg->uk_fini(slab->us_data +
 					    (keg->uk_rsize * i),
 					    keg->uk_size);
 			}
 			if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 				obj = NULL;
 				if (flags & UMA_SLAB_KMEM)
 					obj = kmem_object;
 				else if (flags & UMA_SLAB_KERNEL)
 					obj = kernel_object;
 				for (i = 0; i < keg->uk_ppera; i++)
 					vsetobj((vm_offset_t)mem +
 					    (i * PAGE_SIZE), obj);
 			}
 			if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 				zone_free_item(keg->uk_slabzone, slab,
 				    NULL, SKIP_NONE, ZFREE_STATFREE);
 			keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
 			    flags);
 			goto fail;
 		}
 	}
 	KEG_LOCK(keg);
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 
 	keg->uk_pages += keg->uk_ppera;
 	keg->uk_free += keg->uk_ipers;
 
 	return (slab);
 
 fail:
 	/* Callers expect the keg lock to be held even on failure. */
 	KEG_LOCK(keg);
 	return (NULL);
 }
 
 /*
  * This function is intended to be used early on in place of page_alloc() so
  * that we may use the boot time page cache to satisfy allocations before
  * the VM is ready.
  *
  * Once the boot page list is empty and booted is set, the zone's first keg
  * is switched over to its real allocator, which then serves this request.
  */
 static void *
 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 {
 	uma_slab_t bootslab;
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
 
 	/*
 	 * Take a page from the small startup cache if one remains.
 	 */
 	mtx_lock(&uma_boot_pages_mtx);
 	bootslab = LIST_FIRST(&uma_boot_pages);
 	if (bootslab != NULL)
 		LIST_REMOVE(bootslab, us_link);
 	mtx_unlock(&uma_boot_pages_mtx);
 
 	if (bootslab != NULL) {
 		*pflag = bootslab->us_flags;
 		return (bootslab->us_data);
 	}
 
 	/* Out of boot pages before the VM is up: nothing else to use. */
 	if (booted == 0)
 		panic("UMA: Increase vm.boot_pages");
 
 	/*
 	 * Now that we've booted reset these users to their real allocator.
 	 */
 #ifdef UMA_MD_SMALL_ALLOC
 	keg->uk_allocf = uma_small_alloc;
 #else
 	keg->uk_allocf = page_alloc;
 #endif
 	return (keg->uk_allocf(zone, bytes, pflag, wait));
 }
 
 /*
  * Allocates a number of pages from the system
  *
  * Arguments:
  *	zone   Not used by this allocator
  *	bytes  The number of bytes requested
  *	pflag  Set to UMA_SLAB_KMEM for the caller to store with the slab
  *	wait   Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 {
 
 	*pflag = UMA_SLAB_KMEM;
 	return ((void *)kmem_malloc(kmem_map, bytes, wait));
 }
 
 /*
  * Allocates a number of pages from within an object
  *
  * New pages are appended after the last page already in the object of the
  * zone's first keg and are mapped at the matching offset from uk_kva.
  *
  * Arguments:
  *	zone   The zone whose first keg supplies the object and KVA base
  *	bytes  The number of bytes requested
  *	flags  Set to UMA_SLAB_PRIV on return
  *	wait   Not consulted by this allocator
  *
  * Returns:
  *	A pointer to the alloced memory, or NULL if a page could not be
  *	allocated.
  */
 static void *
 obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 {
 	vm_offset_t retkva, zkva;
 	vm_object_t object;
 	uma_keg_t keg;
 	vm_page_t p;
 	int pages, startpages;
 
 	keg = zone_first_keg(zone);
 	object = keg->uk_obj;
 	retkva = 0;
 
 	/*
 	 * This looks a little weird since we're getting one page at a time.
 	 */
 	VM_OBJECT_LOCK(object);
 	p = TAILQ_LAST(&object->memq, pglist);
 	if (p != NULL)
 		pages = p->pindex + 1;
 	else
 		pages = 0;
 	startpages = pages;
 	zkva = keg->uk_kva + pages * PAGE_SIZE;
 
 	while (bytes > 0) {
 		p = vm_page_alloc(object, pages,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
 		if (p == NULL) {
 			/*
 			 * Back out: unmap what this call mapped and free
 			 * the pages it added, last one first.
 			 */
 			if (pages != startpages)
 				pmap_qremove(retkva, pages - startpages);
 			while (pages != startpages) {
 				p = TAILQ_LAST(&object->memq, pglist);
 				vm_page_unwire(p, 0);
 				vm_page_free(p);
 				pages--;
 			}
 			retkva = 0;
 			break;
 		}
 		pmap_qenter(zkva, &p, 1);
 		/* The first page mapped is the address handed back. */
 		if (retkva == 0)
 			retkva = zkva;
 		zkva += PAGE_SIZE;
 		pages++;
 		bytes -= PAGE_SIZE;
 	}
 	VM_OBJECT_UNLOCK(object);
 	*flags = UMA_SLAB_PRIV;
 
 	return ((void *)retkva);
 }
 
 /*
  * Frees a number of pages to the system
  *
  * Arguments:
  *	mem   A pointer to the memory to be freed
  *	size  The size of the memory being freed
  *	flags The original p->us_flags field; it selects the map the memory
  *	      is returned to (UMA_SLAB_KMEM is checked before
  *	      UMA_SLAB_KERNEL).  Panics when neither flag is set.
  *
  * Returns:
  *	Nothing
  */
 static void
 page_free(void *mem, int size, u_int8_t flags)
 {
 	vm_map_t map;
 
 	if ((flags & (UMA_SLAB_KMEM | UMA_SLAB_KERNEL)) == 0)
 		panic("UMA: page_free used with invalid flags %d", flags);
 
 	map = (flags & UMA_SLAB_KMEM) ? kmem_map : kernel_map;
 	kmem_free(map, (vm_offset_t)mem, size);
 }
 
 /*
  * Item initializer that clears the item's memory.
  *
  * Arguments/Returns follow uma_init specifications.  The flags argument is
  * not consulted and the call always reports success (0).
  */
 static int
 zero_init(void *mem, int size, int flags)
 {
 
 	bzero(mem, size);
 
 	return (0);
 }
 
 /*
  * Finish creating a small uma keg.  This calculates the rounded item size,
  * the items per slab, and decides whether the slab header moves off page.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_small_init(uma_keg_t keg)
 {
 	u_int rsize;
 	u_int shsize;
 	u_int wastedspace;
 
 	KASSERT(keg != NULL, ("Keg is null in keg_small_init"));
 
 	/* Real item size: at least the smallest unit, rounded to align. */
 	rsize = keg->uk_size;
 	if (rsize < UMA_SMALLEST_UNIT)
 		rsize = UMA_SMALLEST_UNIT;
 	if (rsize & keg->uk_align)
 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
 
 	keg->uk_ppera = 1;
 	keg->uk_rsize = rsize;
 
 	/* From here on rsize also carries the per-item free list entry. */
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		shsize = sizeof(struct uma_slab_refcnt);
 		rsize += UMA_FRITMREF_SZ;	/* linkage & refcnt */
 	} else {
 		shsize = sizeof(struct uma_slab);
 		rsize += UMA_FRITM_SZ;	/* Account for linkage */
 	}
 
 	keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
 	KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
 	wastedspace = UMA_SLAB_SIZE - (keg->uk_ipers * rsize + shsize);
 
 	/*
 	 * We can't do OFFPAGE if we're internal or if we've been
 	 * asked to not go to the VM for buckets.  If we do this we
 	 * may end up going to the VM (kmem_map) for slabs which we
 	 * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
 	 * result of UMA_ZONE_VM, which clearly forbids it.
 	 */
 	if (keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY))
 		return;
 
 	/*
 	 * Stay with the in-page header unless too much space is wasted
 	 * and an off-page header would actually fit more items.
 	 */
 	if ((wastedspace < UMA_MAX_WASTE) ||
 	    (keg->uk_ipers >= (UMA_SLAB_SIZE / keg->uk_rsize)))
 		return;
 
 	keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
 	KASSERT(keg->uk_ipers <= 255,
 	    ("keg_small_init: keg->uk_ipers too high!"));
 #ifdef UMA_DEBUG
 	printf("UMA decided we need offpage slab headers for "
 	    "keg: %s, calculated wastedspace = %d, "
 	    "maximum wasted space allowed = %d, "
 	    "calculated ipers = %d, "
 	    "new wasted space = %d\n", keg->uk_name, wastedspace,
 	    UMA_MAX_WASTE, keg->uk_ipers,
 	    UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
 #endif
 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
 	/* Without vtoslab the slab has to be found through the hash. */
 	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 /*
  * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
  * more complicated.
  *
  * Each slab holds exactly one item and spans as many UMA_SLAB_SIZE units as
  * the item needs.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_large_init(uma_keg_t keg)
 {
 	int npages;
 
 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
 	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
 
 	/* Round the item size up to whole slab-sized units. */
 	npages = keg->uk_size / UMA_SLAB_SIZE;
 	if ((npages * UMA_SLAB_SIZE) < keg->uk_size)
 		npages++;
 
 	keg->uk_rsize = keg->uk_size;
 	keg->uk_ipers = 1;
 	keg->uk_ppera = npages;
 
 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
 	/* Without vtoslab the slab has to be found through the hash. */
 	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
 /*
  * Finish creating a keg whose items are spread across alignment boundaries
  * (the UMA_ZONE_CACHESPREAD path of keg_ctor()).  This computes the padded
  * item size, the pages per slab and the items per slab, and forces the keg
  * to use off-page slab headers with vtoslab lookups.
  *
  * Arguments
  *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
 keg_cachespread_init(uma_keg_t keg)
 {
 	int alignsize;
 	int trailer;
 	int pages;
 	int rsize;
 
 	alignsize = keg->uk_align + 1;
 	rsize = keg->uk_size;
 	/*
 	 * We want one item to start on every align boundary in a page.  To
 	 * do this we will span pages.  We will also extend the item by the
 	 * size of align if it is an even multiple of align.  Otherwise, it
 	 * would fall on the same boundary every time.
 	 */
 	if (rsize & keg->uk_align)
 		rsize = (rsize & ~keg->uk_align) + alignsize;
 	if ((rsize & alignsize) == 0)
 		rsize += alignsize;
 	/* Padding after the last item need not fit inside the slab. */
 	trailer = rsize - keg->uk_size;
 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
 	/* Cap a slab at 128KB worth of pages. */
 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
 	keg->uk_rsize = rsize;
 	keg->uk_ppera = pages;
 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
 	/* Report the failing function by its real name. */
 	KASSERT(keg->uk_ipers <= uma_max_ipers,
 	    ("keg_cachespread_init: keg->uk_ipers too high(%d) increase max_ipers",
 	    keg->uk_ipers));
 }
 
 /*
  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
  * the keg onto the global keg list.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_kctor_args
  */
 static int
 keg_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_kctor_args *arg = udata;
 	uma_keg_t keg = mem;
 	uma_zone_t zone;
 
 	bzero(keg, size);
 	keg->uk_size = arg->size;
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
 	keg->uk_free = 0;
 	keg->uk_pages = 0;
 	keg->uk_flags = arg->flags;
 	keg->uk_allocf = page_alloc;
 	keg->uk_freef = page_free;
 	keg->uk_recurse = 0;
 	keg->uk_slabzone = NULL;
 
 	/*
 	 * The master zone is passed to us at keg-creation time.
 	 */
 	zone = arg->zone;
 	keg->uk_name = zone->uz_name;
 
 	if (arg->flags & UMA_ZONE_VM)
 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
 
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
 	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
 	/*
 	 * The +UMA_FRITM_SZ added to uk_size is to account for the
 	 * linkage that is added to the size in keg_small_init().  If
 	 * we don't account for this here then we may end up in
 	 * keg_small_init() with a calculated 'ipers' of 0.
 	 */
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
 			keg_cachespread_init(keg);
 		else if ((keg->uk_size+UMA_FRITMREF_SZ) >
 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
 	} else {
 		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
 			keg_cachespread_init(keg);
 		else if ((keg->uk_size+UMA_FRITM_SZ) >
 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
 	}
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		if (keg->uk_flags & UMA_ZONE_REFCNT)
 			keg->uk_slabzone = slabrefzone;
 		else
 			keg->uk_slabzone = slabzone;
 	}
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
 	 * startup cache until the vm is ready.
 	 */
 	if (keg->uk_ppera == 1) {
 #ifdef UMA_MD_SMALL_ALLOC
 		keg->uk_allocf = uma_small_alloc;
 		keg->uk_freef = uma_small_free;
 #endif
 		if (booted == 0)
 			keg->uk_allocf = startup_alloc;
 	}
 
 	/*
 	 * Initialize keg's lock (shared among zones).
 	 */
 	if (arg->flags & UMA_ZONE_MTXCLASS)
 		KEG_LOCK_INIT(keg, 1);
 	else
 		KEG_LOCK_INIT(keg, 0);
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
 	 * figure out where in each page it goes.  This calculates a right
 	 * justified offset into the memory on an ALIGN_PTR boundary.
 	 */
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
 		u_int totsize;
 
 		/* Size of the slab struct and free list */
 		if (keg->uk_flags & UMA_ZONE_REFCNT)
 			totsize = sizeof(struct uma_slab_refcnt) +
 			    keg->uk_ipers * UMA_FRITMREF_SZ;
 		else
 			totsize = sizeof(struct uma_slab) +
 			    keg->uk_ipers * UMA_FRITM_SZ;
 
 		if (totsize & UMA_ALIGN_PTR)
 			totsize = (totsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
 		keg->uk_pgoff = UMA_SLAB_SIZE - totsize;
 
 		if (keg->uk_flags & UMA_ZONE_REFCNT)
 			totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
 			    + keg->uk_ipers * UMA_FRITMREF_SZ;
 		else
 			totsize = keg->uk_pgoff + sizeof(struct uma_slab)
 			    + keg->uk_ipers * UMA_FRITM_SZ;
 
 		/*
 		 * The only way the following is possible is if with our
 		 * UMA_ALIGN_PTR adjustments we are now bigger than
 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
 		 * mathematically possible for all cases, so we make
 		 * sure here anyway.
 		 */
 		if (totsize > UMA_SLAB_SIZE) {
 			printf("zone %s ipers %d rsize %d size %d\n",
 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
 			    keg->uk_size);
 			panic("UMA slab won't fit.");
 		}
 	}
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		hash_alloc(&keg->uk_hash);
 
 #ifdef UMA_DEBUG
 	printf("UMA: %s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
 	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
 #endif
 
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 
 	mtx_lock(&uma_mtx);
 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
 	mtx_unlock(&uma_mtx);
 	return (0);
 }
 
 /*
  * Zone header ctor.  This initializes all fields, locks, etc.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_zctor_args
  */
 static int
 zone_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_zctor_args *arg = udata;
 	uma_zone_t zone = mem;
 	uma_zone_t z;
 	uma_keg_t keg;
 
 	bzero(zone, size);
 	zone->uz_name = arg->name;
 	zone->uz_ctor = arg->ctor;
 	zone->uz_dtor = arg->dtor;
 	zone->uz_slab = zone_fetch_slab;
 	zone->uz_init = NULL;
 	zone->uz_fini = NULL;
 	zone->uz_allocs = 0;
 	zone->uz_frees = 0;
 	zone->uz_fails = 0;
 	zone->uz_sleeps = 0;
 	zone->uz_fills = zone->uz_count = 0;
 	zone->uz_flags = 0;
 	keg = arg->keg;
 
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
 		zone->uz_lock = &keg->uk_lock;
 		zone->uz_flags |= UMA_ZONE_SECONDARY;
 		mtx_lock(&uma_mtx);
 		ZONE_LOCK(zone);
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
 				LIST_INSERT_AFTER(z, zone, uz_link);
 				break;
 			}
 		}
 		ZONE_UNLOCK(zone);
 		mtx_unlock(&uma_mtx);
 	} else if (keg == NULL) {
 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 		    arg->align, arg->flags)) == NULL)
 			return (ENOMEM);
 	} else {
 		struct uma_kctor_args karg;
 		int error;
 
 		/* We should only be here from uma_startup() */
 		karg.size = arg->size;
 		karg.uminit = arg->uminit;
 		karg.fini = arg->fini;
 		karg.align = arg->align;
 		karg.flags = arg->flags;
 		karg.zone = zone;
 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
 		    flags);
 		if (error)
 			return (error);
 	}
 	/*
 	 * Link in the first keg.
 	 */
 	zone->uz_klink.kl_keg = keg;
 	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
 	zone->uz_lock = &keg->uk_lock;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 
 	/*
 	 * Some internal zones don't have room allocated for the per cpu
 	 * caches.  If we're internal, bail out here.
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 		return (0);
 	}
 
 	if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
 		zone->uz_count = BUCKET_MAX;
 	else if (keg->uk_ipers <= BUCKET_MAX)
 		zone->uz_count = keg->uk_ipers;
 	else
 		zone->uz_count = BUCKET_MAX;
 	return (0);
 }
 
 /*
  * Keg header dtor.  This reports a keg that still holds free items, frees
  * the hash table and destroys the keg lock.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 keg_dtor(void *arg, int size, void *udata)
 {
 	uma_keg_t keg = arg;
 
 	KEG_LOCK(keg);
 	/* Free items left behind mean pages that were never given back. */
 	if (keg->uk_free != 0)
 		printf("Freed UMA keg was not empty (%d items). "
 		    " Lost %d pages of memory.\n",
 		    keg->uk_free, keg->uk_pages);
 	KEG_UNLOCK(keg);
 
 	hash_free(&keg->uk_hash);
 
 	KEG_LOCK_FINI(keg);
 }
 
 /*
  * Zone header dtor.  Drains the zone, unlinks it from its kegs and, for a
  * non-secondary zone, destroys the first keg as well.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
 	uma_zone_t zone = arg;
 	uma_keg_t keg = zone_first_keg(zone);
 	uma_klink_t klink;
 
 	/* Internal zones are not given their per-CPU caches drained. */
 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0)
 		cache_drain(zone);
 
 	mtx_lock(&uma_mtx);
 	LIST_REMOVE(zone, uz_link);
 	mtx_unlock(&uma_mtx);
 
 	/*
 	 * XXX there are some races here where
 	 * the zone can be drained but zone lock
 	 * released and then refilled before we
 	 * remove it... we dont care for now
 	 */
 	zone_drain_wait(zone, M_WAITOK);
 
 	/*
 	 * Unlink all of our kegs.  The first link is embedded in the zone
 	 * itself; every other link is released with free(9).
 	 */
 	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
 		klink->kl_keg = NULL;
 		LIST_REMOVE(klink, kl_link);
 		if (klink != &zone->uz_klink)
 			free(klink, M_TEMP);
 	}
 
 	/*
 	 * We only destroy kegs from non secondary zones.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_SECONDARY) != 0)
 		return;
 
 	mtx_lock(&uma_mtx);
 	LIST_REMOVE(keg, uk_link);
 	mtx_unlock(&uma_mtx);
 	zone_free_item(kegs, keg, NULL, SKIP_NONE,
 	    ZFREE_STATFREE);
 }
 
 /*
  * Traverses every zone in the system and calls a callback
  *
  * The walk goes keg by keg over the global keg list and, for each keg, over
  * the zones linked to it; uma_mtx is held for the entire walk, callbacks
  * included.
  *
  * Arguments:
  *	zfunc  A pointer to a function which accepts a zone
  *		as an argument.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_foreach(void (*zfunc)(uma_zone_t))
 {
 	uma_keg_t k;
 	uma_zone_t z;
 
 	mtx_lock(&uma_mtx);
 	LIST_FOREACH(k, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &k->uk_zones, uz_link) {
 			(*zfunc)(z);
 		}
 	}
 	mtx_unlock(&uma_mtx);
 }
 
 /* Public functions */
 /* See uma.h */
 /*
  * First stage of UMA bootstrap.
  *
  * Sets up the global UMA lock, computes the items-per-slab limits used to
  * size off-page slab headers, hand-constructs the keg and zone header
  * zones, threads the boot pages onto uma_boot_pages, and creates the slab,
  * refcnt-slab and hash zones before initializing the bucket zones.
  *
  * Arguments:
  *	bootmem     Start of the boot page area; it is carved into
  *		    boot_pages chunks of UMA_SLAB_SIZE bytes each
  *	boot_pages  Number of chunks available at bootmem
  */
 void
 uma_startup(void *bootmem, int boot_pages)
 {
 	struct uma_zctor_args args;
 	uma_slab_t slab;
 	u_int slabsize;
 	u_int objsize, totsize, wsize;
 	int i;
 
 #ifdef UMA_DEBUG
 	printf("Creating uma keg headers zone and keg.\n");
 #endif
 	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
 
 	/*
 	 * Figure out the maximum number of items-per-slab we'll have if
 	 * we're using the OFFPAGE slab header to track free items, given
 	 * all possible object sizes and the maximum desired wastage
 	 * (UMA_MAX_WASTE).
 	 *
 	 * We iterate until we find an object size for
 	 * which the calculated wastage in keg_small_init() will be
 	 * enough to warrant OFFPAGE.  Since wastedspace versus objsize
 	 * is an overall increasing see-saw function, we find the smallest
 	 * objsize such that the wastage is always acceptable for objects
 	 * with that objsize or smaller.  Since a smaller objsize always
 	 * generates a larger possible uma_max_ipers, we use this computed
 	 * objsize to calculate the largest ipers possible.  Since the
 	 * ipers calculated for OFFPAGE slab headers is always larger than
 	 * the ipers initially calculated in keg_small_init(), we use
 	 * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
 	 * obtain the maximum ipers possible for offpage slab headers.
 	 *
 	 * It should be noted that ipers versus objsize is an inversely
 	 * proportional function which drops off rather quickly so as
 	 * long as our UMA_MAX_WASTE is such that the objsize we calculate
 	 * falls into the portion of the inverse relation AFTER the steep
 	 * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
 	 *
 	 * Note that we have 8-bits (1 byte) to use as a freelist index
 	 * inside the actual slab header itself and this is enough to
 	 * accommodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
 	 * object with offpage slab header would have ipers =
 	 * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
 	 * 1 greater than what our byte-integer freelist index can
 	 * accommodate, but we know that this situation never occurs as
 	 * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
 	 * that we need to go to offpage slab headers.  Or, if we do,
 	 * then we trap that condition below and panic in the INVARIANTS case.
 	 */
 	/* First pass: limit for plain (non-refcnt) slab headers. */
 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
 	totsize = wsize;
 	objsize = UMA_SMALLEST_UNIT;
 	while (totsize >= wsize) {
 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
 		    (objsize + UMA_FRITM_SZ);
 		totsize *= (UMA_FRITM_SZ + objsize);
 		objsize++;
 	}
 	/* The loop stepped one past the size that ended it. */
 	if (objsize > UMA_SMALLEST_UNIT)
 		objsize--;
 	uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
 
 	/* Second pass: the same computation for refcnt slab headers. */
 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
 	totsize = wsize;
 	objsize = UMA_SMALLEST_UNIT;
 	while (totsize >= wsize) {
 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
 		    (objsize + UMA_FRITMREF_SZ);
 		totsize *= (UMA_FRITMREF_SZ + objsize);
 		objsize++;
 	}
 	if (objsize > UMA_SMALLEST_UNIT)
 		objsize--;
 	uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
 
 	KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
 	    ("uma_startup: calculated uma_max_ipers values too large!"));
 
 #ifdef UMA_DEBUG
 	printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
 	printf("Calculated uma_max_ipers_slab (for OFFPAGE) is %d\n",
 	    uma_max_ipers_ref);
 #endif
 
 	/* "manually" create the initial zone */
 	args.name = "UMA Kegs";
 	args.size = sizeof(struct uma_keg);
 	args.ctor = keg_ctor;
 	args.dtor = keg_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = &masterkeg;
 	args.align = 32 - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	/* The initial zone has no Per cpu queues so it's smaller */
 	zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
 
 #ifdef UMA_DEBUG
 	printf("Filling boot free list.\n");
 #endif
 	/*
 	 * Each boot page carries its own slab header at its start, so
 	 * us_data points back at the header itself.
 	 */
 	for (i = 0; i < boot_pages; i++) {
 		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
 		slab->us_data = (u_int8_t *)slab;
 		slab->us_flags = UMA_SLAB_BOOT;
 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
 	}
 	mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
 
 #ifdef UMA_DEBUG
 	printf("Creating uma zone headers zone and keg.\n");
 #endif
 	/* Items of this zone are zone headers plus one cache per CPU id. */
 	args.name = "UMA Zones";
 	args.size = sizeof(struct uma_zone) +
 	    (sizeof(struct uma_cache) * (mp_maxid + 1));
 	args.ctor = zone_ctor;
 	args.dtor = zone_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = NULL;
 	args.align = 32 - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	/* The initial zone has no Per cpu queues so it's smaller */
 	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
 
 #ifdef UMA_DEBUG
 	printf("Initializing pcpu cache locks.\n");
 #endif
 #ifdef UMA_DEBUG
 	printf("Creating slab and hash zones.\n");
 #endif
 
 	/*
 	 * This is the max number of free list items we'll have with
 	 * offpage slabs.
 	 */
 	slabsize = uma_max_ipers * UMA_FRITM_SZ;
 	slabsize += sizeof(struct uma_slab);
 
 	/* Now make a zone for slab headers */
 	slabzone = uma_zcreate("UMA Slabs",
 				slabsize,
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	/*
 	 * We also create a zone for the bigger slabs with reference
 	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
 	 */
 	slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
 	slabsize += sizeof(struct uma_slab_refcnt);
 	slabrefzone = uma_zcreate("UMA RCntSlabs",
 				  slabsize,
 				  NULL, NULL, NULL, NULL,
 				  UMA_ALIGN_PTR,
 				  UMA_ZFLAG_INTERNAL);
 
 	/* Items of this zone are hash arrays of the initial size. */
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	bucket_init();
 
 #if defined(UMA_MD_SMALL_ALLOC) && !defined(UMA_MD_SMALL_ALLOC_NEEDS_VM)
 	booted = 1;
 #endif
 
 #ifdef UMA_DEBUG
 	printf("UMA startup complete.\n");
 #endif
 }
 
 /* see uma.h */
 /*
  * Second stage of UMA bootstrap: marks UMA as booted, which startup_alloc()
  * and keg_ctor() test before falling back to the boot page cache, and then
  * calls bucket_enable().
  */
 void
 uma_startup2(void)
 {
 	booted = 1;
 	bucket_enable();
 #ifdef UMA_DEBUG
 	printf("UMA startup2 complete.\n");
 #endif
 }
 
 /*
  * Initialize our callout handle and arm it so that uma_timeout() is called
  * after UMA_TIMEOUT * hz ticks.
  */
 
 static void
 uma_startup3(void)
 {
 #ifdef UMA_DEBUG
 	printf("Starting callout.\n");
 #endif
 	callout_init(&uma_callout, CALLOUT_MPSAFE);
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 #ifdef UMA_DEBUG
 	printf("UMA startup3 complete.\n");
 #endif
 }
 
 /*
  * Create a keg for 'zone' by allocating an item from the kegs zone; the
  * arguments are bundled into a uma_kctor_args for the keg constructor.
  * An alignment of UMA_ALIGN_CACHE is replaced by the current value of
  * uma_align_cache.
  */
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, u_int32_t flags)
 {
 	struct uma_kctor_args args;
 
 	if (align == UMA_ALIGN_CACHE)
 		args.align = uma_align_cache;
 	else
 		args.align = align;
 	args.zone = zone;
 	args.size = size;
 	args.uminit = uminit;
 	args.fini = fini;
 	args.flags = flags;
 
 	return (zone_alloc_item(kegs, &args, M_WAITOK));
 }
 
 /* See uma.h */
 /*
  * Record the alignment that uma_kcreate() substitutes for UMA_ALIGN_CACHE.
  * Passing UMA_ALIGN_CACHE itself leaves the current value unchanged.
  */
 void
 uma_set_align(int align)
 {
 
 	if (align == UMA_ALIGN_CACHE)
 		return;
 
 	uma_align_cache = align;
 }
 
 /* See uma.h */
 /*
  * Create a zone with a keg of its own: the arguments are packed into a
  * uma_zctor_args with no keg supplied and an item is allocated from the
  * zones zone.
  */
 uma_zone_t
 uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 		uma_init uminit, uma_fini fini, int align, u_int32_t flags)
 
 {
 	struct uma_zctor_args args;
 
 	/* This stuff is essential for the zone ctor */
 	args.keg = NULL;
 	args.name = name;
 	args.size = size;
 	args.align = align;
 	args.flags = flags;
 	args.ctor = ctor;
 	args.dtor = dtor;
 	args.uminit = uminit;
 	args.fini = fini;
 
 	return (zone_alloc_item(zones, &args, M_WAITOK));
 }
 
 /* See uma.h */
 /*
  * Create a secondary zone on top of the first keg of 'master'.  Item size,
  * alignment and flags are taken from that keg; the ctor/dtor and the
  * init/fini callbacks are the ones supplied by the caller.
  */
 uma_zone_t
 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
 {
 	struct uma_zctor_args args;
 	uma_keg_t mkeg;
 
 	mkeg = zone_first_keg(master);
 
 	/* Inherited from the master's keg. */
 	args.keg = mkeg;
 	args.size = mkeg->uk_size;
 	args.align = mkeg->uk_align;
 	args.flags = mkeg->uk_flags | UMA_ZONE_SECONDARY;
 
 	/* Supplied by the caller. */
 	args.name = name;
 	args.ctor = ctor;
 	args.dtor = dtor;
 	args.uminit = zinit;
 	args.fini = zfini;
 
 	/* XXX Attaches only one keg of potentially many. */
 	return (zone_alloc_item(zones, &args, M_WAITOK));
 }
 
 /*
  * Lock two zones, always taking the one at the lower address first.  The
  * second lock is acquired with MTX_DUPOK.
  */
 static void
 zone_lock_pair(uma_zone_t a, uma_zone_t b)
 {
 	uma_zone_t first, second;
 
 	if (a < b) {
 		first = a;
 		second = b;
 	} else {
 		first = b;
 		second = a;
 	}
 	ZONE_LOCK(first);
 	mtx_lock_flags(second->uz_lock, MTX_DUPOK);
 }
 
 /*
  * Release the locks of both zones, 'a' first and then 'b'.
  */
 static void
 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
 {
 
 	ZONE_UNLOCK(a);
 	ZONE_UNLOCK(b);
 }
 
 /*
  * Attach the first keg of 'master' to 'zone' as an additional keg and
  * switch 'zone' over to zone_fetch_slab_multi().
  *
  * Arguments:
  *	zone    An existing secondary zone that uses vtoslab().
  *	master  The zone whose first keg is to be added; must use vtoslab().
  *
  * Returns:
  *	0 on success, EINVAL if the flag requirements are not met, or
  *	E2BIG if the two zones' item sizes differ.
  */
 int
 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
 {
 	uma_klink_t klink;
 	uma_klink_t kl;
 	int error;
 
 	error = 0;
 	/* Allocate the link before taking locks; M_WAITOK may sleep. */
 	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
 
 	zone_lock_pair(zone, master);
 	/*
 	 * zone must use vtoslab() to resolve objects and must already be
 	 * a secondary.
 	 */
 	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
 	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The new master must also use vtoslab().  This has to look at
 	 * master's flags; zone's were already verified above.
 	 */
 	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * Both must either be refcnt, or not be refcnt.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
 	    (master->uz_flags & UMA_ZONE_REFCNT)) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The underlying object must be the same size.  rsize
 	 * may be different.
 	 */
 	if (master->uz_size != zone->uz_size) {
 		error = E2BIG;
 		goto out;
 	}
 	/*
 	 * Put it at the end of the list.
 	 */
 	klink->kl_keg = zone_first_keg(master);
 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
 		if (LIST_NEXT(kl, kl_link) == NULL) {
 			LIST_INSERT_AFTER(kl, klink, kl_link);
 			break;
 		}
 	}
 	/* The link is now owned by the zone; don't free it below. */
 	klink = NULL;
 	zone->uz_flags |= UMA_ZFLAG_MULTI;
 	zone->uz_slab = zone_fetch_slab_multi;
 
 out:
 	zone_unlock_pair(zone, master);
 	if (klink != NULL)
 		free(klink, M_TEMP);
 
 	return (error);
 }
 
 
 /*
  * Destroy a zone (see uma.h) by releasing it back to the 'zones' zone
  * with zone_free_item().  Nothing is skipped (SKIP_NONE) and the free is
  * counted (ZFREE_STATFREE).
  */
 void
 uma_zdestroy(uma_zone_t zone)
 {
 
 	zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
 }
 
 /*
  * Allocate one item from a zone (see uma.h).
  *
  * Tries, in order: the current CPU's alloc bucket, a swap with that CPU's
  * free bucket, a full bucket from the zone, a bucket freshly filled by
  * zone_alloc_bucket(), and finally a direct zone_alloc_item().
  *
  * Arguments:
  *	zone   The zone to allocate from.
  *	udata  Passed through to the zone's ctor.
  *	flags  Malloc-style flags; M_WAITOK and M_ZERO are examined here and
  *	       the whole value is passed on to the ctor and the slow paths.
  *
  * Returns:
  *	The item, or NULL if the ctor failed or no item could be obtained.
  */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
 	void *item;
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	int cpu;
 
 	/* This is the fast path allocation */
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
 #endif
 	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
 	    zone->uz_name, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
 
 	/*
 	 * If possible, allocate from the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to allocate from
 	 * the current cache; when we re-acquire the critical section, we
 	 * must detect and handle migration if it has occurred.
 	 */
 zalloc_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zalloc_start:
 	bucket = cache->uc_allocbucket;
 
 	if (bucket) {
 		if (bucket->ub_cnt > 0) {
 			/* Take the last item stored in the alloc bucket. */
 			bucket->ub_cnt--;
 			item = bucket->ub_bucket[bucket->ub_cnt];
 #ifdef INVARIANTS
 			bucket->ub_bucket[bucket->ub_cnt] = NULL;
 #endif
 			KASSERT(item != NULL,
 			    ("uma_zalloc: Bucket pointer mangled."));
 			cache->uc_allocs++;
 			critical_exit();
 #ifdef INVARIANTS
 			ZONE_LOCK(zone);
 			uma_dbg_alloc(zone, NULL, item);
 			ZONE_UNLOCK(zone);
 #endif
 			if (zone->uz_ctor != NULL) {
 				if (zone->uz_ctor(item, zone->uz_size,
 				    udata, flags) != 0) {
 					/*
 					 * The ctor failed: give the item
 					 * back without running the dtor.
 					 */
 					zone_free_item(zone, item, udata,
 					    SKIP_DTOR, ZFREE_STATFAIL |
 					    ZFREE_STATFREE);
 					return (NULL);
 				}
 			}
 			if (flags & M_ZERO)
 				bzero(item, zone->uz_size);
 			return (item);
 		} else if (cache->uc_freebucket) {
 			/*
 			 * We have run out of items in our allocbucket.
 			 * See if we can switch with our free bucket.
 			 */
 			if (cache->uc_freebucket->ub_cnt > 0) {
 #ifdef UMA_DEBUG_ALLOC
 				printf("uma_zalloc: Swapping empty with"
 				    " alloc.\n");
 #endif
 				bucket = cache->uc_freebucket;
 				cache->uc_freebucket = cache->uc_allocbucket;
 				cache->uc_allocbucket = bucket;
 
 				goto zalloc_start;
 			}
 		}
 	}
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
 	 * must drop the critical section, then re-acquire it when we go back
 	 * to the cache.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	critical_exit();
 	ZONE_LOCK(zone);
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 	/*
 	 * Re-check the (possibly different) CPU's cache; if it now has
 	 * items, drop the zone lock and take the fast path after all.
 	 */
 	bucket = cache->uc_allocbucket;
 	if (bucket != NULL) {
 		if (bucket->ub_cnt > 0) {
 			ZONE_UNLOCK(zone);
 			goto zalloc_start;
 		}
 		bucket = cache->uc_freebucket;
 		if (bucket != NULL && bucket->ub_cnt > 0) {
 			ZONE_UNLOCK(zone);
 			goto zalloc_start;
 		}
 	}
 
 	/* Since we have locked the zone we may as well send back our stats */
 	zone->uz_allocs += cache->uc_allocs;
 	cache->uc_allocs = 0;
 	zone->uz_frees += cache->uc_frees;
 	cache->uc_frees = 0;
 
 	/* Our old one is now a free bucket */
 	if (cache->uc_allocbucket) {
 		KASSERT(cache->uc_allocbucket->ub_cnt == 0,
 		    ("uma_zalloc_arg: Freeing a non free bucket."));
 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
 		    cache->uc_allocbucket, ub_link);
 		cache->uc_allocbucket = NULL;
 	}
 
 	/* Check the zone's full-bucket list for a new alloc bucket */
 	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
 		LIST_REMOVE(bucket, ub_link);
 		cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/* Bump up our uz_count so we get here less */
 	if (zone->uz_count < BUCKET_MAX)
 		zone->uz_count++;
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
 	 * works we'll restart the allocation from the beginning.
 	 */
 	if (zone_alloc_bucket(zone, flags)) {
 		ZONE_UNLOCK(zone);
 		goto zalloc_restart;
 	}
 	ZONE_UNLOCK(zone);
 	/*
 	 * We may not be able to get a bucket so return an actual item.
 	 */
 #ifdef UMA_DEBUG
 	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
 #endif
 
 	item = zone_alloc_item(zone, udata, flags);
 	return (item);
 }
 
 /*
  * Find, or allocate, a slab in 'keg' that has at least one free item.
  * The keg lock must be held on entry (asserted below).
  *
  * Arguments:
  *	keg    The keg to search.
  *	zone   The requesting zone; passed to keg_alloc_slab() and marked
  *	       UMA_ZFLAG_FULL when the keg hits its page limit and the zone
  *	       is not a multi-keg zone.
  *	flags  M_NOVM, M_NOWAIT etc.
  *
  * Returns:
  *	A slab that sits on the keg's partial list, or NULL.
  */
 static uma_slab_t
 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
 {
 	uma_slab_t slab;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 
 	for (;;) {
 		/*
 		 * Find a slab with some space.  Prefer slabs that are partially
 		 * used over those that are totally full.  This helps to reduce
 		 * fragmentation.
 		 */
 		if (keg->uk_free != 0) {
 			if (!LIST_EMPTY(&keg->uk_part_slab)) {
 				slab = LIST_FIRST(&keg->uk_part_slab);
 			} else {
 				/*
 				 * No partial slab: take one from uk_free_slab
 				 * and move it to the partial list.
 				 */
 				slab = LIST_FIRST(&keg->uk_free_slab);
 				LIST_REMOVE(slab, us_link);
 				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
 				    us_link);
 			}
 			MPASS(slab->us_keg == keg);
 			return (slab);
 		}
 
 		/*
 		 * M_NOVM means don't ask at all!
 		 */
 		if (flags & M_NOVM)
 			break;
 
 		/* A non-zero uk_maxpages is a limit on uk_pages. */
 		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
 			keg->uk_flags |= UMA_ZFLAG_FULL;
 			/*
 			 * If this is not a multi-zone, set the FULL bit.
 			 * Otherwise slab_multi() takes care of it.
 			 */
 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0)
 				zone->uz_flags |= UMA_ZFLAG_FULL;
 			if (flags & M_NOWAIT)
 				break;
 			/* Sleep on the keg; zone_free_item() does wakeup(keg). */
 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
 			continue;
 		}
 		/* uk_recurse is non-zero while keg_alloc_slab() is running. */
 		keg->uk_recurse++;
 		slab = keg_alloc_slab(keg, zone, flags);
 		keg->uk_recurse--;
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
 		 * at least one item.
 		 */
 		if (slab) {
 			MPASS(slab->us_keg == keg);
 			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
 			return (slab);
 		}
 		/*
 		 * We might not have been able to get a slab but another cpu
 		 * could have while we were unlocked.  Check again before we
 		 * fail.
 		 */
 		flags |= M_NOVM;
 	}
 	return (slab);
 }
 
 /*
  * Switch from holding the keg lock to holding the zone lock.  Nothing is
  * done when the zone and the keg share the same lock.
  */
 static inline void
 zone_relock(uma_zone_t zone, uma_keg_t keg)
 {
 	if (zone->uz_lock == &keg->uk_lock)
 		return;
 	KEG_UNLOCK(keg);
 	ZONE_LOCK(zone);
 }
 
 /*
  * Switch from holding the zone lock to holding the keg lock.  Nothing is
  * done when the zone and the keg share the same lock.
  */
 static inline void
 keg_relock(uma_keg_t keg, uma_zone_t zone)
 {
 	if (zone->uz_lock == &keg->uk_lock)
 		return;
 	ZONE_UNLOCK(zone);
 	KEG_LOCK(keg);
 }
 
 /*
  * Fetch a slab with free space for a single-keg zone.  When 'keg' is NULL
  * the zone's first keg is used.  Returns NULL if no slab could be had and
  * the caller asked for M_NOWAIT or M_NOVM, or if we would recurse while
  * allocating buckets.
  */
 static uma_slab_t
 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
 {
 	uma_slab_t found;
 
 	if (keg == NULL)
 		keg = zone_first_keg(zone);
 	/*
 	 * Guard against recursive bucket allocation.  Allocating a bucket
 	 * can end up in page_alloc, which makes the vm allocate
 	 * vm_map_entries; if those in turn need buckets we would recurse
 	 * into kmem_alloc.  Return NULL instead and rely on the bucket
 	 * allocation code to cope with that.
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
 		return (NULL);
 
 	/* Keep trying unless the caller said not to wait. */
 	do {
 		found = keg_fetch_slab(keg, zone, flags);
 		if (found != NULL)
 			return (found);
 	} while ((flags & (M_NOWAIT | M_NOVM)) == 0);
 
 	return (NULL);
 }
 
 /*
  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
  * with the keg locked.  Caller must call zone_relock() afterwards if the
  * zone lock is required.  On NULL the zone lock is held.
  *
  * The last pointer is used to seed the search.  It is not required.
  *
  * rflags are the caller's original flags; the first pass over the kegs
  * runs with M_WAITOK stripped and M_NOWAIT added.
  */
 static uma_slab_t
 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
 {
 	uma_klink_t klink;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int flags;
 	int empty;
 	int full;
 
 	/*
 	 * Don't wait on the first pass.  This will skip limit tests
 	 * as well.  We don't want to block if we can find a provider
 	 * without blocking.
 	 */
 	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
 	/*
 	 * Use the last slab allocated as a hint for where to start
 	 * the search.
 	 */
 	if (last) {
 		slab = keg_fetch_slab(last, zone, flags);
 		if (slab)
 			return (slab);
 		zone_relock(zone, last);
 		last = NULL;
 	}
 	/*
 	 * Loop until we have a slab in case of transient failures
 	 * while M_WAITOK is specified.  I'm not sure this is 100%
 	 * required but we've done it for so long now.
 	 */
 	for (;;) {
 		/* Per-pass tallies of kegs that are / are not flagged FULL. */
 		empty = 0;
 		full = 0;
 		/*
 		 * Search the available kegs for slabs.  Be careful to hold the
 		 * correct lock while calling into the keg layer.
 		 */
 		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
 			keg = klink->kl_keg;
 			keg_relock(keg, zone);
 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
 				slab = keg_fetch_slab(keg, zone, flags);
 				if (slab)
 					return (slab);
 			}
 			if (keg->uk_flags & UMA_ZFLAG_FULL)
 				full++;
 			else
 				empty++;
 			zone_relock(zone, keg);
 		}
 		if (rflags & (M_NOWAIT | M_NOVM))
 			break;
 		/* Later passes use the caller's real flags. */
 		flags = rflags;
 		/*
 		 * All kegs are full.  XXX We can't atomically check all kegs
 		 * and sleep so just sleep for a short period and retry.
 		 */
 		if (full && !empty) {
 			zone->uz_flags |= UMA_ZFLAG_FULL;
 			zone->uz_sleeps++;
 			msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
 			continue;
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Take one item out of 'slab'.  The lock of the slab's keg must be held
  * (asserted below).  Updates the slab and keg free counts and moves the
  * slab to the keg's full list once its last free item is gone.
  */
 static void *
 slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
 {
 	uma_slabrefcnt_t rslab;
 	uma_keg_t keg;
 	u_int8_t idx;
 	void *item;
 
 	keg = slab->us_keg;
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 
 	/* Unlink the first free index; refcnt slabs use their own layout. */
 	idx = slab->us_firstfree;
 	if ((keg->uk_flags & UMA_ZONE_REFCNT) == 0) {
 		slab->us_firstfree = slab->us_freelist[idx].us_item;
 	} else {
 		rslab = (uma_slabrefcnt_t)slab;
 		slab->us_firstfree = rslab->us_freelist[idx].us_item;
 	}
 	item = slab->us_data + (keg->uk_rsize * idx);
 
 	slab->us_freecount--;
 	keg->uk_free--;
 #ifdef INVARIANTS
 	uma_dbg_alloc(zone, slab, item);
 #endif
 	if (slab->us_freecount != 0)
 		return (item);
 
 	/* Nothing left to hand out: the slab goes on the full list. */
 	LIST_REMOVE(slab, us_link);
 	LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
 
 	return (item);
 }
 
 /*
  * Fill a bucket with items taken from the zone's slabs and put it on the
  * zone's full-bucket list.  The zone lock is held by the caller; it is
  * dropped and retaken around bucket_alloc() and around the zone init
  * calls.
  *
  * Arguments:
  *	zone   The zone to fill a bucket for.
  *	flags  Allocation flags; M_ZERO is stripped for the bucket itself and
  *	       M_NOWAIT is added after the first slab has been fetched.
  *
  * Returns:
  *	1 if a non-empty bucket was added to uz_full_bucket, 0 otherwise.
  */
 static int
 zone_alloc_bucket(uma_zone_t zone, int flags)
 {
 	uma_bucket_t bucket;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int16_t saved;
 	int max, origflags = flags;
 
 	/*
 	 * Try this zone's free list first so we don't allocate extra buckets.
 	 */
 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 		KASSERT(bucket->ub_cnt == 0,
 		    ("zone_alloc_bucket: Bucket on free list is not empty."));
 		LIST_REMOVE(bucket, ub_link);
 	} else {
 		int bflags;
 
 		bflags = (flags & ~M_ZERO);
 		if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
 			bflags |= M_NOVM;
 
 		ZONE_UNLOCK(zone);
 		bucket = bucket_alloc(zone->uz_count, bflags);
 		ZONE_LOCK(zone);
 	}
 
 	if (bucket == NULL) {
 		return (0);
 	}
 
 #ifdef SMP
 	/*
 	 * This code is here to limit the number of simultaneous bucket fills
 	 * for any given zone to the number of per cpu caches in this zone. This
 	 * is done so that we don't allocate more memory than we really need.
 	 */
 	if (zone->uz_fills >= mp_ncpus)
 		goto done;
 
 #endif
 	zone->uz_fills++;
 
 	max = MIN(bucket->ub_entries, zone->uz_count);
 	/* Try to keep the buckets totally full */
 	saved = bucket->ub_cnt;
 	slab = NULL;
 	keg = NULL;
 	/* The keg of the previous slab is passed back in as a hint. */
 	while (bucket->ub_cnt < max &&
 	    (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
 		keg = slab->us_keg;
 		while (slab->us_freecount && bucket->ub_cnt < max) {
 			bucket->ub_bucket[bucket->ub_cnt++] =
 			    slab_alloc_item(zone, slab);
 		}
 
 		/* Don't block on the next fill */
 		flags |= M_NOWAIT;
 	}
 	/* The last fetch succeeded, so switch back to the zone lock. */
 	if (slab)
 		zone_relock(zone, keg);
 
 	/*
 	 * We unlock here because we need to call the zone's init.
 	 * It should be safe to unlock because the slab dealt with
 	 * above is already on the appropriate list within the keg
 	 * and the bucket we filled is not yet on any list, so we
 	 * own it.
 	 */
 	if (zone->uz_init != NULL) {
 		int i;
 
 		ZONE_UNLOCK(zone);
 		/* Only the items added by this call (index >= saved). */
 		for (i = saved; i < bucket->ub_cnt; i++)
 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
 			    origflags) != 0)
 				break;
 		/*
 		 * If we couldn't initialize the whole bucket, put the
 		 * rest back onto the freelist.
 		 */
 		if (i != bucket->ub_cnt) {
 			int j;
 
 			for (j = i; j < bucket->ub_cnt; j++) {
 				zone_free_item(zone, bucket->ub_bucket[j],
 				    NULL, SKIP_FINI, 0);
 #ifdef INVARIANTS
 				bucket->ub_bucket[j] = NULL;
 #endif
 			}
 			bucket->ub_cnt = i;
 		}
 		ZONE_LOCK(zone);
 	}
 
 	zone->uz_fills--;
 	if (bucket->ub_cnt != 0) {
 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
 		    bucket, ub_link);
 		return (1);
 	}
 #ifdef SMP
 done:
 #endif
 	/* Nothing was put in the bucket (or fills were capped); release it. */
 	bucket_free(bucket);
 
 	return (0);
 }
 /*
  * Allocates an item directly from a zone's slabs, bypassing the per-CPU
  * caches.  Used for internal zones and as the fallback path of
  * uma_zalloc_arg().
  *
  * Arguments
  *	zone   The zone to alloc for.
  *	udata  The data to be passed to the constructor.
  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
  *
  * Returns
  *	NULL if no slab could be fetched or the zone's init or ctor failed
  *	An item if successful
  */
 
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
 {
 	uma_slab_t slab;
 	void *item;
 
 	item = NULL;
 
 #ifdef UMA_DEBUG_ALLOC
 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
 #endif
 	ZONE_LOCK(zone);
 
 	slab = zone->uz_slab(zone, NULL, flags);
 	if (slab == NULL) {
 		zone->uz_fails++;
 		ZONE_UNLOCK(zone);
 		return (NULL);
 	}
 
 	item = slab_alloc_item(zone, slab);
 
 	/* Get back to the zone lock before touching zone statistics. */
 	zone_relock(zone, slab->us_keg);
 	zone->uz_allocs++;
 	ZONE_UNLOCK(zone);
 
 	/*
 	 * We have to call both the zone's init (not the keg's init)
 	 * and the zone's ctor.  This is because the item is going from
 	 * a keg slab directly to the user, and the user is expecting it
 	 * to be both zone-init'd as well as zone-ctor'd.
 	 */
 	if (zone->uz_init != NULL) {
 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
 			zone_free_item(zone, item, udata, SKIP_FINI,
 			    ZFREE_STATFAIL | ZFREE_STATFREE);
 			return (NULL);
 		}
 	}
 	if (zone->uz_ctor != NULL) {
 		if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
 			zone_free_item(zone, item, udata, SKIP_DTOR,
 			    ZFREE_STATFAIL | ZFREE_STATFREE);
 			return (NULL);
 		}
 	}
 	if (flags & M_ZERO)
 		bzero(item, zone->uz_size);
 
 	return (item);
 }
 
 /*
  * Free an item back to a zone (see uma.h).
  *
  * Runs the zone's dtor, then tries to stash the item in the current CPU's
  * free bucket.  If that bucket is full it may be swapped with the alloc
  * bucket, or moved to the zone's full-bucket list and replaced from the
  * zone's free-bucket list or by a newly allocated bucket.  When the zone
  * is flagged UMA_ZFLAG_FULL, or no bucket can be obtained, the item is
  * released directly with zone_free_item().
  *
  * Arguments:
  *	zone   The zone the item belongs to.
  *	item   The item being freed.
  *	udata  Passed to the dtor; for UMA_ZONE_MALLOC zones it is also
  *	       handed to the debug check, and it is always forwarded to
  *	       zone_free_item().
  */
 void
 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 {
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	int bflags;
 	int cpu;
 
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
 #endif
 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
 	    zone->uz_name);
 
 	if (zone->uz_dtor)
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 #ifdef INVARIANTS
 	ZONE_LOCK(zone);
 	if (zone->uz_flags & UMA_ZONE_MALLOC)
 		uma_dbg_free(zone, udata, item);
 	else
 		uma_dbg_free(zone, NULL, item);
 	ZONE_UNLOCK(zone);
 #endif
 	/*
 	 * The race here is acceptable.  If we miss it we'll just have to wait
 	 * a little longer for the limits to be reset.
 	 */
 	if (zone->uz_flags & UMA_ZFLAG_FULL)
 		goto zfree_internal;
 
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to free to the
 	 * current cache; when we re-acquire the critical section, we must
 	 * detect and handle migration if it has occurred.
 	 */
 zfree_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 
 zfree_start:
 	bucket = cache->uc_freebucket;
 
 	if (bucket) {
 		/*
 		 * Do we have room in our bucket? It is OK for this uz count
 		 * check to be slightly out of sync.
 		 */
 
 		if (bucket->ub_cnt < bucket->ub_entries) {
 			KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
 			    ("uma_zfree: Freeing to non free bucket index."));
 			bucket->ub_bucket[bucket->ub_cnt] = item;
 			bucket->ub_cnt++;
 			cache->uc_frees++;
 			critical_exit();
 			return;
 		} else if (cache->uc_allocbucket) {
 #ifdef UMA_DEBUG_ALLOC
 			printf("uma_zfree: Swapping buckets.\n");
 #endif
 			/*
 			 * We have run out of space in our freebucket.
 			 * See if we can switch with our alloc bucket.
 			 */
 			if (cache->uc_allocbucket->ub_cnt <
 			    cache->uc_freebucket->ub_cnt) {
 				bucket = cache->uc_freebucket;
 				cache->uc_freebucket = cache->uc_allocbucket;
 				cache->uc_allocbucket = bucket;
 				goto zfree_start;
 			}
 		}
 	}
 	/*
 	 * We can get here for two reasons:
 	 *
 	 * 1) The buckets are NULL
 	 * 2) The alloc and free buckets are both somewhat full.
 	 *
 	 * We must go back the zone, which requires acquiring the zone lock,
 	 * which in turn means we must release and re-acquire the critical
 	 * section.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	critical_exit();
 	ZONE_LOCK(zone);
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 	/*
 	 * Re-check the (possibly different) CPU's cache; if it can take the
 	 * item now, drop the zone lock and use the fast path after all.
 	 */
 	if (cache->uc_freebucket != NULL) {
 		if (cache->uc_freebucket->ub_cnt <
 		    cache->uc_freebucket->ub_entries) {
 			ZONE_UNLOCK(zone);
 			goto zfree_start;
 		}
 		if (cache->uc_allocbucket != NULL &&
 		    (cache->uc_allocbucket->ub_cnt <
 		    cache->uc_freebucket->ub_cnt)) {
 			ZONE_UNLOCK(zone);
 			goto zfree_start;
 		}
 	}
 
 	/* Since we have locked the zone we may as well send back our stats */
 	zone->uz_allocs += cache->uc_allocs;
 	cache->uc_allocs = 0;
 	zone->uz_frees += cache->uc_frees;
 	cache->uc_frees = 0;
 
 	bucket = cache->uc_freebucket;
 	cache->uc_freebucket = NULL;
 
 	/* Can we throw this on the zone full list? */
 	if (bucket != NULL) {
 #ifdef UMA_DEBUG_ALLOC
 		printf("uma_zfree: Putting old bucket on the free list.\n");
 #endif
 		/* ub_cnt is pointing to the last free item */
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
 		    bucket, ub_link);
 	}
 	/* Take a replacement free bucket from the zone, if it has one. */
 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 		LIST_REMOVE(bucket, ub_link);
 		ZONE_UNLOCK(zone);
 		cache->uc_freebucket = bucket;
 		goto zfree_start;
 	}
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/* And the zone.. */
 	ZONE_UNLOCK(zone);
 
 #ifdef UMA_DEBUG_ALLOC
 	printf("uma_zfree: Allocating new free bucket.\n");
 #endif
 	/* The bucket allocation on the free path never waits. */
 	bflags = M_NOWAIT;
 
 	if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
 		bflags |= M_NOVM;
 	bucket = bucket_alloc(zone->uz_count, bflags);
 	if (bucket) {
 		/* Park the new bucket on the zone and start over. */
 		ZONE_LOCK(zone);
 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
 		    bucket, ub_link);
 		ZONE_UNLOCK(zone);
 		goto zfree_restart;
 	}
 
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 * The dtor has already run above, hence SKIP_DTOR.
 	 */
 zfree_internal:
 	zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
 
 	return;
 }
 
 /*
  * Frees an item directly to its slab, bypassing the per-CPU caches.
  * Used for INTERNAL zones and as the fallback path of uma_zfree_arg().
  *
  * Arguments:
  *	zone   The zone to free to
  *	item   The item we're freeing
  *	udata  User supplied data for the dtor; for UMA_ZONE_MALLOC zones a
  *	       non-NULL value is used as the item's slab
  *	skip   Skip dtors and finis
  *	flags  ZFREE_STATFAIL and/or ZFREE_STATFREE: which zone counters
  *	       (uz_fails, uz_frees) to bump
  */
 static void
 zone_free_item(uma_zone_t zone, void *item, void *udata,
     enum zfreeskip skip, int flags)
 {
 	uma_slab_t slab;
 	uma_slabrefcnt_t slabref;
 	uma_keg_t keg;
 	u_int8_t *mem;
 	u_int8_t freei;
 	int clearfull;
 
 	if (skip < SKIP_DTOR && zone->uz_dtor)
 		zone->uz_dtor(item, zone->uz_size, udata);
 
 	if (skip < SKIP_FINI && zone->uz_fini)
 		zone->uz_fini(item, zone->uz_size);
 
 	ZONE_LOCK(zone);
 
 	if (flags & ZFREE_STATFAIL)
 		zone->uz_fails++;
 	if (flags & ZFREE_STATFREE)
 		zone->uz_frees++;
 
 	/* Locate the slab (and keg) that the item belongs to. */
 	if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
 		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
 		keg = zone_first_keg(zone); /* Must only be one. */
 		if (zone->uz_flags & UMA_ZONE_HASH) {
 			slab = hash_sfind(&keg->uk_hash, mem);
 		} else {
 			/* The slab header sits uk_pgoff bytes into the slab. */
 			mem += keg->uk_pgoff;
 			slab = (uma_slab_t)mem;
 		}
 	} else {
 		/* This prevents redundant lookups via free(). */
 		if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
 			slab = (uma_slab_t)udata;
 		else
 			slab = vtoslab((vm_offset_t)item);
 		keg = slab->us_keg;
 		keg_relock(keg, zone);
 	}
 	MPASS(keg == slab->us_keg);
 
 	/* Do we need to remove from any lists? */
 	if (slab->us_freecount+1 == keg->uk_ipers) {
 		/* This free makes the slab completely free. */
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
 	} else if (slab->us_freecount == 0) {
 		/* The slab had no free items; it is partially used now. */
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
 	}
 
 	/* Slab management stuff: index of the item within the slab. */
 	freei = ((unsigned long)item - (unsigned long)slab->us_data)
 		/ keg->uk_rsize;
 
 #ifdef INVARIANTS
 	if (!skip)
 		uma_dbg_free(zone, slab, item);
 #endif
 
 	/* Push the index onto the head of the slab's free list. */
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		slabref = (uma_slabrefcnt_t)slab;
 		slabref->us_freelist[freei].us_item = slab->us_firstfree;
 	} else {
 		slab->us_freelist[freei].us_item = slab->us_firstfree;
 	}
 	slab->us_firstfree = freei;
 	slab->us_freecount++;
 
 	/* Zone statistics */
 	keg->uk_free++;
 
 	clearfull = 0;
 	if (keg->uk_flags & UMA_ZFLAG_FULL) {
 		if (keg->uk_pages < keg->uk_maxpages) {
 			keg->uk_flags &= ~UMA_ZFLAG_FULL;
 			clearfull = 1;
 		}
 
 		/*
 		 * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
 		 * wake up all procs blocked on pages. This should be uncommon, so
 		 * keeping this simple for now (rather than adding count of blocked
 		 * threads etc).
 		 */
 		wakeup(keg);
 	}
 	if (clearfull) {
 		/* The zone's FULL flag is changed under the zone lock. */
 		zone_relock(zone, keg);
 		zone->uz_flags &= ~UMA_ZFLAG_FULL;
 		wakeup(zone);
 		ZONE_UNLOCK(zone);
 	} else
 		KEG_UNLOCK(keg);
 }
 
 /*
  * Limit the zone's first keg to the smallest whole number of slabs that
  * can hold 'nitems' items (see uma.h).
  *
  * The limit is stored in pages (uk_maxpages).  The number of slabs is
  * rounded up first and only then converted to pages, so that kegs whose
  * slabs span more than one page (uk_ppera > 1) are rounded correctly.
  * For uk_ppera == 1 the result is the same as before.
  */
 void
 uma_zone_set_max(uma_zone_t zone, int nitems)
 {
 	uma_keg_t keg;
 	int slabs;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	slabs = nitems / keg->uk_ipers;
 	if (slabs * keg->uk_ipers < nitems)
 		slabs++;
 	keg->uk_maxpages = slabs * keg->uk_ppera;
 
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Return the item limit of the zone's first keg, or 0 if no limit is set
  * (see uma.h).
  *
  * uk_maxpages counts pages while uk_ipers counts items per slab, so the
  * page limit is converted to slabs (uk_ppera pages each) before it is
  * multiplied out.  This matches the "limit" printed by uma_print_keg().
  */
 int
 uma_zone_get_max(uma_zone_t zone)
 {
 	int nitems;
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	if (keg->uk_maxpages)
 		nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
 	else
 		nitems = 0;
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /*
  * Replace the init function of the zone's first keg (see uma.h).  The
  * keg must not own any pages yet (asserted below).
  */
 void
 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_init on non-empty keg"));
 	keg->uk_init = uminit;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Replace the fini function of the zone's first keg (see uma.h).  The
  * keg must not own any pages yet (asserted below).
  */
 void
 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_fini on non-empty keg"));
 	keg->uk_fini = fini;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Replace the zone-level init function (see uma.h).  The zone's first
  * keg must not own any pages yet (asserted below).
  */
 void
 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 {
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zinit on non-empty keg"));
 	zone->uz_init = zinit;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Replace the zone-level fini function (see uma.h).  The zone's first
  * keg must not own any pages yet (asserted below).
  */
 void
 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 {
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zfini on non-empty keg"));
 	zone->uz_fini = zfini;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Install a custom page-free routine on the zone's first keg (see uma.h).
  */
 /* XXX uk_freef is not actually used with the zone locked */
 void
 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	keg->uk_freef = freef;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Install a custom page-allocation routine on the zone's first keg and
  * flag the keg UMA_ZFLAG_PRIVALLOC (see uma.h).
  */
 /* XXX uk_allocf is not actually used with the zone locked */
 void
 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
 {
 	uma_keg_t keg;
 
 	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
 	keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
 	keg->uk_allocf = allocf;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Back the zone's first keg with a VM object and a reserved range of
  * kernel virtual address space large enough for 'count' items (see
  * uma.h).  When 'obj' is NULL a new object is allocated; otherwise the
  * caller's object is initialized in place.
  *
  * Returns 1 on success and 0 if the address space could not be reserved.
  */
 int
 uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
 {
 	vm_offset_t kva;
 	uma_keg_t keg;
 	int npages;
 
 	keg = zone_first_keg(zone);
 
 	/* Round up so that a partial last slab is still covered. */
 	npages = count / keg->uk_ipers;
 	if (npages * keg->uk_ipers < count)
 		npages++;
 
 	kva = kmem_alloc_nofault(kernel_map, npages * UMA_SLAB_SIZE);
 	if (kva == 0)
 		return (0);
 
 	if (obj != NULL) {
 		VM_OBJECT_LOCK_INIT(obj, "uma object");
 		_vm_object_allocate(OBJT_PHYS, npages, obj);
 	} else
 		obj = vm_object_allocate(OBJT_PHYS, npages);
 
 	ZONE_LOCK(zone);
 	keg->uk_kva = kva;
 	keg->uk_obj = obj;
 	keg->uk_maxpages = npages;
 	keg->uk_allocf = obj_alloc;
 	keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
 	ZONE_UNLOCK(zone);
 
 	return (1);
 }
 
 /*
  * Pre-allocate enough slabs in the zone's first keg to hold 'items'
  * items and place them on the keg's free-slab list (see uma.h).  Stops
  * early if a slab allocation fails.
  */
 void
 uma_prealloc(uma_zone_t zone, int items)
 {
 	uma_slab_t newslab;
 	uma_keg_t keg;
 	int remaining;
 
 	keg = zone_first_keg(zone);
 	ZONE_LOCK(zone);
 
 	/* Number of slabs needed, rounded up. */
 	remaining = items / keg->uk_ipers;
 	if (remaining * keg->uk_ipers < items)
 		remaining++;
 
 	for (; remaining > 0; remaining--) {
 		newslab = keg_alloc_slab(keg, zone, M_WAITOK);
 		if (newslab == NULL)
 			break;
 		MPASS(newslab->us_keg == keg);
 		LIST_INSERT_HEAD(&keg->uk_free_slab, newslab, us_link);
 	}
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Return a pointer to the reference count that belongs to 'item' (see
  * uma.h).  The item's slab is looked up with vtoslab() and must belong to
  * a UMA_ZONE_REFCNT keg.
  *
  * The slab is validated before any of its fields are used, so that a
  * failed lookup trips the assertion instead of faulting on a NULL
  * dereference first.
  */
 u_int32_t *
 uma_find_refcnt(uma_zone_t zone, void *item)
 {
 	uma_slabrefcnt_t slabref;
 	uma_keg_t keg;
 	u_int32_t *refcnt;
 	int idx;
 
 	slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
 	    (~UMA_SLAB_MASK));
 	KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
 	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
 	keg = slabref->us_keg;
 	/* Index of the item within its slab. */
 	idx = ((unsigned long)item - (unsigned long)slabref->us_data)
 	    / keg->uk_rsize;
 	refcnt = &slabref->us_freelist[idx].us_refcnt;
 	return (refcnt);
 }
 
 /*
  * Release cached memory back to the system (see uma.h): calls
  * bucket_enable(), drains every zone, then drains the slab zones and the
  * bucket zones once more.
  */
 void
 uma_reclaim(void)
 {
 #ifdef UMA_DEBUG
 	printf("UMA: vm asked us to release pages!\n");
 #endif
 	bucket_enable();
 	zone_foreach(zone_drain);
 	/*
 	 * Draining the other zones may have freed slabs after the slab
 	 * zones were already visited, so visit them again to free pages
 	 * that are now empty.  We have to do the same for buckets.
 	 */
 	zone_drain(slabzone);
 	zone_drain(slabrefzone);
 	bucket_zone_drain();
 }
 
 /*
  * Report whether the zone is currently flagged UMA_ZFLAG_FULL (see
  * uma.h).  The flag is sampled under the zone lock; the result is
  * non-zero when the flag is set.
  */
 int
 uma_zone_exhausted(uma_zone_t zone)
 {
 	int exhausted;
 
 	ZONE_LOCK(zone);
 	exhausted = (zone->uz_flags & UMA_ZFLAG_FULL);
 	ZONE_UNLOCK(zone);
 
 	return (exhausted);
 }
 
 /*
  * Same test as uma_zone_exhausted(), but reads uz_flags without taking
  * the zone lock.
  */
 int
 uma_zone_exhausted_nolock(uma_zone_t zone)
 {
 	return (zone->uz_flags & UMA_ZFLAG_FULL);
 }
 
 /*
  * Allocate 'size' bytes with page_alloc() and describe the allocation
  * with a slab header taken from slabzone.  The header is tagged
  * UMA_SLAB_MALLOC and attached to the memory with vsetslab().
  *
  * Returns the memory, or NULL if either the header or the pages could
  * not be allocated.
  */
 void *
 uma_large_malloc(int size, int wait)
 {
 	uma_slab_t hdr;
 	u_int8_t pflags;
 	void *mem;
 
 	hdr = zone_alloc_item(slabzone, NULL, wait);
 	if (hdr == NULL)
 		return (NULL);
 
 	mem = page_alloc(NULL, size, &pflags, wait);
 	if (mem == NULL) {
 		/* No pages: give the header back and count the failure. */
 		zone_free_item(slabzone, hdr, NULL, SKIP_NONE,
 		    ZFREE_STATFAIL | ZFREE_STATFREE);
 		return (mem);
 	}
 
 	vsetslab((vm_offset_t)mem, hdr);
 	hdr->us_data = mem;
 	hdr->us_flags = pflags | UMA_SLAB_MALLOC;
 	hdr->us_size = size;
 
 	return (mem);
 }
 
 /*
  * Release an allocation made by uma_large_malloc(): re-associate the
  * memory with kmem_object via vsetobj(), free the pages with page_free()
  * and return the slab header to slabzone.
  */
 void
 uma_large_free(uma_slab_t slab)
 {
 	vsetobj((vm_offset_t)slab->us_data, kmem_object);
 	page_free(slab->us_data, slab->us_size, slab->us_flags);
 	zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
 }
 
 /*
  * Print every zone by applying uma_print_zone() through zone_foreach().
  */
 void
 uma_print_stats(void)
 {
 	zone_foreach(uma_print_zone);
 }
 
 /*
  * Print one line describing a slab: its keg, data pointer, free count
  * and first free index.
  */
 static void
 slab_print(uma_slab_t slab)
 {
 	printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
 		slab->us_keg, slab->us_data, slab->us_freecount,
 		slab->us_firstfree);
 }
 
 /*
  * Print one line describing a per-CPU cache: the alloc and free bucket
  * pointers, each followed by its item count (0 when the bucket is NULL).
  */
 static void
 cache_print(uma_cache_t cache)
 {
 	uma_bucket_t ab, fb;
 	int acnt, fcnt;
 
 	ab = cache->uc_allocbucket;
 	fb = cache->uc_freebucket;
 	acnt = (ab != NULL) ? ab->ub_cnt : 0;
 	fcnt = (fb != NULL) ? fb->ub_cnt : 0;
 
 	printf("alloc: %p(%d), free: %p(%d)\n", ab, acnt, fb, fcnt);
 }
 
 /*
  * Print a summary line for a keg followed by every slab on its partial,
  * free and full lists.
  */
 static void
 uma_print_keg(uma_keg_t keg)
 {
 	uma_slab_t slab;
 
 	/* "out" and "limit" are derived from the page and item counters. */
 	printf("keg: %s(%p) size %d(%d) flags %d ipers %d ppera %d "
 	    "out %d free %d limit %d\n",
 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
 	    (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
 	printf("Part slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
 		slab_print(slab);
 	printf("Free slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
 		slab_print(slab);
 	printf("Full slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
 		slab_print(slab);
 }
 
 /*
  * Print a summary line for a zone, then each keg linked to it, then the
  * per-CPU cache of every CPU visited by CPU_FOREACH().
  */
 void
 uma_print_zone(uma_zone_t zone)
 {
 	uma_klink_t klink;
 	int cpu;
 
 	printf("zone: %s(%p) size %d flags %d\n",
 	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
 
 	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
 		uma_print_keg(klink->kl_keg);
 
 	CPU_FOREACH(cpu) {
 		printf("CPU %d Cache:\n", cpu);
 		cache_print(&zone->uz_cpu[cpu]);
 	}
 }
 
 #ifdef DDB
 /*
  * Sum a zone's statistics over the zone itself and all of its per-CPU
  * caches.  Each result is stored only if its output pointer is non-NULL.
  *
  * Note: the zone's own counters are left alone, because the per-CPU
  * counters cannot safely be cleared from here.
  *
  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
  * safe from off-CPU; we should modify the caches to track this information
  * directly so that we don't have to.
  */
 static void
 uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
     u_int64_t *freesp, u_int64_t *sleepsp)
 {
 	u_int64_t nallocs, nfrees;
 	uma_cache_t cache;
 	int cached, cpu;
 
 	nallocs = 0;
 	nfrees = 0;
 	cached = 0;
 	CPU_FOREACH(cpu) {
 		cache = &z->uz_cpu[cpu];
 		if (cache->uc_allocbucket != NULL)
 			cached += cache->uc_allocbucket->ub_cnt;
 		if (cache->uc_freebucket != NULL)
 			cached += cache->uc_freebucket->ub_cnt;
 		nallocs += cache->uc_allocs;
 		nfrees += cache->uc_frees;
 	}
 	/* Add what has already been folded into the zone. */
 	nallocs += z->uz_allocs;
 	nfrees += z->uz_frees;
 
 	if (cachefreep != NULL)
 		*cachefreep = cached;
 	if (allocsp != NULL)
 		*allocsp = nallocs;
 	if (freesp != NULL)
 		*freesp = nfrees;
 	/* Sleeps are only counted on the zone, not per CPU. */
 	if (sleepsp != NULL)
 		*sleepsp = z->uz_sleeps;
 }
 #endif /* DDB */
 
 static int
 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
 {
 	uma_keg_t kz;
 	uma_zone_t z;
 	int count;
 
 	count = 0;
 	mtx_lock(&uma_mtx);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 	mtx_unlock(&uma_mtx);
 	return (sysctl_handle_int(oidp, &count, 0, req));
 }
 
 static int
 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
 	struct uma_percpu_stat ups;
 	uma_bucket_t bucket;
 	struct sbuf sbuf;
 	uma_cache_t cache;
 	uma_klink_t kl;
 	uma_keg_t kz;
 	uma_zone_t z;
 	uma_keg_t k;
-	char *buffer;
-	int buflen, count, error, i;
+	int count, error, i;
 
-	mtx_lock(&uma_mtx);
-restart:
-	mtx_assert(&uma_mtx, MA_OWNED);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+
 	count = 0;
+	mtx_lock(&uma_mtx);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
-	mtx_unlock(&uma_mtx);
 
-	buflen = sizeof(ush) + count * (sizeof(uth) + sizeof(ups) *
-	    (mp_maxid + 1)) + 1;
-	buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
-
-	mtx_lock(&uma_mtx);
-	i = 0;
-	LIST_FOREACH(kz, &uma_kegs, uk_link) {
-		LIST_FOREACH(z, &kz->uk_zones, uz_link)
-			i++;
-	}
-	if (i > count) {
-		free(buffer, M_TEMP);
-		goto restart;
-	}
-	count =  i;
-
-	sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN);
-
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&ush, sizeof(ush));
 	ush.ush_version = UMA_STREAM_VERSION;
 	ush.ush_maxcpus = (mp_maxid + 1);
 	ush.ush_count = count;
-	if (sbuf_bcat(&sbuf, &ush, sizeof(ush)) < 0) {
-		mtx_unlock(&uma_mtx);
-		error = ENOMEM;
-		goto out;
-	}
+	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
 
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			bzero(&uth, sizeof(uth));
 			ZONE_LOCK(z);
 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
 			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
 				k = kl->kl_keg;
 				uth.uth_maxpages += k->uk_maxpages;
 				uth.uth_pages += k->uk_pages;
 				uth.uth_keg_free += k->uk_free;
 				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
 				    * k->uk_ipers;
 			}
 
 			/*
 			 * A zone is secondary is it is not the first entry
 			 * on the keg's zone list.
 			 */
 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 
 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
 				uth.uth_zone_free += bucket->ub_cnt;
 			uth.uth_allocs = z->uz_allocs;
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
 			uth.uth_sleeps = z->uz_sleeps;
-			if (sbuf_bcat(&sbuf, &uth, sizeof(uth)) < 0) {
-				ZONE_UNLOCK(z);
-				mtx_unlock(&uma_mtx);
-				error = ENOMEM;
-				goto out;
-			}
+			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			/*
 			 * While it is not normally safe to access the cache
 			 * bucket pointers while not on the CPU that owns the
 			 * cache, we only allow the pointers to be exchanged
 			 * without the zone lock held, not invalidated, so
 			 * accept the possible race associated with bucket
 			 * exchange during monitoring.
 			 */
 			for (i = 0; i < (mp_maxid + 1); i++) {
 				bzero(&ups, sizeof(ups));
 				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
 					goto skip;
 				if (CPU_ABSENT(i))
 					goto skip;
 				cache = &z->uz_cpu[i];
 				if (cache->uc_allocbucket != NULL)
 					ups.ups_cache_free +=
 					    cache->uc_allocbucket->ub_cnt;
 				if (cache->uc_freebucket != NULL)
 					ups.ups_cache_free +=
 					    cache->uc_freebucket->ub_cnt;
 				ups.ups_allocs = cache->uc_allocs;
 				ups.ups_frees = cache->uc_frees;
 skip:
-				if (sbuf_bcat(&sbuf, &ups, sizeof(ups)) < 0) {
-					ZONE_UNLOCK(z);
-					mtx_unlock(&uma_mtx);
-					error = ENOMEM;
-					goto out;
-				}
+				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
 			}
 			ZONE_UNLOCK(z);
 		}
 	}
 	mtx_unlock(&uma_mtx);
-	sbuf_finish(&sbuf);
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
-out:
-	free(buffer, M_TEMP);
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
 	return (error);
 }
 
 #ifdef DDB
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
 	u_int64_t allocs, frees, sleeps;
 	uma_bucket_t bucket;
 	uma_keg_t kz;
 	uma_zone_t z;
 	int cachefree;
 
 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 	    "Requests", "Sleeps");
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
 				allocs = z->uz_allocs;
 				frees = z->uz_frees;
 				sleeps = z->uz_sleeps;
 				cachefree = 0;
 			} else
 				uma_zone_sumstat(z, &cachefree, &allocs,
 				    &frees, &sleeps);
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
 				cachefree += bucket->ub_cnt;
 			db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
 			    (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
 			    (uintmax_t)allocs, sleeps);
 		}
 	}
 }
 #endif
Index: head/sys/vm/vm_phys.c
===================================================================
--- head/sys/vm/vm_phys.c	(revision 212369)
+++ head/sys/vm/vm_phys.c	(revision 212370)
@@ -1,901 +1,886 @@
 /*-
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <ddb/ddb.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_reserv.h>
 
 /*
  * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
  * domain.  These extra lists are stored at the end of the regular
  * free lists starting with VM_NFREELIST.
  */
 #define VM_RAW_NFREELIST	(VM_NFREELIST + VM_NDOMAIN - 1)
 
 struct vm_freelist {
 	struct pglist pl;
 	int lcnt;
 };
 
 struct vm_phys_seg {
 	vm_paddr_t	start;
 	vm_paddr_t	end;
 	vm_page_t	first_page;
 	int		domain;
 	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
 };
 
 struct mem_affinity *mem_affinity;
 
 static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
 
 static int vm_phys_nsegs;
 
 static struct vm_freelist
     vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
 static struct vm_freelist
 (*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
 
 static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
 
 static int cnt_prezero;
 SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
     &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
 
 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
 
 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
 #if VM_NDOMAIN > 1
 static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
 #endif
 
 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
     int domain);
 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order);
 
 /*
  * Outputs the state of the physical memory allocator, specifically,
  * the amount of physical memory in each free list.
  */
 static int
 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct vm_freelist *fl;
-	char *cbuf;
-	const int cbufsize = vm_nfreelists*(VM_NFREEORDER + 1)*81;
 	int error, flind, oind, pind;
 
-	cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
-	sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	for (flind = 0; flind < vm_nfreelists; flind++) {
 		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
 		    "\n  ORDER (SIZE)  |  NUMBER"
 		    "\n              ", flind);
 		for (pind = 0; pind < VM_NFREEPOOL; pind++)
 			sbuf_printf(&sbuf, "  |  POOL %d", pind);
 		sbuf_printf(&sbuf, "\n--            ");
 		for (pind = 0; pind < VM_NFREEPOOL; pind++)
 			sbuf_printf(&sbuf, "-- --      ");
 		sbuf_printf(&sbuf, "--\n");
 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 			sbuf_printf(&sbuf, "  %2.2d (%6.6dK)", oind,
 			    1 << (PAGE_SHIFT - 10 + oind));
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[flind][pind];
 				sbuf_printf(&sbuf, "  |  %6.6d", fl[oind].lcnt);
 			}
 			sbuf_printf(&sbuf, "\n");
 		}
 	}
-	sbuf_finish(&sbuf);
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
-	free(cbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Outputs the set of physical memory segments.
  */
 static int
 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct vm_phys_seg *seg;
-	char *cbuf;
-	const int cbufsize = VM_PHYSSEG_MAX*(VM_NFREEORDER + 1)*81;
 	int error, segind;
 
-	cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
-	sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
 		seg = &vm_phys_segs[segind];
 		sbuf_printf(&sbuf, "start:     %#jx\n",
 		    (uintmax_t)seg->start);
 		sbuf_printf(&sbuf, "end:       %#jx\n",
 		    (uintmax_t)seg->end);
 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
 	}
-	sbuf_finish(&sbuf);
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
-	free(cbuf, M_TEMP);
 	return (error);
 }
 
 #if VM_NDOMAIN > 1
 /*
  * Outputs the set of free list lookup lists.
  */
 static int
 sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
-	char *cbuf;
-	const int cbufsize = (vm_nfreelists + 1) * VM_NDOMAIN * 81;
 	int domain, error, flind, ndomains;
 
 	ndomains = vm_nfreelists - VM_NFREELIST + 1;
-	cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
-	sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	for (domain = 0; domain < ndomains; domain++) {
 		sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
 		for (flind = 0; flind < vm_nfreelists; flind++)
 			sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
 			    vm_phys_lookup_lists[domain][flind]);
 	}
-	sbuf_finish(&sbuf);
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
-	free(cbuf, M_TEMP);
 	return (error);
 }
 #endif
 	
 /*
  * Create a physical memory segment.
  */
 static void
 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
 {
 	struct vm_phys_seg *seg;
 #ifdef VM_PHYSSEG_SPARSE
 	long pages;
 	int segind;
 
 	pages = 0;
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		pages += atop(seg->end - seg->start);
 	}
 #endif
 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
 	seg = &vm_phys_segs[vm_phys_nsegs++];
 	seg->start = start;
 	seg->end = end;
 	seg->domain = domain;
 #ifdef VM_PHYSSEG_SPARSE
 	seg->first_page = &vm_page_array[pages];
 #else
 	seg->first_page = PHYS_TO_VM_PAGE(start);
 #endif
 #if VM_NDOMAIN > 1
 	if (flind == VM_FREELIST_DEFAULT && domain != 0) {
 		flind = VM_NFREELIST + (domain - 1);
 		if (flind >= vm_nfreelists)
 			vm_nfreelists = flind + 1;
 	}
 #endif
 	seg->free_queues = &vm_phys_free_queues[flind];
 }
 
 static void
 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
 {
 	int i;
 
 	if (mem_affinity == NULL) {
 		_vm_phys_create_seg(start, end, flind, 0);
 		return;
 	}
 
 	for (i = 0;; i++) {
 		if (mem_affinity[i].end == 0)
 			panic("Reached end of affinity info");
 		if (mem_affinity[i].end <= start)
 			continue;
 		if (mem_affinity[i].start > start)
 			panic("No affinity info for start %jx",
 			    (uintmax_t)start);
 		if (mem_affinity[i].end >= end) {
 			_vm_phys_create_seg(start, end, flind,
 			    mem_affinity[i].domain);
 			break;
 		}
 		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
 		    mem_affinity[i].domain);
 		start = mem_affinity[i].end;
 	}
 }
 
 /*
  * Initialize the physical memory allocator.
  */
 void
 vm_phys_init(void)
 {
 	struct vm_freelist *fl;
 	int flind, i, oind, pind;
 #if VM_NDOMAIN > 1
 	int ndomains, j;
 #endif
 
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 #ifdef	VM_FREELIST_ISADMA
 		if (phys_avail[i] < 16777216) {
 			if (phys_avail[i + 1] > 16777216) {
 				vm_phys_create_seg(phys_avail[i], 16777216,
 				    VM_FREELIST_ISADMA);
 				vm_phys_create_seg(16777216, phys_avail[i + 1],
 				    VM_FREELIST_DEFAULT);
 			} else {
 				vm_phys_create_seg(phys_avail[i],
 				    phys_avail[i + 1], VM_FREELIST_ISADMA);
 			}
 			if (VM_FREELIST_ISADMA >= vm_nfreelists)
 				vm_nfreelists = VM_FREELIST_ISADMA + 1;
 		} else
 #endif
 #ifdef	VM_FREELIST_HIGHMEM
 		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
 			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
 				vm_phys_create_seg(phys_avail[i],
 				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
 				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
 				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
 			} else {
 				vm_phys_create_seg(phys_avail[i],
 				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
 			}
 			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
 				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
 		} else
 #endif
 		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
 		    VM_FREELIST_DEFAULT);
 	}
 	for (flind = 0; flind < vm_nfreelists; flind++) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 			fl = vm_phys_free_queues[flind][pind];
 			for (oind = 0; oind < VM_NFREEORDER; oind++)
 				TAILQ_INIT(&fl[oind].pl);
 		}
 	}
 #if VM_NDOMAIN > 1
 	/*
 	 * Build a free list lookup list for each domain.  All of the
 	 * memory domain lists are inserted at the VM_FREELIST_DEFAULT
 	 * index in a round-robin order starting with the current
 	 * domain.
 	 */
 	ndomains = vm_nfreelists - VM_NFREELIST + 1;
 	for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
 		for (i = 0; i < ndomains; i++)
 			vm_phys_lookup_lists[i][flind] =
 			    &vm_phys_free_queues[flind];
 	for (i = 0; i < ndomains; i++)
 		for (j = 0; j < ndomains; j++) {
 			flind = (i + j) % ndomains;
 			if (flind == 0)
 				flind = VM_FREELIST_DEFAULT;
 			else
 				flind += VM_NFREELIST - 1;
 			vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
 			    &vm_phys_free_queues[flind];
 		}
 	for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
 	     flind++)
 		for (i = 0; i < ndomains; i++)
 			vm_phys_lookup_lists[i][flind + ndomains - 1] =
 			    &vm_phys_free_queues[flind];
 #else
 	for (flind = 0; flind < vm_nfreelists; flind++)
 		vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
 #endif
 }
 
 /*
  * Split a contiguous, power of two-sized set of physical pages.
  */
 static __inline void
 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
 {
 	vm_page_t m_buddy;
 
 	while (oind > order) {
 		oind--;
 		m_buddy = &m[1 << oind];
 		KASSERT(m_buddy->order == VM_NFREEORDER,
 		    ("vm_phys_split_pages: page %p has unexpected order %d",
 		    m_buddy, m_buddy->order));
 		m_buddy->order = oind;
 		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
 		fl[oind].lcnt++;
         }
 }
 
 /*
  * Initialize a physical page and add it to the free lists.
  */
 void
 vm_phys_add_page(vm_paddr_t pa)
 {
 	vm_page_t m;
 
 	cnt.v_page_count++;
 	m = vm_phys_paddr_to_vm_page(pa);
 	m->phys_addr = pa;
 	m->segind = vm_phys_paddr_to_segind(pa);
 	m->flags = PG_FREE;
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_add_page: page %p has unexpected order %d",
 	    m, m->order));
 	m->pool = VM_FREEPOOL_DEFAULT;
 	pmap_page_init(m);
 	mtx_lock(&vm_page_queue_free_mtx);
 	cnt.v_free_count++;
 	vm_phys_free_pages(m, 0);
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  * Allocate a contiguous, power of two-sized set of physical pages
  * from the free lists.
  *
  * The free page queues must be locked.
  */
 vm_page_t
 vm_phys_alloc_pages(int pool, int order)
 {
 	vm_page_t m;
 	int flind;
 
 	for (flind = 0; flind < vm_nfreelists; flind++) {
 		m = vm_phys_alloc_freelist_pages(flind, pool, order);
 		if (m != NULL)
 			return (m);
 	}
 	return (NULL);
 }
 
 /*
  * Find and dequeue a free page on the given free list, with the 
  * specified pool and order
  */
 vm_page_t
 vm_phys_alloc_freelist_pages(int flind, int pool, int order)
 {	
 	struct vm_freelist *fl;
 	struct vm_freelist *alt;
 	int domain, oind, pind;
 	vm_page_t m;
 
 	KASSERT(flind < VM_NFREELIST,
 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
 
 #if VM_NDOMAIN > 1
 	domain = PCPU_GET(domain);
 #else
 	domain = 0;
 #endif
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	fl = (*vm_phys_lookup_lists[domain][flind])[pool];
 	for (oind = order; oind < VM_NFREEORDER; oind++) {
 		m = TAILQ_FIRST(&fl[oind].pl);
 		if (m != NULL) {
 			TAILQ_REMOVE(&fl[oind].pl, m, pageq);
 			fl[oind].lcnt--;
 			m->order = VM_NFREEORDER;
 			vm_phys_split_pages(m, oind, fl, order);
 			return (m);
 		}
 	}
 
 	/*
 	 * The given pool was empty.  Find the largest
 	 * contiguous, power-of-two-sized set of pages in any
 	 * pool.  Transfer these pages to the given pool, and
 	 * use them to satisfy the allocation.
 	 */
 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 			alt = (*vm_phys_lookup_lists[domain][flind])[pind];
 			m = TAILQ_FIRST(&alt[oind].pl);
 			if (m != NULL) {
 				TAILQ_REMOVE(&alt[oind].pl, m, pageq);
 				alt[oind].lcnt--;
 				m->order = VM_NFREEORDER;
 				vm_phys_set_pool(pool, m, oind);
 				vm_phys_split_pages(m, oind, fl, order);
 				return (m);
 			}
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Allocate physical memory from phys_avail[].
  */
 vm_paddr_t
 vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
 {
 	vm_paddr_t pa;
 	int i;
 
 	size = round_page(size);
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		if (phys_avail[i + 1] - phys_avail[i] < size)
 			continue;
 		pa = phys_avail[i];
 		phys_avail[i] += size;
 		return (pa);
 	}
 	panic("vm_phys_bootstrap_alloc");
 }
 
 /*
  * Find the vm_page corresponding to the given physical address.
  */
 vm_page_t
 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
 {
 	struct vm_phys_seg *seg;
 	int segind;
 
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		if (pa >= seg->start && pa < seg->end)
 			return (&seg->first_page[atop(pa - seg->start)]);
 	}
 	return (NULL);
 }
 
 /*
  * Find the segment containing the given physical address.
  */
 static int
 vm_phys_paddr_to_segind(vm_paddr_t pa)
 {
 	struct vm_phys_seg *seg;
 	int segind;
 
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		if (pa >= seg->start && pa < seg->end)
 			return (segind);
 	}
 	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
 	    (uintmax_t)pa);
 }
 
 /*
  * Free a contiguous, power of two-sized set of physical pages.
  *
  * The free page queues must be locked.
  */
 void
 vm_phys_free_pages(vm_page_t m, int order)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_paddr_t pa, pa_buddy;
 	vm_page_t m_buddy;
 
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_free_pages: page %p has unexpected order %d",
 	    m, m->order));
 	KASSERT(m->pool < VM_NFREEPOOL,
 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
 	    m, m->pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_free_pages: order %d is out of range", order));
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	pa = VM_PAGE_TO_PHYS(m);
 	seg = &vm_phys_segs[m->segind];
 	while (order < VM_NFREEORDER - 1) {
 		pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
 		if (pa_buddy < seg->start ||
 		    pa_buddy >= seg->end)
 			break;
 		m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
 		if (m_buddy->order != order)
 			break;
 		fl = (*seg->free_queues)[m_buddy->pool];
 		TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
 		fl[m_buddy->order].lcnt--;
 		m_buddy->order = VM_NFREEORDER;
 		if (m_buddy->pool != m->pool)
 			vm_phys_set_pool(m->pool, m_buddy, order);
 		order++;
 		pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
 		m = &seg->first_page[atop(pa - seg->start)];
 	}
 	m->order = order;
 	fl = (*seg->free_queues)[m->pool];
 	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
 	fl[order].lcnt++;
 }
 
 /*
  * Set the pool for a contiguous, power of two-sized set of physical pages. 
  */
 void
 vm_phys_set_pool(int pool, vm_page_t m, int order)
 {
 	vm_page_t m_tmp;
 
 	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
 		m_tmp->pool = pool;
 }
 
 /*
  * Search for the given physical page "m" in the free lists.  If the search
  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
  * FALSE, indicating that "m" is not in the free lists.
  *
  * The free page queues must be locked.
  */
 boolean_t
 vm_phys_unfree_page(vm_page_t m)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_paddr_t pa, pa_half;
 	vm_page_t m_set, m_tmp;
 	int order;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
 	/*
 	 * First, find the contiguous, power of two-sized set of free
 	 * physical pages containing the given physical page "m" and
 	 * assign it to "m_set".
 	 */
 	seg = &vm_phys_segs[m->segind];
 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
 	    order < VM_NFREEORDER - 1; ) {
 		order++;
 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
 		if (pa >= seg->start)
 			m_set = &seg->first_page[atop(pa - seg->start)];
 		else
 			return (FALSE);
 	}
 	if (m_set->order < order)
 		return (FALSE);
 	if (m_set->order == VM_NFREEORDER)
 		return (FALSE);
 	KASSERT(m_set->order < VM_NFREEORDER,
 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
 	    m_set, m_set->order));
 
 	/*
 	 * Next, remove "m_set" from the free lists.  Finally, extract
 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
 	 * is larger than a page, shrink "m_set" by returning the half
 	 * of "m_set" that does not contain "m" to the free lists.
 	 */
 	fl = (*seg->free_queues)[m_set->pool];
 	order = m_set->order;
 	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
 	fl[order].lcnt--;
 	m_set->order = VM_NFREEORDER;
 	while (order > 0) {
 		order--;
 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
 		if (m->phys_addr < pa_half)
 			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
 		else {
 			m_tmp = m_set;
 			m_set = &seg->first_page[atop(pa_half - seg->start)];
 		}
 		m_tmp->order = order;
 		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
 		fl[order].lcnt++;
 	}
 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
 	return (TRUE);
 }
 
 /*
  * Try to zero one physical page.  Used by an idle priority thread.
  */
 boolean_t
 vm_phys_zero_pages_idle(void)
 {
 	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
 	static int flind, oind, pind;
 	vm_page_t m, m_tmp;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	for (;;) {
 		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
 			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
 				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
 					vm_phys_unfree_page(m_tmp);
 					cnt.v_free_count--;
 					mtx_unlock(&vm_page_queue_free_mtx);
 					pmap_zero_page_idle(m_tmp);
 					m_tmp->flags |= PG_ZERO;
 					mtx_lock(&vm_page_queue_free_mtx);
 					cnt.v_free_count++;
 					vm_phys_free_pages(m_tmp, 0);
 					vm_page_zero_count++;
 					cnt_prezero++;
 					return (TRUE);
 				}
 			}
 		}
 		oind++;
 		if (oind == VM_NFREEORDER) {
 			oind = 0;
 			pind++;
 			if (pind == VM_NFREEPOOL) {
 				pind = 0;
 				flind++;
 				if (flind == vm_nfreelists)
 					flind = 0;
 			}
 			fl = vm_phys_free_queues[flind][pind];
 		}
 	}
 }
 
 /*
  * Allocate a contiguous set of physical pages of the given size
  * "npages" from the free lists.  All of the physical pages must be at
  * or above the given physical address "low" and below the given
  * physical address "high".  The given value "alignment" determines the
  * alignment of the first physical page in the set.  If the given value
  * "boundary" is non-zero, then the set of physical pages cannot cross
  * any physical address boundary that is a multiple of that value.  Both
  * "alignment" and "boundary" must be a power of two.
  */
 vm_page_t
 vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
     unsigned long alignment, unsigned long boundary)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	struct vnode *vp;
 	vm_paddr_t pa, pa_last, size;
 	vm_page_t deferred_vdrop_list, m, m_ret;
 	int domain, flind, i, oind, order, pind;
 
 #if VM_NDOMAIN > 1
 	domain = PCPU_GET(domain);
 #else
 	domain = 0;
 #endif
 	size = npages << PAGE_SHIFT;
 	KASSERT(size != 0,
 	    ("vm_phys_alloc_contig: size must not be 0"));
 	KASSERT((alignment & (alignment - 1)) == 0,
 	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
 	KASSERT((boundary & (boundary - 1)) == 0,
 	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
 	deferred_vdrop_list = NULL;
 	/* Compute the queue that is the best fit for npages. */
 	for (order = 0; (1 << order) < npages; order++);
 	mtx_lock(&vm_page_queue_free_mtx);
 #if VM_NRESERVLEVEL > 0
 retry:
 #endif
 	for (flind = 0; flind < vm_nfreelists; flind++) {
 		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = (*vm_phys_lookup_lists[domain][flind])
 				    [pind];
 				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
 					/*
 					 * A free list may contain physical pages
 					 * from one or more segments.
 					 */
 					seg = &vm_phys_segs[m_ret->segind];
 					if (seg->start > high ||
 					    low >= seg->end)
 						continue;
 
 					/*
 					 * Is the size of this allocation request
 					 * larger than the largest block size?
 					 */
 					if (order >= VM_NFREEORDER) {
 						/*
 						 * Determine if a sufficient number
 						 * of subsequent blocks to satisfy
 						 * the allocation request are free.
 						 */
 						pa = VM_PAGE_TO_PHYS(m_ret);
 						pa_last = pa + size;
 						for (;;) {
 							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
 							if (pa >= pa_last)
 								break;
 							if (pa < seg->start ||
 							    pa >= seg->end)
 								break;
 							m = &seg->first_page[atop(pa - seg->start)];
 							if (m->order != VM_NFREEORDER - 1)
 								break;
 						}
 						/* If not, continue to the next block. */
 						if (pa < pa_last)
 							continue;
 					}
 
 					/*
 					 * Determine if the blocks are within the given range,
 					 * satisfy the given alignment, and do not cross the
 					 * given boundary.
 					 */
 					pa = VM_PAGE_TO_PHYS(m_ret);
 					if (pa >= low &&
 					    pa + size <= high &&
 					    (pa & (alignment - 1)) == 0 &&
 					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
 						goto done;
 				}
 			}
 		}
 	}
 #if VM_NRESERVLEVEL > 0
 	if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
 		goto retry;
 #endif
 	mtx_unlock(&vm_page_queue_free_mtx);
 	return (NULL);
 done:
 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
 		fl = (*seg->free_queues)[m->pool];
 		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
 		fl[m->order].lcnt--;
 		m->order = VM_NFREEORDER;
 	}
 	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
 		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
 	fl = (*seg->free_queues)[m_ret->pool];
 	vm_phys_split_pages(m_ret, oind, fl, order);
 	for (i = 0; i < npages; i++) {
 		m = &m_ret[i];
 		vp = vm_page_alloc_init(m);
 		if (vp != NULL) {
 			/*
 			 * Enqueue the vnode for deferred vdrop().
 			 *
 			 * Unmanaged pages don't use "pageq", so it
 			 * can be safely abused to construct a short-
 			 * lived queue of vnodes.
 			 */
 			m->pageq.tqe_prev = (void *)vp;
 			m->pageq.tqe_next = deferred_vdrop_list;
 			deferred_vdrop_list = m;
 		}
 	}
 	for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
 		m = &m_ret[i];
 		KASSERT(m->order == VM_NFREEORDER,
 		    ("vm_phys_alloc_contig: page %p has unexpected order %d",
 		    m, m->order));
 		vm_phys_free_pages(m, 0);
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 	while (deferred_vdrop_list != NULL) {
 		vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
 		deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
 	}
 	return (m_ret);
 }
 
 #ifdef DDB
 /*
  * Show the number of physical pages in each of the free lists.
  */
 DB_SHOW_COMMAND(freepages, db_show_freepages)
 {
 	struct vm_freelist *fl;
 	int flind, oind, pind;
 
 	for (flind = 0; flind < vm_nfreelists; flind++) {
 		db_printf("FREE LIST %d:\n"
 		    "\n  ORDER (SIZE)  |  NUMBER"
 		    "\n              ", flind);
 		for (pind = 0; pind < VM_NFREEPOOL; pind++)
 			db_printf("  |  POOL %d", pind);
 		db_printf("\n--            ");
 		for (pind = 0; pind < VM_NFREEPOOL; pind++)
 			db_printf("-- --      ");
 		db_printf("--\n");
 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 			db_printf("  %2.2d (%6.6dK)", oind,
 			    1 << (PAGE_SHIFT - 10 + oind));
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[flind][pind];
 				db_printf("  |  %6.6d", fl[oind].lcnt);
 			}
 			db_printf("\n");
 		}
 		db_printf("\n");
 	}
 }
 #endif
Index: head/sys/vm/vm_reserv.c
===================================================================
--- head/sys/vm/vm_reserv.c	(revision 212369)
+++ head/sys/vm/vm_reserv.c	(revision 212370)
@@ -1,731 +1,726 @@
 /*-
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007-2008 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  *	Superpage reservation management module
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_reserv.h>
 
 /*
  * The reservation system supports the speculative allocation of large physical
  * pages ("superpages").  Speculative allocation enables the fully-automatic
  * utilization of superpages by the virtual memory system.  In other words, no
  * programmatic directives are required to use superpages.
  */
 
 #if VM_NRESERVLEVEL > 0
 
 /*
  * The number of small pages that are contained in a level 0 reservation
  */
 #define	VM_LEVEL_0_NPAGES	(1 << VM_LEVEL_0_ORDER)
 
 /*
  * The number of bits by which a physical address is shifted to obtain the
  * reservation number
  */
 #define	VM_LEVEL_0_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)
 
 /*
  * The size of a level 0 reservation in bytes
  */
 #define	VM_LEVEL_0_SIZE		(1 << VM_LEVEL_0_SHIFT)
 
 /*
  * Computes the index of the small page underlying the given (object, pindex)
  * within the reservation's array of small pages.
  */
 #define	VM_RESERV_INDEX(object, pindex)	\
     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
 
 /*
  * The reservation structure
  *
  * A reservation structure is constructed whenever a large physical page is
  * speculatively allocated to an object.  The reservation provides the small
  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  * within that object.  The reservation's "popcnt" tracks the number of these
  * small physical pages that are in use at any given time.  When and if the
  * reservation is not fully utilized, it appears in the queue of partially-
  * populated reservations.  The reservation always appears on the containing
  * object's list of reservations.
  *
  * A partially-populated reservation can be broken and reclaimed at any time.
  */
 struct vm_reserv {
 	TAILQ_ENTRY(vm_reserv) partpopq;	/* link on vm_rvq_partpop */
 	LIST_ENTRY(vm_reserv) objq;		/* link on the object's "rvq" */
 	vm_object_t	object;			/* containing object */
 	vm_pindex_t	pindex;			/* offset within object */
 	vm_page_t	pages;			/* first page of a superpage */
 	int		popcnt;			/* # of pages in use */
 	char		inpartpopq;		/* TRUE iff on vm_rvq_partpop */
 };
 
 /*
  * The reservation array
  *
  * This array is analogous in function to vm_page_array.  It differs in the
  * respect that it may contain a greater number of useful reservation
  * structures than there are (physical) superpages.  These "invalid"
  * reservation structures exist to trade-off space for time in the
  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  * distinguishable from "valid" reservation structures by inspecting the
  * reservation's "pages" field.  Invalid reservation structures have a NULL
  * "pages" field.
  *
  * vm_reserv_from_page() maps a small (physical) page to an element of this
  * array by computing a physical reservation number from the page's physical
  * address.  The physical reservation number is used as the array index.
  *
  * An "active" reservation is a valid reservation structure that has a non-NULL
  * "object" field and a non-zero "popcnt" field.  In other words, every active
  * reservation belongs to a particular object.  Moreover, every active
  * reservation has an entry in the containing object's list of reservations.  
  */
 static vm_reserv_t vm_reserv_array;
 
 /*
  * The partially-populated reservation queue
  *
  * This queue enables the fast recovery of an unused cached or free small page
  * from a partially-populated reservation.  The reservation at the head of
  * this queue is the least-recently-changed, partially-populated reservation.
  *
  * Access to this queue is synchronized by the free page queue lock.
  */
 static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop =
 			    TAILQ_HEAD_INITIALIZER(vm_rvq_partpop);
 
 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
 
 static long vm_reserv_broken;
 SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
     &vm_reserv_broken, 0, "Cumulative number of broken reservations");
 
 static long vm_reserv_freed;
 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
 
 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
     sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
 
 static long vm_reserv_reclaimed;
 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
 
 static void		vm_reserv_depopulate(vm_reserv_t rv);
 static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
 static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
 			    vm_pindex_t pindex);
 static void		vm_reserv_populate(vm_reserv_t rv);
 static void		vm_reserv_reclaim(vm_reserv_t rv);
 
 /*
  * Sysctl handler for the "partpopq" node under vm.reserv: describes the
  * current state of the partially-populated reservation queue.
  *
  * Emits a heading followed by one line per level giving the total unused
  * space of the queued reservations (in KB) and the number of reservations.
  * The queue is walked with the free page queue lock held; the lock is
  * released before each line is formatted into the sbuf.
  *
  * Returns the value left in "error" by the final output step.
  */
 static int
 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	vm_reserv_t rv;
-	char *cbuf;
-	const int cbufsize = (VM_NRESERVLEVEL + 1) * 81;
 	int counter, error, level, unused_pages;
 
-	cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
-	sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sbuf_printf(&sbuf, "\nLEVEL     SIZE  NUMBER\n\n");
 	/*
 	 * NOTE: there is only one queue; the "[level]" index below is
 	 * commented out, so every iteration walks the same queue.
 	 */
 	for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
 		counter = 0;
 		unused_pages = 0;
 		mtx_lock(&vm_page_queue_free_mtx);
 		TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) {
 			counter++;
 			/* Pages of the reservation that are not in use. */
 			unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 		}
 		mtx_unlock(&vm_page_queue_free_mtx);
 		sbuf_printf(&sbuf, "%5.5d: %6.6dK, %6.6d\n", level,
 		    unused_pages * (PAGE_SIZE / 1024), counter);
 	}
-	sbuf_finish(&sbuf);
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
-	free(cbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Drops one page from the given reservation's population count.
  *
  * When the count reaches zero, the reservation is detached from its object
  * and the whole superpage is handed back to the physical memory allocator.
  * Otherwise, the reservation is (re)queued at the tail of the
  * partially-populated reservation queue.
  *
  * The free page queue lock must be held.
  */
 static void
 vm_reserv_depopulate(vm_reserv_t rv)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_depopulate: reserv %p is free", rv));
 	KASSERT(rv->popcnt > 0,
 	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
 
 	/* Unhook the reservation from its current queue position, if any. */
 	if (rv->inpartpopq) {
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
 	}
 
 	if (--rv->popcnt != 0) {
 		/* Still partially populated: requeue at the tail. */
 		rv->inpartpopq = TRUE;
 		TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
 		return;
 	}
 
 	/* Nothing left in use: destroy the reservation. */
 	LIST_REMOVE(rv, objq);
 	rv->object = NULL;
 	vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
 	vm_reserv_freed++;
 }
 
 /*
  * Maps the given page to the reservation array element selected by the
  * page's physical address.  The page might belong to that reservation.
  */
 static __inline vm_reserv_t
 vm_reserv_from_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	return (vm_reserv_array + (pa >> VM_LEVEL_0_SHIFT));
 }
 
 /*
  * Reports whether the given page index lies within the range of indices
  * covered by the given reservation: TRUE if it does and FALSE if not.
  */
 static __inline boolean_t
 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
 {
 	vm_pindex_t delta;
 
 	/* Any bit set above the in-reservation index means "outside". */
 	delta = pindex - rv->pindex;
 	if ((delta & ~(VM_LEVEL_0_NPAGES - 1)) != 0)
 		return (FALSE);
 	return (TRUE);
 }
 
 /*
  * Adds one page to the given reservation's population count.  Unless the
  * reservation has become fully populated, it is (re)queued at the tail of
  * the partially-populated reservation queue.
  *
  * The free page queue must be locked.
  */
 static void
 vm_reserv_populate(vm_reserv_t rv)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_populate: reserv %p is free", rv));
 	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 	    ("vm_reserv_populate: reserv %p is already full", rv));
 
 	/* Unhook the reservation from its current queue position, if any. */
 	if (rv->inpartpopq) {
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
 	}
 
 	rv->popcnt++;
 	if (rv->popcnt >= VM_LEVEL_0_NPAGES) {
 		/* Fully populated reservations are not queued. */
 		return;
 	}
 	rv->inpartpopq = TRUE;
 	TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
 }
 
 /*
  * Allocates a page from an existing or newly-created reservation.
  *
  * Returns the page of the reservation that corresponds to "pindex", or NULL
  * when no reservation can supply it: the index cannot be covered by a
  * reservation, the corresponding page of an existing reservation is neither
  * cached nor free, a new reservation would not fit between the neighboring
  * pages or reservations of the object, a new reservation would extend past
  * the end of a vnode-backed object, or no superpage could be allocated.
  *
  * The object and free page queue must be locked.
  */
 vm_page_t
 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m, mpred, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
 	/*
 	 * Is a reservation fundamentally not possible?  That is, would the
 	 * reservation's first index ("pindex" minus the index within the
 	 * reservation) be negative, or is "pindex" beyond the object's size?
 	 */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (pindex < VM_RESERV_INDEX(object, pindex) ||
 	    pindex >= object->size)
 		return (NULL);
 
 	/*
 	 * Look for an existing reservation.
 	 *
 	 * Starting at the root of the object's page tree, inspect the
 	 * reservations of the pages around "pindex".  If the loop ends
 	 * without returning, the code below treats "mpred" and "msucc" as the
 	 * object's nearest pages before and after "pindex"; either may be
 	 * NULL.
 	 */
 	msucc = NULL;
 	mpred = object->root;
 	while (mpred != NULL) {
 		KASSERT(mpred->pindex != pindex,
 		    ("vm_reserv_alloc_page: pindex already allocated"));
 		rv = vm_reserv_from_page(mpred);
 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) {
 			m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
 			/* Handle vm_page_rename(m, new_object, ...). */
 			if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
 				return (NULL);
 			vm_reserv_populate(rv);
 			return (m);
 		} else if (mpred->pindex < pindex) {
 			/*
 			 * "mpred" precedes "pindex".  Stop if a successor was
 			 * already recorded or if "mpred" has no next page;
 			 * otherwise inspect the next page's reservation.
 			 */
 			if (msucc != NULL ||
 			    (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
 				break;
 			KASSERT(msucc->pindex != pindex,
 			    ("vm_reserv_alloc_page: pindex already allocated"));
 			rv = vm_reserv_from_page(msucc);
 			if (rv->object == object &&
 			    vm_reserv_has_pindex(rv, pindex)) {
 				m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
 				/* Handle vm_page_rename(m, new_object, ...). */
 				if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
 					return (NULL);
 				vm_reserv_populate(rv);
 				return (m);
 			} else if (pindex < msucc->pindex)
 				break;
 		} else if (msucc == NULL) {
 			/*
 			 * "mpred" follows "pindex": record it as the successor
 			 * and step back to the page before it.
 			 */
 			msucc = mpred;
 			mpred = TAILQ_PREV(msucc, pglist, listq);
 			continue;
 		}
 		/*
 		 * This pair of pages did not bracket "pindex".  Splay the
 		 * tree on "pindex" and retry from the new root.
 		 */
 		msucc = NULL;
 		mpred = object->root = vm_page_splay(pindex, object->root);
 	}
 
 	/*
 	 * Determine the first index to the left that can be used.
 	 */
 	if (mpred == NULL)
 		leftcap = 0;
 	else if ((rv = vm_reserv_from_page(mpred))->object != object)
 		leftcap = mpred->pindex + 1;
 	else
 		leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 
 	/*
 	 * Determine the first index to the right that cannot be used.
 	 */
 	if (msucc == NULL)
 		rightcap = pindex + VM_LEVEL_0_NPAGES;
 	else if ((rv = vm_reserv_from_page(msucc))->object != object)
 		rightcap = msucc->pindex;
 	else
 		rightcap = rv->pindex;
 
 	/*
 	 * Determine if a reservation fits between the first index to
 	 * the left that can be used and the first index to the right
 	 * that cannot be used.
 	 */
 	first = pindex - VM_RESERV_INDEX(object, pindex);
 	if (first < leftcap || first + VM_LEVEL_0_NPAGES > rightcap)
 		return (NULL);
 
 	/*
 	 * Would a new reservation extend past the end of the given object?
 	 */
 	if (object->size < first + VM_LEVEL_0_NPAGES) {
 		/*
 		 * Don't allocate a new reservation if the object is a vnode or
 		 * backed by another object that is a vnode.
 		 */
 		if (object->type == OBJT_VNODE ||
 		    (object->backing_object != NULL &&
 		    object->backing_object->type == OBJT_VNODE))
 			return (NULL);
 		/* Speculate that the object may grow. */
 	}
 
 	/*
 	 * Allocate a new reservation.  On success, attach it to the object,
 	 * count the requested page as populated, and return that page; on
 	 * failure, "m" is NULL and is returned as is.
 	 */
 	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
 	if (m != NULL) {
 		rv = vm_reserv_from_page(m);
 		KASSERT(rv->pages == m,
 		    ("vm_reserv_alloc_page: reserv %p's pages is corrupted",
 		    rv));
 		KASSERT(rv->object == NULL,
 		    ("vm_reserv_alloc_page: reserv %p isn't free", rv));
 		LIST_INSERT_HEAD(&object->rvq, rv, objq);
 		rv->object = object;
 		rv->pindex = first;
 		KASSERT(rv->popcnt == 0,
 		    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted",
 		    rv));
 		KASSERT(!rv->inpartpopq,
 		    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE",
 		    rv));
 		vm_reserv_populate(rv);
 		m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
 	}
 	return (m);
 }
 
 /*
  * Breaks every reservation that belongs to the given object.  The cached and
  * free pages of each reservation are released to the physical memory
  * allocator; pages that are in use are merely uncounted.
  *
  * Acquires and releases the free page queue lock.
  */
 void
 vm_reserv_break_all(vm_object_t object)
 {
 	vm_page_t pg;
 	vm_reserv_t rv;
 	int idx;
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	for (;;) {
 		rv = LIST_FIRST(&object->rvq);
 		if (rv == NULL)
 			break;
 		KASSERT(rv->object == object,
 		    ("vm_reserv_break_all: reserv %p is corrupted", rv));
 
 		/* Detach the reservation from the queue and the object. */
 		if (rv->inpartpopq) {
 			TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 			rv->inpartpopq = FALSE;
 		}
 		LIST_REMOVE(rv, objq);
 		rv->object = NULL;
 
 		for (idx = 0; idx < VM_LEVEL_0_NPAGES; idx++) {
 			pg = &rv->pages[idx];
 			if ((pg->flags & (PG_CACHED | PG_FREE)) == 0)
 				rv->popcnt--;
 			else
 				vm_phys_free_pages(pg, 0);
 		}
 		KASSERT(rv->popcnt == 0,
 		    ("vm_reserv_break_all: reserv %p's popcnt is corrupted",
 		    rv));
 		vm_reserv_broken++;
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  * Frees the given page if the reservation selected by its physical address
  * is in use by an object.  Returns TRUE if the page was freed through the
  * reservation and FALSE if the caller must free it by other means.
  *
  * The free page queue lock must be held.
  */
 boolean_t
 vm_reserv_free_page(vm_page_t m)
 {
 	vm_reserv_t rv;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	rv = vm_reserv_from_page(m);
 	if (rv->object == NULL)
 		return (FALSE);
 	vm_reserv_depopulate(rv);
 	return (TRUE);
 }
 
 /*
  * Initializes the reservation management system.  Specifically, initializes
  * the reservation array.
  *
  * Requires that vm_page_array and first_page are initialized!
  */
 void
 vm_reserv_init(void)
 {
 	vm_paddr_t paddr;
 	int i;
 
 	/*
 	 * Initialize the reservation array.  Specifically, initialize the
 	 * "pages" field for every element that has an underlying superpage.
 	 */
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE);
 		while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) {
 			vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
 			    PHYS_TO_VM_PAGE(paddr);
 			paddr += VM_LEVEL_0_SIZE;
 		}
 	}
 }
 
 /*
  * Returns the reservation level (always 0 here) if the reservation selected
  * by the given page is fully populated, and -1 otherwise.
  */
 int
 vm_reserv_level_iffullpop(vm_page_t m)
 {
 	vm_reserv_t rv;
 
 	rv = vm_reserv_from_page(m);
 	if (rv->popcnt != VM_LEVEL_0_NPAGES)
 		return (-1);
 	return (0);
 }
 
 /*
  * Prepare for the reactivation of a cached page.
  *
  * First, suppose that the given page "m" was allocated individually, i.e., not
  * as part of a reservation, and cached.  Then, suppose a reservation
  * containing "m" is allocated by the same object.  Although "m" and the
  * reservation belong to the same object, "m"'s pindex may not match the
  * reservation's.
  *
  * Returns FALSE if the reservation selected by "m" is not in use by any
  * object.  Otherwise, returns TRUE after either counting "m" as populated
  * (its object and pindex match the reservation) or breaking the reservation
  * (they do not match).  When the reservation is broken, every other cached or
  * free page in it is released to the physical memory allocator; "m" itself
  * is deliberately not released.
  *
  * The free page queue must be locked.
  */
 boolean_t
 vm_reserv_reactivate_page(vm_page_t m)
 {
 	vm_reserv_t rv;
 	int i, m_index;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	rv = vm_reserv_from_page(m);
 	if (rv->object == NULL)
 		return (FALSE);
 	KASSERT((m->flags & PG_CACHED) != 0,
 	    ("vm_reserv_reactivate_page: page %p is not cached", m));
 	if (m->object == rv->object &&
 	    m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex))
 		vm_reserv_populate(rv);
 	else {
 		KASSERT(rv->inpartpopq,
 		    ("vm_reserv_reactivate_page: reserv %p's inpartpopq is FALSE",
 		    rv));
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
 		LIST_REMOVE(rv, objq);
 		rv->object = NULL;
 		m_index = m - rv->pages;
 		for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
 			/* Don't vm_phys_free_pages(m, 0). */
 			if (i == m_index)
 				continue;
 			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 				vm_phys_free_pages(&rv->pages[i], 0);
 			else
 				rv->popcnt--;
 		}
 		KASSERT(rv->popcnt == 0,
 		    ("vm_reserv_reactivate_page: reserv %p's popcnt is corrupted",
 		    rv));
 		vm_reserv_broken++;
 	}
 	return (TRUE);
 }
 
 /*
  * Breaks the given partially-populated reservation: removes it from the
  * partially-populated queue and from its object, and releases its cached and
  * free pages to the physical memory allocator.
  *
  * The free page queue lock must be held.
  */
 static void
 vm_reserv_reclaim(vm_reserv_t rv)
 {
 	vm_page_t pg;
 	int idx;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
 	/* Take the reservation off the partially-populated queue. */
 	KASSERT(rv->inpartpopq,
 	    ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv));
 	TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 	rv->inpartpopq = FALSE;
 
 	/* Detach it from the owning object. */
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_reclaim: reserv %p is free", rv));
 	LIST_REMOVE(rv, objq);
 	rv->object = NULL;
 
 	for (idx = 0; idx < VM_LEVEL_0_NPAGES; idx++) {
 		pg = &rv->pages[idx];
 		if ((pg->flags & (PG_CACHED | PG_FREE)) == 0)
 			rv->popcnt--;
 		else
 			vm_phys_free_pages(pg, 0);
 	}
 	KASSERT(rv->popcnt == 0,
 	    ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv));
 	vm_reserv_reclaimed++;
 }
 
 /*
  * Breaks the reservation at the head of the partially-populated reservation
  * queue, if there is one, releasing its cached and free pages to the
  * physical memory allocator.  Returns TRUE if a reservation was broken and
  * FALSE if the queue was empty.
  *
  * The free page queue lock must be held.
  */
 boolean_t
 vm_reserv_reclaim_inactive(void)
 {
 	vm_reserv_t head;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	head = TAILQ_FIRST(&vm_rvq_partpop);
 	if (head == NULL)
 		return (FALSE);
 	vm_reserv_reclaim(head);
 	return (TRUE);
 }
 
 /*
  * Searches the partially-populated reservation queue for the least recently
  * active reservation with unused pages, i.e., cached or free, that satisfy the
  * given request for contiguous physical memory.  If a satisfactory reservation
  * is found, it is broken.  Returns TRUE if a reservation is broken and FALSE
  * otherwise.
  *
  * "size" is the length of the request in bytes.  A run of unused pages
  * qualifies only if it starts at or above "low", ends at or below "high",
  * starts on a multiple of "alignment", and does not cross a multiple of
  * "boundary".  Requests larger than a reservation minus one page are never
  * satisfied here.
  *
  * The free page queue lock must be held.
  */
 boolean_t
 vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low, vm_paddr_t high,
     unsigned long alignment, unsigned long boundary)
 {
 	vm_paddr_t pa, pa_length;
 	vm_reserv_t rv;
 	int i;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	if (size > VM_LEVEL_0_SIZE - PAGE_SIZE)
 		return (FALSE);
 	TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
 		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 		if (pa + PAGE_SIZE - size < low) {
 			/* this entire reservation is too low; go to next */
 			continue;
 		}
 		/* Length in bytes of the current run of unused pages. */
 		pa_length = 0;
 		for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
 			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) {
 				pa_length += PAGE_SIZE;
 				if (pa_length == PAGE_SIZE) {
 					/*
 					 * First page of a run: validate the
 					 * run's starting address once.
 					 */
 					pa = VM_PAGE_TO_PHYS(&rv->pages[i]);
 					if (pa + size > high) {
 						/* skip to next reservation */
 						break;
 					} else if (pa < low ||
 					    (pa & (alignment - 1)) != 0 ||
 					    ((pa ^ (pa + size - 1)) &
 					    ~(boundary - 1)) != 0)
 						pa_length = 0;
 				}
 				/*
 				 * Test the length on every unused page,
 				 * including the first page of a run, so that
 				 * a request of at most one page succeeds.
 				 */
 				if (pa_length >= size) {
 					vm_reserv_reclaim(rv);
 					return (TRUE);
 				}
 			} else
 				pa_length = 0;
 		}
 	}
 	return (FALSE);
 }
 
 /*
  * Transfers the reservation underlying the given page from "old_object" to
  * "new_object", shifting the reservation's pindex down by
  * "old_object_offset".  Does nothing if the reservation does not belong to
  * "old_object".
  *
  * The new object must be locked.
  */
 void
 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
     vm_pindex_t old_object_offset)
 {
 	vm_reserv_t rv;
 
 	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
 	rv = vm_reserv_from_page(m);
 
 	/* Cheap test without the lock; repeated below under the lock. */
 	if (rv->object != old_object)
 		return;
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (rv->object == old_object) {
 		LIST_REMOVE(rv, objq);
 		LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 		rv->object = new_object;
 		rv->pindex -= old_object_offset;
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  * Allocates the virtual and physical memory required by the reservation
  * management system's data structures, in particular, the reservation array.
  *
  * The array is carved from the top of the physical range that ends at "end"
  * and is mapped at the virtual address passed by reference in "vaddr".
  * Returns the new, lower end of available physical memory.
  */
 vm_paddr_t
 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
 {
 	vm_paddr_t array_start;
 	size_t nbytes;
 
 	/*
 	 * Size the array from "high_water", rounded up, because every small
 	 * page is mapped to an array element by its physical address.  The
 	 * array can therefore have more elements than there are superpages.
 	 */
 	nbytes = howmany(high_water, VM_LEVEL_0_SIZE) *
 	    sizeof(struct vm_reserv);
 
 	/*
 	 * Map the physical memory for the array and zero it.  The next
 	 * available virtual address is returned through "vaddr".
 	 */
 	array_start = end - round_page(nbytes);
 	vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, array_start, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	bzero(vm_reserv_array, nbytes);
 
 	/* The array's start is the new end of available physical memory. */
 	return (array_start);
 }
 
 #endif	/* VM_NRESERVLEVEL > 0 */