Index: head/sys/dev/en/midway.c
===================================================================
--- head/sys/dev/en/midway.c	(revision 276691)
+++ head/sys/dev/en/midway.c	(revision 276692)
@@ -1,3367 +1,3367 @@
 /*	$NetBSD: midway.c,v 1.30 1997/09/29 17:40:38 chuck Exp $	*/
 /*	(sync'd to midway.c 1.68)	*/
 
 /*-
  * Copyright (c) 1996 Charles D. Cranor and Washington University.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Charles D. Cranor and
  *	Washington University.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *
  * m i d w a y . c   e n i 1 5 5   d r i v e r 
  *
  * author: Chuck Cranor <chuck@ccrc.wustl.edu>
  * started: spring, 1996 (written from scratch).
  *
  * notes from the author:
  *   Extra special thanks go to Werner Almesberger, EPFL LRC.   Werner's
  *   ENI driver was especially useful in figuring out how this card works.
  *   I would also like to thank Werner for promptly answering email and being
  *   generally helpful.
  */
 
 #define	EN_DIAG
 #define EN_DDBHOOK	1	/* compile in ddb functions */
 
 /*
  * Note on EN_ENIDMAFIX: the byte aligner on the ENI version of the card
  * appears to be broken.   it works just fine if there is no load... however
  * when the card is loaded the data get corrupted.   to see this, one only
  * has to use "telnet" over ATM.   do the following command in "telnet":
  * 	cat /usr/share/misc/termcap
  * "telnet" seems to generate lots of 1023 byte mbufs (which make great
  * use of the byte aligner).   watch "netstat -s" for checksum errors.
  * 
  * I further tested this by adding a function that compared the transmit 
  * data on the card's SRAM with the data in the mbuf chain _after_ the 
  * "transmit DMA complete" interrupt.   using the "telnet" test I got data
  * mismatches where the byte-aligned data should have been.   using ddb
  * and en_dumpmem() I verified that the DTQs fed into the card were 
  * absolutely correct.   thus, we are forced to concluded that the ENI
  * hardware is buggy.   note that the Adaptec version of the card works
  * just fine with byte DMA.
  *
  * bottom line: we set EN_ENIDMAFIX to 1 to avoid byte DMAs on the ENI
  * card.
  */
 
 #if defined(DIAGNOSTIC) && !defined(EN_DIAG)
 #define EN_DIAG			/* link in with master DIAG option */
 #endif
 
 #define EN_COUNT(X) (X)++
 
 #ifdef EN_DEBUG
 
 #undef	EN_DDBHOOK
 #define	EN_DDBHOOK	1
 
 /*
  * This macro removes almost all the EN_DEBUG conditionals in the code that
  * would otherwise make the code a good deal less readable.
  */
 #define DBG(SC, FL, PRINT) do {						\
 	if ((SC)->debug & DBG_##FL) {					\
 		device_printf((SC)->dev, "%s: "#FL": ", __func__);	\
 		printf PRINT;						\
 		printf("\n");						\
 	}								\
     } while (0)
 
 enum {
 	DBG_INIT	= 0x0001,	/* debug attach/detach */
 	DBG_TX		= 0x0002,	/* debug transmitting */
 	DBG_SERV	= 0x0004,	/* debug service interrupts */
 	DBG_IOCTL	= 0x0008,	/* debug ioctls */
 	DBG_VC		= 0x0010,	/* debug VC handling */
 	DBG_INTR	= 0x0020,	/* debug interrupts */
 	DBG_DMA		= 0x0040,	/* debug DMA probing */
 	DBG_IPACKETS	= 0x0080,	/* print input packets */
 	DBG_REG		= 0x0100,	/* print all register access */
 	DBG_LOCK	= 0x0200,	/* debug locking */
 };
 
 #else /* EN_DEBUG */
 
 #define DBG(SC, FL, PRINT) do { } while (0)
 
 #endif /* EN_DEBUG */
 
 #include "opt_inet.h"
 #include "opt_natm.h"
 #include "opt_ddb.h"
 
 #ifdef DDB
 #undef	EN_DDBHOOK
 #define	EN_DDBHOOK	1
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/queue.h>
 #include <sys/sockio.h>
 #include <sys/socket.h>
 #include <sys/mbuf.h>
 #include <sys/endian.h>
 #include <sys/stdint.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_atm.h>
 
 #if defined(NATM) || defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/if_atm.h>
 #endif
 #endif
 
 #ifdef NATM
 #include <netnatm/natm.h>
 #endif
 
 #include <sys/bus.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <machine/resource.h>
 #include <dev/utopia/utopia.h>
 #include <dev/en/midwayreg.h>
 #include <dev/en/midwayvar.h>
 
 #include <net/bpf.h>
 
 /*
  * params
  */
 #ifndef EN_TXHIWAT
 #define EN_TXHIWAT	(64 * 1024)	/* max 64 KB waiting to be DMAd out */
 #endif
 
 SYSCTL_DECL(_hw_atm);
 
 /*
  * dma tables
  *
  * The plan is indexed by the number of 32-bit words to transfer.
  * The maximum index is 15 words (60 bytes).
  */
 /*
  * One entry of the DMA plan: which burst code to use for a transfer and
  * how to turn its byte count into the count field of the DMA descriptor.
  */
 struct en_dmatab {
 	uint8_t bcode;		/* DMA burst code (MIDDMA_*) for the queue entry */
 	uint8_t divshift;	/* byte divisor: count = bytes >> divshift */
 };
 
 /*
  * DMA plan, indexed by the number of 32-bit words to move. Each entry
  * gives the burst code to use and the shift that converts the byte count
  * into the descriptor count for that burst code (e.g. shift 2 for single
  * words, shift 3 for two-word bursts). The trailing comment on each
  * entry is its index.
  */
 static const struct en_dmatab en_dmaplan[] = {
   { 0, 0 },		/* 0 */		{ MIDDMA_WORD, 2},	/* 1 */
   { MIDDMA_2WORD, 3},	/* 2 */		{ MIDDMA_WORD, 2},	/* 3 */
   { MIDDMA_4WORD, 4},	/* 4 */		{ MIDDMA_WORD, 2},	/* 5 */
   { MIDDMA_2WORD, 3},	/* 6 */		{ MIDDMA_WORD, 2},	/* 7 */
   { MIDDMA_8WORD, 5},   /* 8 */		{ MIDDMA_WORD, 2},	/* 9 */
   { MIDDMA_2WORD, 3},	/* 10 */	{ MIDDMA_WORD, 2},	/* 11 */
   { MIDDMA_4WORD, 4},	/* 12 */	{ MIDDMA_WORD, 2},	/* 13 */
   { MIDDMA_2WORD, 3},	/* 14 */	{ MIDDMA_WORD, 2},	/* 15 */
   { MIDDMA_16WORD,6},	/* 16 */
 };
 
 /*
  * prototypes
  */
 #ifdef EN_DDBHOOK
 int en_dump(int unit, int level);
 int en_dumpmem(int,int,int);
 #endif
 static void en_close_finish(struct en_softc *sc, struct en_vcc *vc);
 
 /*
  * Softc mutex helpers.
  *
  * EN_LOCK/EN_UNLOCK acquire/release the per-device mutex (en_mtx) and emit
  * a DBG_LOCK trace with the source line. EN_CHECKLOCK asserts that the
  * mutex is owned by the caller.
  *
  * The macros operate on their argument (not on a variable that merely
  * happens to be called "sc" at the call site), so they are safe to use
  * with any softc expression.
  */
 #define EN_LOCK(SC)	do {				\
 	DBG(SC, LOCK, ("ENLOCK %d\n", __LINE__));	\
 	mtx_lock(&(SC)->en_mtx);			\
     } while (0)
 #define EN_UNLOCK(SC)	do {				\
 	DBG(SC, LOCK, ("ENUNLOCK %d\n", __LINE__));	\
 	mtx_unlock(&(SC)->en_mtx);			\
     } while (0)
 #define EN_CHECKLOCK(SC)	mtx_assert(&(SC)->en_mtx, MA_OWNED)
 
 /*
  * While a transmit mbuf is waiting to get transmit DMA resources we
  * need to keep some information with it. We don't want to allocate
  * additional memory for this so we stuff it into free fields in the
  * mbuf packet header. Neither the checksum fields nor the rcvif field are used
  * so use these.
  */
 #define TX_AAL5		0x1	/* transmit AAL5 PDU */
 #define TX_HAS_TBD	0x2	/* TBD did fit into mbuf */
 #define TX_HAS_PAD	0x4	/* padding did fit into mbuf */
 #define TX_HAS_PDU	0x8	/* PDU trailer did fit into mbuf */
 
 /*
  * Pack/unpack the per-packet transmit state into the mbuf packet header:
  *
  *   csum_data : VCI in the low MID_VCI_BITS bits, TX_* flags above them
  *   csum_flags: user data length in bits 0..15, pad byte count (6 bits)
  *               starting at bit 16
  *   rcvif     : pointer to the DMA map allocated for this packet
  *
  * Every macro argument is parenthesized so that expression arguments
  * (e.g. "a | b") are masked and shifted as a whole.
  */
 #define MBUF_SET_TX(M, VCI, FLAGS, DATALEN, PAD, MAP) do {		\
 	(M)->m_pkthdr.csum_data = (VCI) | ((FLAGS) << MID_VCI_BITS);	\
 	(M)->m_pkthdr.csum_flags = ((DATALEN) & 0xffff) |		\
 	    (((PAD) & 0x3f) << 16);					\
 	(M)->m_pkthdr.rcvif = (void *)(MAP);				\
     } while (0)
 
 #define MBUF_GET_TX(M, VCI, FLAGS, DATALEN, PAD, MAP) do {		\
 	(VCI) = (M)->m_pkthdr.csum_data & ((1 << MID_VCI_BITS) - 1);	\
 	(FLAGS) = ((M)->m_pkthdr.csum_data >> MID_VCI_BITS) & 0xf;	\
 	(DATALEN) = (M)->m_pkthdr.csum_flags & 0xffff;			\
 	(PAD) = ((M)->m_pkthdr.csum_flags >> 16) & 0x3f;		\
 	(MAP) = (void *)((M)->m_pkthdr.rcvif);				\
     } while (0)
 
 
 /*
  * Advance CUR by VAL inside the circular region [START, STOP): when the
  * sum reaches or passes STOP, the overshoot is re-applied from START.
  */
 #define EN_WRAPADD(START, STOP, CUR, VAL) do {			\
 	(CUR) += (VAL);						\
 	if ((CUR) >= (STOP))					\
 		(CUR) = ((CUR) - (STOP)) + (START);		\
     } while (0)
 
 #define WORD_IDX(START, X) (((X) - (START)) / sizeof(uint32_t))
 
 #define SETQ_END(SC, VAL) ((SC)->is_adaptec ?			\
 	((VAL) | (MID_DMA_END >> 4)) :				\
 	((VAL) | (MID_DMA_END)))
 
 /*
  * The dtq and drq members are set for each END entry in the corresponding
  * card queue entry. It is used to find out, when a buffer has been
  * finished DMAing and can be freed.
  *
  * We store sc->dtq and sc->drq data in the following format...
  * the 0x80000 ensures we != 0
  */
 #define EN_DQ_MK(SLOT, LEN)	(((SLOT) << 20) | (LEN) | (0x80000))
 #define EN_DQ_SLOT(X)		((X) >> 20)
 #define EN_DQ_LEN(X)		((X) & 0x3ffff)
 
 /*
  * Variables
  */
 static uma_zone_t en_vcc_zone;
 
 /***********************************************************************/
 
 /*
  * en_read{x}: read a word from the card. These are the only functions
  * that read from the card.
  */
 static __inline uint32_t
 en_readx(struct en_softc *sc, uint32_t r)
 {
 	/* same as en_read(), but without the DBG_REG trace */
 #ifdef EN_DIAG
 	if ((r % 4) != 0 || r > MID_MAXOFF)
 		panic("en_read out of range, r=0x%x", r);
 #endif
 	return (bus_space_read_4(sc->en_memt, sc->en_base, r));
 }
 
 static __inline uint32_t
 en_read(struct en_softc *sc, uint32_t r)
 {
 	uint32_t word;
 
 	/* with EN_DIAG, refuse unaligned or out-of-range card offsets */
 #ifdef EN_DIAG
 	if ((r % 4) != 0 || r > MID_MAXOFF)
 		panic("en_read out of range, r=0x%x", r);
 #endif
 	word = bus_space_read_4(sc->en_memt, sc->en_base, r);
 	DBG(sc, REG, ("en_read(%#x) -> %08x", r, word));
 
 	return (word);
 }
 
 /*
  * en_write: write a word to the card. This is the only function that
  * writes to the card.
  */
 static __inline void
 en_write(struct en_softc *sc, uint32_t r, uint32_t v)
 {
 	/* with EN_DIAG, refuse unaligned or out-of-range card offsets */
 #ifdef EN_DIAG
 	if ((r % 4) != 0 || r > MID_MAXOFF)
 		panic("en_write out of range, r=0x%x", r);
 #endif
 
 	/* trace first, then hand the word to the card */
 	DBG(sc, REG, ("en_write(%#x) <- %08x", r, v));
 	bus_space_write_4(sc->en_memt, sc->en_base, r, v);
 }
 
 /*
  * en_k2sz: convert KBytes to a size parameter (a log2)
  */
 static __inline int
 en_k2sz(int k)
 {
 	int lg;
 
 	/* accepted sizes are the powers of two from 1 KB up to 128 KB */
 	for (lg = 0; lg <= 7; lg++) {
 		if (k == (1 << lg))
 			return (lg);
 	}
 
 	/* anything else is a programming error */
 	panic("en_k2sz");
 	return (0);
 }
 #define en_log2(X) en_k2sz(X)
 
 #if 0
 /*
  * en_b2sz: convert a DMA burst code to its byte size
  */
 static __inline int
 en_b2sz(int b)
 {
 	switch (b) {
 	  case MIDDMA_WORD:   return (1*4);
 	  case MIDDMA_2WMAYBE:
 	  case MIDDMA_2WORD:  return (2*4);
 	  case MIDDMA_4WMAYBE:
 	  case MIDDMA_4WORD:  return (4*4);
 	  case MIDDMA_8WMAYBE:
 	  case MIDDMA_8WORD:  return (8*4);
 	  case MIDDMA_16WMAYBE:
 	  case MIDDMA_16WORD: return (16*4);
 	  default:
 		panic("en_b2sz");
 	}
 	return (0);
 }
 #endif
 
 /*
  * en_sz2b: convert a burst size (bytes) to DMA burst code
  */
 static __inline int
 en_sz2b(int sz)
 {
 	/* sz is a burst size in bytes; only 1, 2, 4, 8 or 16 words are valid */
 	if (sz == 1 * 4)
 		return (MIDDMA_WORD);
 	if (sz == 2 * 4)
 		return (MIDDMA_2WORD);
 	if (sz == 4 * 4)
 		return (MIDDMA_4WORD);
 	if (sz == 8 * 4)
 		return (MIDDMA_8WORD);
 	if (sz == 16 * 4)
 		return (MIDDMA_16WORD);
 
 	panic("en_sz2b");
 	return(0);
 }
 
 #ifdef EN_DEBUG
 /*
  * Dump a packet
  */
 static void
 en_dump_packet(struct en_softc *sc, struct mbuf *m)
 {
 	int plen = m->m_pkthdr.len;
 	u_int pos = 0;
 	u_int totlen = 0;
 	int len;
 	u_char *ptr;
 
 	device_printf(sc->dev, "packet len=%d", plen);
 	while (m != NULL) {
 		totlen += m->m_len;
 		ptr = mtod(m, u_char *);
 		for (len = 0; len < m->m_len; len++, pos++, ptr++) {
 			if (pos % 16 == 8)
 				printf(" ");
 			if (pos % 16 == 0)
 				printf("\n");
 			printf(" %02x", *ptr);
 		}
 		m = m->m_next;
 	}
 	printf("\n");
 	if (totlen != plen)
 		printf("sum of m_len=%u\n", totlen);
 }
 #endif
 
 /*********************************************************************/
 /*
  * DMA maps
  */
 
 /*
  * Map constructor for a MAP.
  *
  * This is called each time when a map is allocated
  * from the pool and about to be returned to the user. Here we actually
  * allocate the map if there isn't one. The problem is that we may fail
  * to allocate the DMA map yet have no means to signal this error. Therefore,
  * when allocating a map, the caller must check that there is a map. An
  * additional problem is, that i386 maps will be NULL, yet are ok and must
  * be freed so let's use a flag to signal allocation.
  *
  * Caveat: we have no way to know that we are called from an interrupt context
  * here. We rely on the fact, that bus_dmamap_create uses M_NOWAIT in all
  * its allocations.
  *
  * LOCK: any, not needed
  */
 static int
 en_map_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct en_softc *sc = arg;
 	struct en_map *map = mem;
 	int err;
 
 	err = bus_dmamap_create(sc->txtag, 0, &map->map);
 	if (err != 0) {
 		device_printf(sc->dev, "cannot create DMA map %d\n", err);
 		return (err);
 	}
 	map->flags = ENMAP_ALLOC;
 	map->sc = sc;
 	return (0);
 }
 
 /*
  * Map destructor.
  *
  * Called when a map is disposed into the zone. If the map is loaded, unload
  * it.
  *
  * LOCK: any, not needed
  */
 static void
 en_map_dtor(void *mem, int size, void *arg)
 {
 	struct en_map *map = mem;
 
 	/* nothing to undo unless a buffer is still loaded into the map */
 	if ((map->flags & ENMAP_LOADED) == 0)
 		return;
 
 	bus_dmamap_unload(map->sc->txtag, map->map);
 	map->flags &= ~ENMAP_LOADED;
 }
 
 /*
  * Map finalizer.
  *
  * This is called each time a map is returned from the zone to the system.
  * Get rid of the dmamap here.
  *
  * LOCK: any, not needed
  */
 static void
 en_map_fini(void *mem, int size)
 {
 	struct en_map *map = mem;
 
 	bus_dmamap_destroy(map->sc->txtag, map->map);
 }
 
 /*********************************************************************/
 /*
  * Transmission
  */
 
 /*
  * Argument structure to load a transmit DMA map
  */
 /*
  * Filled in by en_txdma() and handed to en_txdma_load() through the
  * bus_dmamap_load_mbuf() callback argument.
  */
 struct txarg {
 	struct en_softc *sc;	/* device the packet is sent on */
 	struct mbuf *m;		/* packet being loaded */
 	u_int vci;		/* VCI the packet belongs to */
 	u_int chan;		/* transmit channel */
 	u_int datalen;		/* length of user data */
 	u_int flags;		/* TX_* flags describing the packet */
 	u_int wait;		/* return: out of resources */
 };
 
 /*
  * TX DMA map loader helper. This function is the callback when the map
  * is loaded. It should fill the DMA segment descriptors into the hardware.
  *
  * LOCK: locked, needed
  */
 static void
 en_txdma_load(void *uarg, bus_dma_segment_t *segs, int nseg, bus_size_t mapsize,
     int error)
 {
 	/* uarg is the struct txarg set up by en_txdma(); mapsize is unused */
 	struct txarg *tx = uarg;
 	struct en_softc *sc = tx->sc;
 	struct en_txslot *slot = &sc->txslot[tx->chan];
 	uint32_t cur;		/* on-card buffer position (bytes offset) */
 	uint32_t dtq;		/* on-card queue position (byte offset) */
 	uint32_t last_dtq;	/* last DTQ we have written */
 	uint32_t tmp;
 	u_int free;		/* free queue entries on card */
 	u_int needalign, cnt;
 	bus_size_t rest;	/* remaining bytes in current segment */
 	bus_addr_t addr;
 	bus_dma_segment_t *s;
 	uint32_t count, bcode;
 	int i;
 
 	/* the loader reported a failure: nothing to program */
 	if (error != 0)
 		return;
 
 	/*
 	 * Work on local copies of the slot/queue state. They are written
 	 * back only at the very end ("commit"), so the early return inside
 	 * PUT_DTQ_ENTRY leaves the softc state unchanged.
 	 */
 	cur = slot->cur;
 	dtq = sc->dtq_us;
 	free = sc->dtq_free;
 
 	last_dtq = 0;		/* make gcc happy */
 
 	/*
 	 * Local macro to add an entry to the transmit DMA area. If there
 	 * are no entries left, return. Save the byte offset of the entry
 	 * in last_dtq for later use.
 	 */
 #define PUT_DTQ_ENTRY(ENI, BCODE, COUNT, ADDR)				\
 	if (free == 0) {						\
 		EN_COUNT(sc->stats.txdtqout);				\
 		tx->wait = 1;						\
 		return;							\
 	}								\
 	last_dtq = dtq;							\
 	en_write(sc, dtq + 0, (ENI || !sc->is_adaptec) ?		\
 	    MID_MK_TXQ_ENI(COUNT, tx->chan, 0, BCODE) :			\
 	    MID_MK_TXQ_ADP(COUNT, tx->chan, 0, BCODE));			\
 	en_write(sc, dtq + 4, ADDR);					\
 									\
 	EN_WRAPADD(MID_DTQOFF, MID_DTQEND, dtq, 8);			\
 	free--;
 
 	/*
 	 * Local macro to generate a DMA entry to DMA cnt bytes. Updates
 	 * the current buffer byte offset accordingly.
 	 */
 #define DO_DTQ(TYPE) do {						\
 	rest -= cnt;							\
 	EN_WRAPADD(slot->start, slot->stop, cur, cnt);			\
 	DBG(sc, TX, ("tx%d: "TYPE" %u bytes, %ju left, cur %#x",	\
 	    tx->chan, cnt, (uintmax_t)rest, cur));			\
 									\
 	PUT_DTQ_ENTRY(1, bcode, count, addr);				\
 									\
 	addr += cnt;							\
     } while (0)
 
 	if (!(tx->flags & TX_HAS_TBD)) {
 		/*
 		 * Prepend the TBD - it did not fit into the first mbuf
 		 */
 		/* the two TBD words are written directly into card memory */
 		tmp = MID_TBD_MK1((tx->flags & TX_AAL5) ?
 		    MID_TBD_AAL5 : MID_TBD_NOAAL5,
 		    sc->vccs[tx->vci]->txspeed,
 		    tx->m->m_pkthdr.len / MID_ATMDATASZ);
 		en_write(sc, cur, tmp);
 		EN_WRAPADD(slot->start, slot->stop, cur, 4);
 
 		tmp = MID_TBD_MK2(tx->vci, 0, 0);
 		en_write(sc, cur, tmp);
 		EN_WRAPADD(slot->start, slot->stop, cur, 4);
 
 		/* update DMA address */
 		PUT_DTQ_ENTRY(0, MIDDMA_JK, WORD_IDX(slot->start, cur), 0);
 	}
 
 	/* emit the descriptors for every DMA segment of the mbuf chain */
 	for (i = 0, s = segs; i < nseg; i++, s++) {
 		rest = s->ds_len;
 		addr = s->ds_addr;
 
 		if (sc->is_adaptec) {
 			/* adaptec card - simple */
 			/* one entry per segment, count given in bytes */
 
 			/* advance the on-card buffer pointer */
 			EN_WRAPADD(slot->start, slot->stop, cur, rest);
 			DBG(sc, TX, ("tx%d: adp %ju bytes %#jx (cur now 0x%x)",
 			    tx->chan, (uintmax_t)rest, (uintmax_t)addr, cur));
 
 			PUT_DTQ_ENTRY(0, 0, rest, addr);
 
 			continue;
 		}
 
 		/*
 		 * do we need to do a DMA op to align to the maximum
 		 * burst? Note, that we are alway 32-bit aligned.
 		 */
 		if (sc->alburst &&
 		    (needalign = (addr & sc->bestburstmask)) != 0) {
 			/* compute number of bytes, words and code */
 			cnt = sc->bestburstlen - needalign;
 			if (cnt > rest)
 				cnt = rest;
 			count = cnt / sizeof(uint32_t);
 			if (sc->noalbursts) {
 				/* plain word transfers; count stays in words */
 				bcode = MIDDMA_WORD;
 			} else {
 				/* look up burst code by word count */
 				bcode = en_dmaplan[count].bcode;
 				count = cnt >> en_dmaplan[count].divshift;
 			}
 			DO_DTQ("al_dma");
 		}
 
 		/* do we need to do a max-sized burst? */
 		if (rest >= sc->bestburstlen) {
 			/* as many whole best-size bursts as fit into rest */
 			count = rest >> sc->bestburstshift;
 			cnt = count << sc->bestburstshift;
 			bcode = sc->bestburstcode;
 			DO_DTQ("best_dma");
 		}
 
 		/* do we need to do a cleanup burst? */
 		if (rest != 0) {
 			cnt = rest;
 			count = rest / sizeof(uint32_t);
 			if (sc->noalbursts) {
 				bcode = MIDDMA_WORD;
 			} else {
 				bcode = en_dmaplan[count].bcode;
 				count = cnt >> en_dmaplan[count].divshift;
 			}
 			DO_DTQ("clean_dma");
 		}
 	}
 
 	/* en_txdma() sets TX_HAS_PAD before loading the map */
 	KASSERT (tx->flags & TX_HAS_PAD, ("PDU not padded"));
 
 	if ((tx->flags & TX_AAL5) && !(tx->flags & TX_HAS_PDU)) {
 		/*
 		 * Append the AAL5 PDU trailer
 		 */
 		tmp = MID_PDU_MK1(0, 0, tx->datalen);
 		en_write(sc, cur, tmp);
 		EN_WRAPADD(slot->start, slot->stop, cur, 4);
 
 		en_write(sc, cur, 0);
 		EN_WRAPADD(slot->start, slot->stop, cur, 4);
 
 		/* update DMA address */
 		PUT_DTQ_ENTRY(0, MIDDMA_JK, WORD_IDX(slot->start, cur), 0);
 	}
 
 	/* record the end for the interrupt routine */
 	sc->dtq[MID_DTQ_A2REG(last_dtq)] =
 	    EN_DQ_MK(tx->chan, tx->m->m_pkthdr.len);
 
 	/* set the end flag in the last descriptor */
 	en_write(sc, last_dtq + 0, SETQ_END(sc, en_read(sc, last_dtq + 0)));
 
 #undef PUT_DTQ_ENTRY
 #undef DO_DTQ
 
 	/* commit */
 	slot->cur = cur;
 	sc->dtq_free = free;
 	sc->dtq_us = dtq;
 
 	/* tell card */
 	en_write(sc, MID_DMA_WRTX, MID_DTQ_A2REG(sc->dtq_us));
 }
 
 /*
  * en_txdma: start transmit DMA on the given channel, if possible
  *
  * This is called from two places: when we got new packets from the upper
  * layer or when we found that buffer space has freed up during interrupt
  * processing.
  *
  * LOCK: locked, needed
  */
 static void
 en_txdma(struct en_softc *sc, struct en_txslot *slot)
 {
 	struct en_map *map;
 	struct mbuf *lastm;
 	struct txarg tx;
 	u_int pad;
 	int error;
 
 	DBG(sc, TX, ("tx%td: starting ...", slot - sc->txslot));
   again:
 	/* start every packet with a clean argument block (tx.wait = 0) */
 	bzero(&tx, sizeof(tx));
 	tx.chan = slot - sc->txslot;
 	tx.sc = sc;
 
 	/*
 	 * get an mbuf waiting for DMA
 	 */
 	_IF_DEQUEUE(&slot->q, tx.m);
 	if (tx.m == NULL) {
 		DBG(sc, TX, ("tx%td: ...done!", slot - sc->txslot));
 		return;
 	}
 	/* recover the per-packet state that en_start() stored in the header */
 	MBUF_GET_TX(tx.m, tx.vci, tx.flags, tx.datalen, pad, map);
 
 	/*
 	 * note: don't use the entire buffer space.  if WRTX becomes equal
 	 * to RDTX, the transmitter stops assuming the buffer is empty!  --kjc
 	 */
 	if (tx.m->m_pkthdr.len >= slot->bfree) {
 		EN_COUNT(sc->stats.txoutspace);
 		DBG(sc, TX, ("tx%td: out of transmit space", slot - sc->txslot));
 		goto waitres;
 	}
   
 	/*
 	 * If the padding did not fit into the chain, temporarily link the
 	 * shared pad buffer to its end for the duration of the map load.
 	 * lastm remembers where to unlink it again.
 	 */
 	lastm = NULL;
 	if (!(tx.flags & TX_HAS_PAD)) {
 		if (pad != 0) {
 			/* Append the padding buffer */
 			(void)m_length(tx.m, &lastm);
 			lastm->m_next = sc->padbuf;
 			sc->padbuf->m_len = pad;
 		}
 		tx.flags |= TX_HAS_PAD;
 	}
 
 	/*
 	 * Try to load that map
 	 */
 	error = bus_dmamap_load_mbuf(sc->txtag, map->map, tx.m,
 	    en_txdma_load, &tx, BUS_DMA_NOWAIT);
 
 	/* detach the shared pad buffer again */
 	if (lastm != NULL)
 		lastm->m_next = NULL;
 
 	if (error != 0) {
 		device_printf(sc->dev, "loading TX map failed %d\n",
 		    error);
 		goto dequeue_drop;
 	}
 	map->flags |= ENMAP_LOADED;
 	if (tx.wait) {
 		/* probably not enough space */
 		/* en_txdma_load() ran out of DTQ entries; retry later */
 		bus_dmamap_unload(map->sc->txtag, map->map);
 		map->flags &= ~ENMAP_LOADED;
 
 		sc->need_dtqs = 1;
 		DBG(sc, TX, ("tx%td: out of transmit DTQs", slot - sc->txslot));
 		goto waitres;
 	}
 
 	/* the packet is now queued on the card: account for it */
 	EN_COUNT(sc->stats.launch);
 	if_inc_counter(sc->ifp, IFCOUNTER_OPACKETS, 1);
 
 	sc->vccs[tx.vci]->opackets++;
 	sc->vccs[tx.vci]->obytes += tx.datalen;
 
 #ifdef ENABLE_BPF
 	if (bpf_peers_present(sc->ifp->if_bpf)) {
 		/*
 		 * adjust the top of the mbuf to skip the TBD if present
 		 * before passing the packet to bpf.
 		 * Also remove padding and the PDU trailer. Assume both of
 		 * them to be in the same mbuf. pktlen, m_len and m_data
 		 * are not needed anymore so we can change them.
 		 */
 		if (tx.flags & TX_HAS_TBD) {
 			tx.m->m_data += MID_TBD_SIZE;
 			tx.m->m_len -= MID_TBD_SIZE;
 		}
 		tx.m->m_pkthdr.len = m_length(tx.m, &lastm);
 		if (tx.m->m_pkthdr.len > tx.datalen) {
 			lastm->m_len -= tx.m->m_pkthdr.len - tx.datalen;
 			tx.m->m_pkthdr.len = tx.datalen;
 		}
 
 		bpf_mtap(sc->ifp->if_bpf, tx.m);
 	}
 #endif
 
 	/*
 	 * do some housekeeping and get the next packet
 	 */
 	slot->bfree -= tx.m->m_pkthdr.len;
 	_IF_ENQUEUE(&slot->indma, tx.m);
 
 	goto again;
 
 	/*
 	 * error handling. This is jumped to when we just want to drop
 	 * the packet.
 	 *
 	 * NOTE(review): this comment used to say "Must be unlocked here",
 	 * but the function is documented as running with the lock held and
 	 * never releases it - confirm which is intended.
 	 */
   dequeue_drop:
 	if (map != NULL)
 		uma_zfree(sc->map_zone, map);
 
 	slot->mbsize -= tx.m->m_pkthdr.len;
 
 	m_freem(tx.m);
 
 	goto again;
 
 	/* out of resources: put the packet back at the head of the queue */
   waitres:
 	_IF_PREPEND(&slot->q, tx.m);
 }
 
 /*
  * Create a copy of a single mbuf. It can have either internal or
  * external data, it may have a packet header. External data is really
  * copied, so the new buffer is writeable.
  *
  * LOCK: any, not needed
  */
 static struct mbuf *
 copy_mbuf(struct mbuf *m)
 {
 	struct mbuf *new;
 
 	MGET(new, M_WAITOK, MT_DATA);
 
 	if (m->m_flags & M_PKTHDR) {
 		M_MOVE_PKTHDR(new, m);
 		if (m->m_len > MHLEN)
 			MCLGET(new, M_WAITOK);
 	} else {
 		if (m->m_len > MLEN)
 			MCLGET(new, M_WAITOK);
 	}
 
 	bcopy(m->m_data, new->m_data, m->m_len);
 	new->m_len = m->m_len;
 	new->m_flags &= ~M_RDONLY;
 
 	return (new);
 }
 
 /*
  * This function is called when we have an ENI adapter. It fixes the
  * mbuf chain, so that all addresses and lengths are 4 byte aligned.
  * The overall length is already padded to multiple of cells plus the
  * TBD so this must always succeed. The routine can fail, when it
  * needs to copy an mbuf (this may happen if an mbuf is readonly).
  *
  * We assume here, that aligning the virtual addresses to 4 bytes also
  * aligns the physical addresses.
  *
  * LOCK: locked, needed
  */
 static struct mbuf *
 en_fix_mchain(struct en_softc *sc, struct mbuf *m0, u_int *pad)
 {
 	/* prev points at the link that refers to m, so m can be replaced */
 	struct mbuf **prev = &m0;
 	struct mbuf *m = m0;
 	struct mbuf *new;
 	u_char *d;
 	int off;
 
 	while (m != NULL) {
 		/*
 		 * Step 1: make the data start address 4-byte aligned.
 		 */
 		d = mtod(m, u_char *);
 		if ((off = (uintptr_t)d % sizeof(uint32_t)) != 0) {
 			EN_COUNT(sc->stats.mfixaddr);
 			if (M_WRITABLE(m)) {
 				/* slide the data down by off bytes in place */
 				bcopy(d, d - off, m->m_len);
 				m->m_data -= off;
 			} else {
 				/* read-only: substitute a private copy */
 				if ((new = copy_mbuf(m)) == NULL) {
 					EN_COUNT(sc->stats.mfixfail);
 					m_freem(m0);
 					return (NULL);
 				}
 				new->m_next = m_free(m);
 				*prev = m = new;
 			}
 		}
 
 		/*
 		 * Step 2: make the length a multiple of 4 by moving bytes
 		 * over from the following mbufs, or by zero-filling at the
 		 * end of the chain.
 		 */
 		if ((off = m->m_len % sizeof(uint32_t)) != 0) {
 			EN_COUNT(sc->stats.mfixlen);
 			if (!M_WRITABLE(m)) {
 				if ((new = copy_mbuf(m)) == NULL) {
 					EN_COUNT(sc->stats.mfixfail);
 					m_freem(m0);
 					return (NULL);
 				}
 				new->m_next = m_free(m);
 				*prev = m = new;
 			}
 			d = mtod(m, u_char *) + m->m_len;
 			off = 4 - off;	/* number of bytes still missing */
 			while (off) {
 				/* discard mbufs that have been emptied */
 				while (m->m_next && m->m_next->m_len == 0)
 					m->m_next = m_free(m->m_next);
 
 				if (m->m_next == NULL) {
 					/* last mbuf: use up one pad byte */
 					*d++ = 0;
 					KASSERT(*pad > 0, ("no padding space"));
 					(*pad)--;
 				} else {
 					/* steal the first byte of the next mbuf */
 					*d++ = *mtod(m->m_next, u_char *);
 					m->m_next->m_len--;
 					m->m_next->m_data++;
 				}
 				m->m_len++;
 				off--;
 			}
 		}
 
 		prev = &m->m_next;
 		m = m->m_next;
 	}
 
 	/* m0 may differ from the argument if the first mbuf was replaced */
 	return (m0);
 }
 
 /*
  * en_start: start transmitting the next packet that needs to go out
  * if there is one. We take off all packets from the interface's queue and
  * put them into the channels queue.
  *
  * Here we also prepend the transmit packet descriptor and append the padding
  * and (for aal5) the PDU trailer. This is different from the original driver:
  * we assume, that allocating one or two additional mbufs is actually cheaper
  * than all this algorithmic fiddling we would need otherwise.
  *
  * While the packet is on the channels wait queue we use the csum_* fields
  * in the packet header to hold the original datalen, the AAL5 flag and the
  * VCI. The packet length field in the header holds the needed buffer space.
  * This may actually be more than the length of the current mbuf chain (when
  * one or more of TBD, padding and PDU do not fit).
  *
  * LOCK: unlocked, needed
  */
 static void
 en_start(struct ifnet *ifp)
 {
 	struct en_softc *sc = (struct en_softc *)ifp->if_softc;
 	struct mbuf *m, *lastm;
 	struct atm_pseudohdr *ap;
 	u_int pad;		/* 0-bytes to pad at PDU end */
 	u_int datalen;		/* length of user data */
 	u_int vci;		/* the VCI we are transmitting on */
 	u_int flags;
 	uint32_t tbd[2];
 	uint32_t pdu[2];
 	struct en_vcc *vc;
 	struct en_map *map;
 	struct en_txslot *tx;
 
 	/* drain the interface send queue, one packet per iteration */
 	while (1) {
 		IF_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			return;
 
 		flags = 0;
 
 		/* the packet starts with an ATM pseudo header naming the VC */
 	    	ap = mtod(m, struct atm_pseudohdr *);
 		vci = ATM_PH_VCI(ap);
 
 		/* drop packets for VCs that are invalid, unknown or closing */
 		if (ATM_PH_VPI(ap) != 0 || vci >= MID_N_VC ||
 		    (vc = sc->vccs[vci]) == NULL ||
 		    (vc->vflags & VCC_CLOSE_RX)) {
 			DBG(sc, TX, ("output vpi=%u, vci=%u -- drop",
 			    ATM_PH_VPI(ap), vci));
 			m_freem(m);
 			continue;
 		}
 		if (vc->vcc.aal == ATMIO_AAL_5)
 			flags |= TX_AAL5;
 		m_adj(m, sizeof(struct atm_pseudohdr));
 
 		/*
 		 * (re-)calculate size of packet (in bytes)
 		 */
 		m->m_pkthdr.len = datalen = m_length(m, &lastm);
 
 		/*
 		 * computing how much padding we need on the end of the mbuf,
 		 * then see if we can put the TBD at the front of the mbuf
 		 * where the link header goes (well behaved protocols will
 		 * reserve room for us). Last, check if room for PDU tail.
 		 */
 		if (flags & TX_AAL5)
 			m->m_pkthdr.len += MID_PDU_SIZE;
 		m->m_pkthdr.len = roundup(m->m_pkthdr.len, MID_ATMDATASZ);
 		pad = m->m_pkthdr.len - datalen;
 		if (flags & TX_AAL5)
 			pad -= MID_PDU_SIZE;
 		/* from here on m_pkthdr.len is the needed card buffer space */
 		m->m_pkthdr.len += MID_TBD_SIZE;
 
 		DBG(sc, TX, ("txvci%d: buflen=%u datalen=%u lead=%d trail=%d",
 		    vci, m->m_pkthdr.len, datalen, (int)M_LEADINGSPACE(m),
 		    (int)M_TRAILINGSPACE(lastm)));
 
 		/*
 		 * From here on we need access to sc
 		 */
 		EN_LOCK(sc);
 
 		/*
 		 * Allocate a map. We do this here rather then in en_txdma,
 		 * because en_txdma is also called from the interrupt handler
 		 * and we are going to have a locking problem then. We must
 		 * use NOWAIT here, because the ip_output path holds various
 		 * locks.
 		 */
 		map = uma_zalloc_arg(sc->map_zone, sc, M_NOWAIT);
 		if (map == NULL) {
 			/* drop that packet */
 			EN_COUNT(sc->stats.txnomap);
 			EN_UNLOCK(sc);
 			m_freem(m);
 			continue;
 		}
 
 		/* interface is not running: discard the packet */
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 			EN_UNLOCK(sc);
 			uma_zfree(sc->map_zone, map);
 			m_freem(m);
 			continue;
 		}
 
 		/*
 		 * Look, whether we can prepend the TBD (8 byte)
 		 */
 		if (M_WRITABLE(m) && M_LEADINGSPACE(m) >= MID_TBD_SIZE) {
 			tbd[0] = htobe32(MID_TBD_MK1((flags & TX_AAL5) ?
 			    MID_TBD_AAL5 : MID_TBD_NOAAL5,
 			    vc->txspeed, m->m_pkthdr.len / MID_ATMDATASZ));
 			tbd[1] = htobe32(MID_TBD_MK2(vci, 0, 0));
 
 			m->m_data -= MID_TBD_SIZE;
 			bcopy(tbd, m->m_data, MID_TBD_SIZE);
 			m->m_len += MID_TBD_SIZE;
 			flags |= TX_HAS_TBD;
 		}
 
 		/*
 		 * Check whether the padding fits (must be writeable -
 		 * we pad with zero).
 		 */
 		if (M_WRITABLE(lastm) && M_TRAILINGSPACE(lastm) >= pad) {
 			bzero(lastm->m_data + lastm->m_len, pad);
 			lastm->m_len += pad;
 			flags |= TX_HAS_PAD;
 
 			/* the PDU trailer is only tried when the pad fit */
 			if ((flags & TX_AAL5) &&
 			    M_TRAILINGSPACE(lastm) > MID_PDU_SIZE) {
 				pdu[0] = htobe32(MID_PDU_MK1(0, 0, datalen));
 				pdu[1] = 0;
 				bcopy(pdu, lastm->m_data + lastm->m_len,
 				    MID_PDU_SIZE);
 				lastm->m_len += MID_PDU_SIZE;
 				flags |= TX_HAS_PDU;
 			}
 		}
 
 		/*
 		 * Non-adaptec cards get the chain aligned to 4 bytes.
 		 * en_fix_mchain() frees the chain itself on failure.
 		 */
 		if (!sc->is_adaptec &&
 		    (m = en_fix_mchain(sc, m, &pad)) == NULL) {
 			EN_UNLOCK(sc);
 			uma_zfree(sc->map_zone, map);
 			continue;
 		}
 
 		/*
 		 * get assigned channel (will be zero unless txspeed is set)
 		 */
 		tx = vc->txslot;
 
 		if (m->m_pkthdr.len > EN_TXSZ * 1024) {
 			DBG(sc, TX, ("tx%td: packet larger than xmit buffer "
 			    "(%d > %d)\n", tx - sc->txslot, m->m_pkthdr.len,
 			    EN_TXSZ * 1024));
 			EN_UNLOCK(sc);
 			m_freem(m);
 			uma_zfree(sc->map_zone, map);
 			continue;
 		}
 
 		/* too much data already waiting on this channel: drop */
 		if (tx->mbsize > EN_TXHIWAT) {
 			EN_COUNT(sc->stats.txmbovr);
 			DBG(sc, TX, ("tx%td: buffer space shortage",
 			    tx - sc->txslot));
 			EN_UNLOCK(sc);
 			m_freem(m);
 			uma_zfree(sc->map_zone, map);
 			continue;
 		}
 
 		/* commit */
 		tx->mbsize += m->m_pkthdr.len;
 
 		DBG(sc, TX, ("tx%td: VCI=%d, speed=0x%x, buflen=%d, mbsize=%d",
 		    tx - sc->txslot, vci, sc->vccs[vci]->txspeed,
 		    m->m_pkthdr.len, tx->mbsize));
 
 		/* stash the per-packet state in the header for en_txdma() */
 		MBUF_SET_TX(m, vci, flags, datalen, pad, map);
 
 		_IF_ENQUEUE(&tx->q, m);
 
 		en_txdma(sc, tx);
 
 		EN_UNLOCK(sc);
 	}
 }
 
 /*********************************************************************/
 /*
  * VCs
  */
 
 /*
  * en_loadvc: load a vc tab entry from a slot
  *
  * LOCK: locked, needed
  */
 static void
 en_loadvc(struct en_softc *sc, struct en_vcc *vc)
 {
 	uint32_t reg = en_read(sc, MID_VC(vc->vcc.vci));
 
 	reg = MIDV_SETMODE(reg, MIDV_TRASH);
 	en_write(sc, MID_VC(vc->vcc.vci), reg);
 	DELAY(27);
 
 	/* no need to set CRC */
 
 	/* read pointer = 0, desc. start = 0 */
 	en_write(sc, MID_DST_RP(vc->vcc.vci), 0);
 	/* write pointer = 0 */
 	en_write(sc, MID_WP_ST_CNT(vc->vcc.vci), 0);
 	/* set mode, size, loc */
 	en_write(sc, MID_VC(vc->vcc.vci), vc->rxslot->mode);
 
 	vc->rxslot->cur = vc->rxslot->start;
 
 	DBG(sc, VC, ("rx%td: assigned to VCI %d", vc->rxslot - sc->rxslot,
 	    vc->vcc.vci));
 }
 
 /*
  * Open the given vcc.
  *
  * LOCK: unlocked, needed
  */
 static int
 en_open_vcc(struct en_softc *sc, struct atmio_openvcc *op)
 {
 	struct en_rxslot *rx, *rxend;
 	struct en_vcc *vc;
 	uint32_t prevmode, aalmode;
 	int error;
 
 	DBG(sc, IOCTL, ("enable vpi=%d, vci=%d, flags=%#x",
 	    op->param.vpi, op->param.vci, op->param.flags));
 
 	/* only VPI 0 and VCIs below MID_N_VC are accepted */
 	if (op->param.vpi != 0 || op->param.vci >= MID_N_VC)
 		return (EINVAL);
 
 	/* get the (zeroed) VCC state before taking the lock */
 	vc = uma_zalloc(en_vcc_zone, M_NOWAIT | M_ZERO);
 	if (vc == NULL)
 		return (ENOMEM);
 
 	EN_LOCK(sc);
 
 	/* the VCI must not be open already */
 	if (sc->vccs[op->param.vci] != NULL) {
 		error = EBUSY;
 		goto fail;
 	}
 
 	/* find a free receive slot */
 	rxend = &sc->rxslot[sc->en_nrx];
 	for (rx = sc->rxslot; rx < rxend; rx++) {
 		if (rx->vcc == NULL)
 			break;
 	}
 	if (rx == rxend) {
 		error = ENOSPC;
 		goto fail;
 	}
 
 	/* bind VCC and receive slot to each other */
 	vc->rxslot = rx;
 	vc->rxhand = op->rxhand;
 	vc->vcc = op->param;
 
 	prevmode = rx->mode;
 	if (op->param.aal == ATMIO_AAL_5)
 		aalmode = MIDV_AAL5;
 	else
 		aalmode = MIDV_NOAAL;
 	rx->mode = MIDV_SETMODE(prevmode, aalmode);
 	rx->vcc = vc;
 
 	KASSERT (_IF_QLEN(&rx->indma) == 0 && _IF_QLEN(&rx->q) == 0,
 	    ("en_rxctl: left over mbufs on enable slot=%td",
 	    vc->rxslot - sc->rxslot));
 
 	/* transmit side: start on the first channel with speed 0 */
 	vc->txspeed = 0;
 	vc->txslot = sc->txslot;
 	vc->txslot->nref++;	/* bump reference count */
 
 	en_loadvc(sc, vc);	/* does debug printf for us */
 
 	/* publish the VCC; it is now owned by the table */
 	sc->vccs[vc->vcc.vci] = vc;
 	sc->vccs_open++;
 
 	EN_UNLOCK(sc);
 	return (0);
 
   fail:
 	/* nothing was published: give the VCC back */
 	uma_zfree(en_vcc_zone, vc);
 
 	EN_UNLOCK(sc);
 	return (error);
 }
 
 /*
  * Close finished
  */
 static void
 en_close_finish(struct en_softc *sc, struct en_vcc *vc)
 {
 
 	if (vc->rxslot != NULL)
 		vc->rxslot->vcc = NULL;
 
 	DBG(sc, VC, ("vci: %u free (%p)", vc->vcc.vci, vc));
 
 	sc->vccs[vc->vcc.vci] = NULL;
 	uma_zfree(en_vcc_zone, vc);
 	sc->vccs_open--;
 }
 
 /*
  * Close the VCC named by cl.
  *
  * The VC is switched to trash mode on the card.  If nothing is pending on
  * its receive slot the close completes immediately; otherwise the vcc is
  * marked VCC_DRAIN.  For ATMIO_FLAG_ASYNC connections the function then
  * returns and the drain is finished from the receive DMA path; for all
  * others it sleeps on cv_close until the drain is reported complete or
  * the interface stops running.
  *
  * Returns 0 on success, EINVAL for a bad VPI/VCI or a vcc that is already
  * draining, ENOTCONN if the VCI is not open (or has no receive slot) and
  * EIO if the interface stopped running while we were waiting.
  *
  * LOCK: unlocked, needed
  */
 static int
 en_close_vcc(struct en_softc *sc, struct atmio_closevcc *cl)
 {
 	uint32_t oldmode, newmode;
 	struct en_vcc *vc;
 	int error = 0;
 
 	DBG(sc, IOCTL, ("disable vpi=%d, vci=%d", cl->vpi, cl->vci));
 
 	if (cl->vpi != 0 || cl->vci >= MID_N_VC)
 		return (EINVAL);
 
 	EN_LOCK(sc);
 	if ((vc = sc->vccs[cl->vci]) == NULL) {
 		error = ENOTCONN;
 		goto done;
 	}
 
 	/*
 	 * turn off VCI
 	 */
 	if (vc->rxslot == NULL) {
 		error = ENOTCONN;
 		goto done;
 	}
 	if (vc->vflags & VCC_DRAIN) {
 		/* a close is already in progress for this VC */
 		error = EINVAL;
 		goto done;
 	}
 
 	/* halt in tracks, be careful to preserve inservice bit */
 	oldmode = en_read(sc, MID_VC(cl->vci));
 	newmode = MIDV_SETMODE(oldmode, MIDV_TRASH) & ~MIDV_INSERVICE;
 	en_write(sc, MID_VC(cl->vci), (newmode | (oldmode & MIDV_INSERVICE)));
 
 	DELAY(27);
 	vc->rxslot->mode = newmode;
 
 	vc->txslot->nref--;
 
 	/* if stuff is still going on we are going to have to drain it out */
 	if (_IF_QLEN(&vc->rxslot->indma) == 0 &&
 	    _IF_QLEN(&vc->rxslot->q) == 0 &&
 	    (vc->vflags & VCC_SWSL) == 0) {
 		en_close_finish(sc, vc);
 		goto done;
 	}
 
 	vc->vflags |= VCC_DRAIN;
 	DBG(sc, IOCTL, ("VCI %u now draining", cl->vci));
 
 	/* asynchronous close: en_rx_drain() finishes the job later */
 	if (vc->vcc.flags & ATMIO_FLAG_ASYNC)
 		goto done;
 
 	/*
 	 * Synchronous close: sleep until the drain is reported.  The wakeup
 	 * condition must be VCC_CLOSE_RX, not VCC_DRAIN: en_rx_drain() clears
 	 * only VCC_CLOSE_RX before signalling cv_close and leaves VCC_DRAIN
 	 * set, so waiting on VCC_DRAIN would only ever end through a reset.
 	 * en_reset_ul() zeroes vflags and clears IFF_DRV_RUNNING, so a reset
 	 * still terminates the loop.
 	 */
 	vc->vflags |= VCC_CLOSE_RX;
 	while ((sc->ifp->if_drv_flags & IFF_DRV_RUNNING) &&
 	    (vc->vflags & VCC_CLOSE_RX))
 		cv_wait(&sc->cv_close, &sc->en_mtx);
 
 	en_close_finish(sc, vc);
 	if (!(sc->ifp->if_drv_flags & IFF_DRV_RUNNING))
 		error = EIO;
 
   done:
 	EN_UNLOCK(sc);
 	return (error);
 }
 
 /*********************************************************************/
 /*
  * starting/stopping the card
  */
 
 /*
  * en_reset_ul: reset the board, throw away work in progress.
  * must en_init to recover.
  *
  * LOCK: locked, needed
  */
 static void
 en_reset_ul(struct en_softc *sc)
 {
 	struct en_map *map;
 	struct mbuf *m;
 	struct en_rxslot *rx;
 	int lcv;
 
 	device_printf(sc->dev, "reset\n");
 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	if (sc->en_busreset)
 		sc->en_busreset(sc);
 	en_write(sc, MID_RESID, 0x0);	/* reset hardware */
 
 	/*
 	 * recv: dump any mbufs we are dma'ing into, if DRAINing, then a reset
 	 * will free us! Don't release the rxslot from the channel.
 	 */
 	for (lcv = 0 ; lcv < MID_N_VC ; lcv++) {
 		if (sc->vccs[lcv] == NULL)
 			continue;
 		rx = sc->vccs[lcv]->rxslot;
 
 		for (;;) {
 			_IF_DEQUEUE(&rx->indma, m);
 			if (m == NULL)
 				break;
 			map = (void *)m->m_pkthdr.rcvif;
 			uma_zfree(sc->map_zone, map);
 			m_freem(m);
 		}
 		for (;;) {
 			_IF_DEQUEUE(&rx->q, m);
 			if (m == NULL)
 				break;
 			m_freem(m);
 		}
 		sc->vccs[lcv]->vflags = 0;
 	}
 
 	/*
 	 * xmit: dump everything
 	 */
 	for (lcv = 0 ; lcv < EN_NTX ; lcv++) {
 		for (;;) {
 			_IF_DEQUEUE(&sc->txslot[lcv].indma, m);
 			if (m == NULL)
 				break;
 			map = (void *)m->m_pkthdr.rcvif;
 			uma_zfree(sc->map_zone, map);
 			m_freem(m);
 		}
 		for (;;) {
 			_IF_DEQUEUE(&sc->txslot[lcv].q, m);
 			if (m == NULL)
 				break;
 			map = (void *)m->m_pkthdr.rcvif;
 			uma_zfree(sc->map_zone, map);
 			m_freem(m);
 		}
 		sc->txslot[lcv].mbsize = 0;
 	}
 
 	/*
 	 * Unstop all waiters
 	 */
 	cv_broadcast(&sc->cv_close);
 }
 
 /*
  * en_reset: reset the board, throw away work in progress.
  * must en_init to recover.
  *
  * This is only a locking wrapper: it takes the softc lock, calls
  * en_reset_ul and releases the lock again.
  *
  * LOCK: unlocked, needed
  *
  * Use en_reset_ul if you already have the lock
  */
 void
 en_reset(struct en_softc *sc)
 {
 	EN_LOCK(sc);
 	en_reset_ul(sc);
 	EN_UNLOCK(sc);
 }
 
 
 /*
  * en_init: init board and sync the card with the data in the softc.
  *
  * If the interface is not marked IFF_UP the board is merely reset.
  * Otherwise the interface is flagged running, the hardware is reset, the
  * on-board memory is cleared, the DMA queues, the service list and the TX
  * slots are re-initialised, every open VC is reloaded and finally
  * interrupts, DMA, TX and RX are enabled.
  *
  * LOCK: locked, needed
  */
 static void
 en_init(struct en_softc *sc)
 {
 	int vc, slot;
 	uint32_t loc;
 
 	if ((sc->ifp->if_flags & IFF_UP) == 0) {
 		DBG(sc, INIT, ("going down"));
 		/*
 		 * We are entered with the softc lock held, so the already
 		 * locked reset variant must be used; en_reset() would try
 		 * to acquire the same lock again.
 		 */
 		en_reset_ul(sc);			/* to be safe */
 		return;
 	}
 
 	DBG(sc, INIT, ("going up"));
 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;	/* enable */
 
 	if (sc->en_busreset)
 		sc->en_busreset(sc);
 	en_write(sc, MID_RESID, 0x0);		/* reset */
 
 	/* zero memory */
 	bus_space_set_region_4(sc->en_memt, sc->en_base,
 	    MID_RAMOFF, 0, sc->en_obmemsz / 4);
 
 	/*
 	 * init obmem data structures: vc tab, dma q's, slist.
 	 *
 	 * note that we set drq_free/dtq_free to one less than the total number
 	 * of DTQ/DRQs present.   we do this because the card uses the condition
 	 * (drq_chip == drq_us) to mean "list is empty"... but if you allow the
 	 * circular list to be completely full then (drq_chip == drq_us) [i.e.
 	 * the drq_us pointer will wrap all the way around].   by restricting
 	 * the number of active requests to (N - 1) we prevent the list from
 	 * becoming completely full.    note that the card will sometimes give
 	 * us an interrupt for a DTQ/DRQ we have already processed... this helps
 	 * keep that interrupt from messing us up.
 	 */
 	bzero(&sc->drq, sizeof(sc->drq));
 	sc->drq_free = MID_DRQ_N - 1;
 	sc->drq_chip = MID_DRQ_REG2A(en_read(sc, MID_DMA_RDRX));
 	en_write(sc, MID_DMA_WRRX, MID_DRQ_A2REG(sc->drq_chip));
 	sc->drq_us = sc->drq_chip;
 
 	/*
 	 * Same for the transmit queue.  The write pointer has to be derived
 	 * with the DTQ conversion macro (the inverse of MID_DTQ_REG2A used
 	 * on the line above), not with the DRQ one.
 	 */
 	bzero(&sc->dtq, sizeof(sc->dtq));
 	sc->dtq_free = MID_DTQ_N - 1;
 	sc->dtq_chip = MID_DTQ_REG2A(en_read(sc, MID_DMA_RDTX));
 	en_write(sc, MID_DMA_WRTX, MID_DTQ_A2REG(sc->dtq_chip));
 	sc->dtq_us = sc->dtq_chip;
 
 	/* hardware service list position; software service list is empty */
 	sc->hwslistp = MID_SL_REG2A(en_read(sc, MID_SERV_WRITE));
 	sc->swsl_size = sc->swsl_head = sc->swsl_tail = 0;
 
 	DBG(sc, INIT, ("drq free/chip: %d/0x%x, dtq free/chip: %d/0x%x, "
 	    "hwslist: 0x%x", sc->drq_free, sc->drq_chip, sc->dtq_free,
 	    sc->dtq_chip, sc->hwslistp));
 
 	/* rewind every TX slot and tell the card where its buffer lives */
 	for (slot = 0 ; slot < EN_NTX ; slot++) {
 		sc->txslot[slot].bfree = EN_TXSZ * 1024;
 		en_write(sc, MIDX_READPTR(slot), 0);
 		en_write(sc, MIDX_DESCSTART(slot), 0);
 		loc = sc->txslot[slot].cur = sc->txslot[slot].start;
 		loc = loc - MID_RAMOFF;
 		/* mask, cvt to words */
 		loc = (loc & ~((EN_TXSZ * 1024) - 1)) >> 2;
 		/* top 11 bits */
 		loc = loc >> MIDV_LOCTOPSHFT;
 		en_write(sc, MIDX_PLACE(slot), MIDX_MKPLACE(en_k2sz(EN_TXSZ),
 		    loc));
 		DBG(sc, INIT, ("tx%d: place 0x%x", slot,
 		    (u_int)en_read(sc, MIDX_PLACE(slot))));
 	}
 
 	/* reload every VC that is currently open */
 	for (vc = 0; vc < MID_N_VC; vc++)
 		if (sc->vccs[vc] != NULL)
 			en_loadvc(sc, sc->vccs[vc]);
 
 	/*
 	 * enable!
 	 */
 	en_write(sc, MID_INTENA, MID_INT_TX | MID_INT_DMA_OVR | MID_INT_IDENT |
 	    MID_INT_LERR | MID_INT_DMA_ERR | MID_INT_DMA_RX | MID_INT_DMA_TX |
 	    MID_INT_SERVICE | MID_INT_SUNI | MID_INT_STATS);
 	en_write(sc, MID_MAST_CSR, MID_SETIPL(sc->ipl) | MID_MCSR_ENDMA |
 	    MID_MCSR_ENTX | MID_MCSR_ENRX);
 }
 
 /*********************************************************************/
 /*
  * Ioctls
  */
 /*
  * en_ioctl: interface ioctl entry point.
  *
  * Handles address and flag changes (bringing the board up or down as
  * needed), MTU changes, media requests, opening and closing of VCCs and
  * the two VCC table queries.  Any other command yields EINVAL.
  *
  * NOTE: an ioctl that sets txspeed would also have to pick a new TX
  * channel/slot: take the one with the lowest sc->txslot[slot].nref,
  * decrement sc->txslot[0].nref, increment sc->txslot[slot].nref, set
  * sc->txvc2slot[vci] = slot, and then set txspeed[vci].
  *
  * LOCK: unlocked, needed
  */
 static int
 en_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct en_softc *sc = (struct en_softc *)ifp->if_softc;
 #if defined(INET) || defined(INET6)
 	struct ifaddr *ifa = (struct ifaddr *)data;
 #endif
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct atmio_vcctable *vtab;
 	int running;
 	int error = 0;
 
 	switch (cmd) {
 
 	case SIOCSIFADDR:
 		/* assigning an address implies bringing the interface up */
 		EN_LOCK(sc);
 		ifp->if_flags |= IFF_UP;
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 			en_reset_ul(sc);
 			en_init(sc);
 		}
 #if defined(INET) || defined(INET6)
 		if (ifa->ifa_addr->sa_family == AF_INET ||
 		    ifa->ifa_addr->sa_family == AF_INET6)
 			ifa->ifa_rtrequest = atm_rtrequest; /* ??? */
 #endif /* INET */
 		EN_UNLOCK(sc);
 		break;
 
 	case SIOCSIFFLAGS:
 		/* start or stop the board so that it matches IFF_UP */
 		EN_LOCK(sc);
 		running = (ifp->if_drv_flags & IFF_DRV_RUNNING) != 0;
 		if ((ifp->if_flags & IFF_UP) != 0) {
 			if (!running)
 				en_init(sc);
 		} else if (running)
 			en_reset_ul(sc);
 		EN_UNLOCK(sc);
 		break;
 
 	case SIOCSIFMTU:
 		/* new MTU must not exceed ATMMTU */
 		if (ifr->ifr_mtu > ATMMTU)
 			error = EINVAL;
 		else
 			ifp->if_mtu = ifr->ifr_mtu;
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd);
 		break;
 
 	case SIOCATMOPENVCC:		/* kernel internal use */
 		error = en_open_vcc(sc, (struct atmio_openvcc *)data);
 		break;
 
 	case SIOCATMCLOSEVCC:		/* kernel internal use */
 		error = en_close_vcc(sc, (struct atmio_closevcc *)data);
 		break;
 
 	case SIOCATMGETVCCS:	/* internal netgraph use */
 		/* the table pointer itself is handed back through data */
 		vtab = atm_getvccs((struct atmio_vcc **)sc->vccs,
 		    MID_N_VC, sc->vccs_open, &sc->en_mtx, 0);
 		if (vtab == NULL)
 			error = ENOMEM;
 		else
 			*(void **)data = vtab;
 		break;
 
 	case SIOCATMGVCCS:	/* return vcc table */
 		/* copy the table out to ifr_data and release our copy */
 		vtab = atm_getvccs((struct atmio_vcc **)sc->vccs,
 		    MID_N_VC, sc->vccs_open, &sc->en_mtx, 1);
 		error = copyout(vtab, ifr->ifr_data, sizeof(*vtab) +
 		    vtab->count * sizeof(vtab->vccs[0]));
 		free(vtab, M_DEVBUF);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*********************************************************************/
 /*
  * Sysctl's
  */
 
 /*
  * Sysctl handler for internal statistics
  *
  * LOCK: unlocked, needed
  */
 static int
 en_sysctl_istats(SYSCTL_HANDLER_ARGS)
 {
 	struct en_softc *sc = arg1;
 	uint32_t *ret;
 	int error;
 
 	ret = malloc(sizeof(sc->stats), M_TEMP, M_WAITOK);
 
 	EN_LOCK(sc);
 	bcopy(&sc->stats, ret, sizeof(sc->stats));
 	EN_UNLOCK(sc);
 
 	error = SYSCTL_OUT(req, ret, sizeof(sc->stats));
 	free(ret, M_TEMP);
 
 	return (error);
 }
 
 /*********************************************************************/
 /*
  * Interrupts
  */
 
 /*
  * Transmit interrupt handler
  *
  * For every TX channel flagged in reg, a transmit has completed and
  * buffer space on the card has come free.  Recompute the channel's free
  * byte count from the card's read pointer and mark the channel so the
  * caller can restart it in case it stalled waiting for space.
  *
  * Returns the bitmask of channels to kick.
  *
  * LOCK: locked, needed
  */
 static uint32_t
 en_intr_tx(struct en_softc *sc, uint32_t reg)
 {
 	uint32_t pending = 0;	/* channels that reported completion */
 	uint32_t bit;
 	uint32_t rdpos;
 	int ch;
 
 	for (ch = 0, bit = 1; ch < EN_NTX; ch++, bit <<= 1) {
 		if ((reg & MID_TXCHAN(ch)) == 0)
 			continue;
 
 		pending |= bit;
 
 		/* card's read pointer (in words) as a byte offset */
 		rdpos = en_read(sc, MIDX_READPTR(ch));
 		rdpos = (rdpos * sizeof(uint32_t)) + sc->txslot[ch].start;
 
 		/* free space runs from our position up to the read pointer */
 		if (rdpos <= sc->txslot[ch].cur)
 			sc->txslot[ch].bfree = (rdpos + (EN_TXSZ * 1024)) -
 			    sc->txslot[ch].cur;
 		else
 			sc->txslot[ch].bfree = rdpos - sc->txslot[ch].cur;
 
 		DBG(sc, INTR, ("tx%d: transmit done. %d bytes now free in "
 		    "buffer", ch, sc->txslot[ch].bfree));
 	}
 	return (pending);
 }
 
 /*
  * TX DMA interrupt
  *
  * check for TX DMA complete, if detected then this means
  * that some DTQs are now free.   it also means some indma
  * mbufs can be freed. if we needed DTQs, kick all channels.
  *
  * LOCK: locked, needed
  */
 static uint32_t
 en_intr_tx_dma(struct en_softc *sc)
 {
 	uint32_t kick = 0;
 	uint32_t val;
 	uint32_t idx;
 	uint32_t slot;
 	uint32_t dtq;
 	struct en_map *map;
 	struct mbuf *m;
 
 	val = en_read(sc, MID_DMA_RDTX); 	/* chip's current location */
 	idx = MID_DTQ_A2REG(sc->dtq_chip);	/* where we last saw chip */
 
 	if (sc->need_dtqs) {
 		kick = MID_NTX_CH - 1;	/* assume power of 2, kick all! */
 		sc->need_dtqs = 0;	/* recalculated in "kick" loop below */
 		DBG(sc, INTR, ("cleared need DTQ condition"));
 	}
 
 	while (idx != val) {
 		sc->dtq_free++;
 		if ((dtq = sc->dtq[idx]) != 0) {
 			/* don't forget to zero it out when done */
 			sc->dtq[idx] = 0;
 			slot = EN_DQ_SLOT(dtq);
 
 			_IF_DEQUEUE(&sc->txslot[slot].indma, m);
 			if (m == NULL)
 				panic("enintr: dtqsync");
 			map = (void *)m->m_pkthdr.rcvif;
 			uma_zfree(sc->map_zone, map);
 			m_freem(m);
 
 			sc->txslot[slot].mbsize -= EN_DQ_LEN(dtq);
 			DBG(sc, INTR, ("tx%d: free %d dma bytes, mbsize now "
 			    "%d", slot, EN_DQ_LEN(dtq), 
 			    sc->txslot[slot].mbsize));
 		}
 		EN_WRAPADD(0, MID_DTQ_N, idx, 1);
 	}
 	sc->dtq_chip = MID_DTQ_REG2A(val);	/* sync softc */
 
 	return (kick);
 }
 
 /*
  * Service interrupt
  *
  * Drains the hardware service list: each VCI found there is taken off
  * the list and, unless it is already queued, appended to the software
  * service list.  VCIs that are not open or are flagged ATMIO_FLAG_NORX
  * have their receiver switched to trash mode instead.
  *
  * Returns non-zero if a VC was added to the software service list.
  *
  * LOCK: locked, needed
  */
 static int
 en_intr_service(struct en_softc *sc)
 {
 	struct en_vcc *vc;
 	uint32_t hwend;
 	uint32_t vci;
 	int queued = 0;
 
 	hwend = MID_SL_REG2A(en_read(sc, MID_SERV_WRITE));
 
 	while (sc->hwslistp != hwend) {
 		/* fetch the next VCI and step past it */
 		vci = en_read(sc, sc->hwslistp);
 		EN_WRAPADD(MID_SLOFF, MID_SLEND, sc->hwslistp, 4);
 
 		vc = sc->vccs[vci];
 		if (vc == NULL || (vc->vcc.flags & ATMIO_FLAG_NORX) != 0) {
 			DBG(sc, INTR, ("unexpected rx interrupt VCI %d", vci));
 			en_write(sc, MID_VC(vci), MIDV_TRASH);  /* rx off */
 			continue;
 		}
 
 		/* rewriting the mode word removes the VC from the hwsl */
 		en_write(sc, MID_VC(vci), vc->rxslot->mode);
 		EN_COUNT(sc->stats.hwpull);
 
 		DBG(sc, INTR, ("pulled VCI %d off hwslist", vci));
 
 		/* nothing more to do if it already sits on the swsl */
 		if ((vc->vflags & VCC_SWSL) != 0)
 			continue;
 
 		EN_COUNT(sc->stats.swadd);
 		queued = 1;
 		vc->vflags |= VCC_SWSL;
 		sc->swslist[sc->swsl_tail] = vci;
 		EN_WRAPADD(0, MID_SL_N, sc->swsl_tail, 1);
 		sc->swsl_size++;
 		DBG(sc, INTR, ("added VCI %d to swslist", vci));
 	}
 	return (queued);
 }
 
 /*
  * Handle one completed receive DMA, described by the record drq.
  *
  * A record with a non-zero length completes the oldest mbuf on the
  * slot's indma queue; a zero length means a "JK" skip that carried no
  * data.  The mbuf is dropped if the slot has no vcc or the vcc is
  * draining; in the latter case the close is completed (or its waiter
  * signalled) once nothing is pending any more.  Otherwise the packet is
  * passed to atm_input with the softc lock temporarily released.
  */
 static void
 en_rx_drain(struct en_softc *sc, u_int drq)
 {
 	struct en_rxslot *slot = &sc->rxslot[EN_DQ_SLOT(drq)];
 	struct atm_pseudohdr ah;
 	struct en_vcc *vc;
 	struct mbuf *m = NULL;	/* stays NULL for a "JK" trash DMA */
 
 	if (EN_DQ_LEN(drq) != 0) {
 		_IF_DEQUEUE(&slot->indma, m);
 		KASSERT(m != NULL, ("drqsync: %s: lost mbuf in slot %td!",
 		    sc->ifp->if_xname, slot - sc->rxslot));
 		/* the DMA map was stashed in rcvif */
 		uma_zfree(sc->map_zone, (struct en_map *)m->m_pkthdr.rcvif);
 	}
 
 	vc = slot->vcc;
 	if (vc == NULL) {
 		/* ups - slot no longer belongs to anybody */
 		if (m != NULL)
 			m_freem(m);
 		return;
 	}
 
 	if ((vc->vflags & VCC_DRAIN) != 0) {
 		/* closing: discard the data and check whether we are done */
 		if (m != NULL)
 			m_freem(m);
 
 		if (_IF_QLEN(&slot->indma) != 0 || _IF_QLEN(&slot->q) != 0)
 			return;
 		if ((en_read(sc, MID_VC(vc->vcc.vci)) & MIDV_INSERVICE) != 0)
 			return;
 		if ((vc->vflags & VCC_SWSL) != 0)
 			return;
 
 		vc->vflags &= ~VCC_CLOSE_RX;
 		if (vc->vcc.flags & ATMIO_FLAG_ASYNC)
 			en_close_finish(sc, vc);
 		else
 			cv_signal(&sc->cv_close);
 		return;
 	}
 
 	if (m == NULL)
 		return;
 
 	/* build the pseudo header and hand the packet upstairs */
 	ATM_PH_FLAGS(&ah) = vc->vcc.flags;
 	ATM_PH_VPI(&ah) = 0;
 	ATM_PH_SETVCI(&ah, vc->vcc.vci);
 
 	DBG(sc, INTR, ("rx%td: rxvci%d: atm_input, mbuf %p, len %d, "
 	    "hand %p", slot - sc->rxslot, vc->vcc.vci, m,
 	    EN_DQ_LEN(drq), vc->rxhand));
 
 	m->m_pkthdr.rcvif = sc->ifp;
 	if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1);
 
 	vc->ipackets++;
 	vc->ibytes += m->m_pkthdr.len;
 
 #ifdef EN_DEBUG
 	if (sc->debug & DBG_IPACKETS)
 		en_dump_packet(sc, m);
 #endif
 #ifdef ENABLE_BPF
 	BPF_MTAP(sc->ifp, m);
 #endif
 	/* the input path is entered without the softc lock */
 	EN_UNLOCK(sc);
 	atm_input(sc->ifp, &ah, m, vc->rxhand);
 	EN_LOCK(sc);
 }
 
 /*
  * RX DMA interrupt: look for completed receive DMAs and pass the data
  * "upstairs".
  *
  * Every DRQ entry between the position where the chip was last seen and
  * its current position is given back; entries with a non-zero record in
  * sc->drq[] are handed to en_rx_drain.
  *
  * Returns 1 if a DRQ shortage had been flagged (and is now cleared),
  * 0 otherwise.
  *
  * LOCK: locked, needed
  */
 static int
 en_intr_rx_dma(struct en_softc *sc)
 {
 	uint32_t chip_now;
 	uint32_t pos;
 	uint32_t rec;
 	int had_shortage;
 
 	chip_now = en_read(sc, MID_DMA_RDRX); 	/* chip's current location */
 	pos = MID_DRQ_A2REG(sc->drq_chip);	/* where we last saw chip */
 
 	for (; pos != chip_now; ) {
 		sc->drq_free++;
 		rec = sc->drq[pos];
 		if (rec != 0) {
 			/* clear the record before it is processed */
 			sc->drq[pos] = 0;
 			en_rx_drain(sc, rec);
 		}
 		EN_WRAPADD(0, MID_DRQ_N, pos, 1);
 	}
 	sc->drq_chip = MID_DRQ_REG2A(chip_now);	/* sync softc */
 
 	/* report (and reset) a pending DRQ shortage */
 	had_shortage = (sc->need_drqs != 0);
 	if (had_shortage) {
 		sc->need_drqs = 0;
 		DBG(sc, INTR, ("cleared need DRQ condition"));
 	}
 	return (had_shortage);
 }
 
 /*
  * en_mget: get an mbuf chain that can hold totlen bytes and return it
  * (for recv). For the actual allocation totlen is rounded up to a multiple
  * of 4. We also ensure, that each mbuf has a multiple of 4 bytes.
  *
  * After this call the sum of all the m_len's in the chain will be totlen.
  * This is called at interrupt time, so we can't wait here.
  *
  * The packet header length (m_pkthdr.len) is set to the unrounded pktlen
  * and m_pkthdr.rcvif is cleared.  Returns NULL if any allocation fails.
  *
  * LOCK: any, not needed
  */
 static struct mbuf *
 en_mget(struct en_softc *sc, u_int pktlen)
 {
 	struct mbuf *m, *tmp;
 	u_int totlen, pad;
 
 	/* length to allocate: pktlen rounded up to whole 32-bit words */
 	totlen = roundup(pktlen, sizeof(uint32_t));
 	/* pad is computed here but not referenced again in this function */
 	pad = totlen - pktlen;
 
 	/*
 	 * First get an mbuf with header. Keep space for a couple of
 	 * words at the begin.
 	 */
 	/* called from interrupt context */
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	m->m_pkthdr.rcvif = NULL;
 	m->m_pkthdr.len = pktlen;
 	/* the first mbuf carries at most EN_RX1BUF bytes */
 	m->m_len = EN_RX1BUF;
-	MH_ALIGN(m, EN_RX1BUF);
+	M_ALIGN(m, EN_RX1BUF);
 	if (m->m_len >= totlen) {
 		/* everything fits into the header mbuf */
 		m->m_len = totlen;
 
 	} else {
 		/* remainder that has to go into additional mbufs */
 		totlen -= m->m_len;
 
 		/* called from interrupt context */
 		tmp = m_getm(m, totlen, M_NOWAIT, MT_DATA);
 		if (tmp == NULL) {
 			m_free(m);
 			return (NULL);
 		}
 		/* distribute the remaining length over the appended mbufs */
 		tmp = m->m_next;
 		/* m_getm could do this for us */
 		while (tmp != NULL) {
 			tmp->m_len = min(MCLBYTES, totlen);
 			totlen -= tmp->m_len;
 			tmp = tmp->m_next;
 		}
 	}
 
 	return (m);
 }
 
 /*
  * Argument for RX DMAMAP loader.
  *
  * Filled in by en_service and passed to en_rxdma_load through
  * bus_dmamap_load_mbuf; the loader reports a DRQ shortage back in 'wait'.
  */
 struct rxarg {
 	struct en_softc *sc;	/* softc of the card being serviced */
 	struct mbuf *m;		/* chain being loaded (its pkthdr.len is recorded) */
 	u_int pre_skip;		/* number of bytes to skip at begin */
 	u_int post_skip;	/* number of bytes to skip at end */
 	struct en_vcc *vc;	/* vc we are receiving on */
 	int wait;		/* wait for DRQ entries */
 };
 
 /*
  * DMA map load callback for the receive path (passed to
  * bus_dmamap_load_mbuf by en_service): translate the segment list of the
  * receive mbuf chain into DRQ entries on the card.
  *
  * The entries are, in order: an optional JK entry skipping rx->pre_skip
  * bytes, one or more entries per segment, and an optional JK entry
  * skipping rx->post_skip bytes.  The last entry gets the end flag and its
  * index is recorded in sc->drq[] for the interrupt routine.
  *
  * All positions are tracked in local copies and only committed to the
  * softc at the very end.  If the DRQ entries run out half way,
  * rx->wait is set to 1 and the function returns without committing
  * anything and without moving the card's write pointer.  If error is
  * non-zero the function returns immediately and leaves rx->wait alone.
  *
  * LOCK: locked, needed
  */
 static void
 en_rxdma_load(void *uarg, bus_dma_segment_t *segs, int nseg,
     bus_size_t mapsize, int error)
 {
 	struct rxarg *rx = uarg;
 	struct en_softc *sc = rx->sc;
 	struct en_rxslot *slot = rx->vc->rxslot;
 	u_int		free;		/* number of free DRQ entries */
 	uint32_t	cur;		/* current buffer offset */
 	uint32_t	drq;		/* DRQ entry pointer */
 	uint32_t	last_drq;	/* where we have written last */
 	u_int		needalign, cnt, count, bcode;
 	bus_addr_t	addr;
 	bus_size_t	rest;
 	int		i;
 
 	if (error != 0)
 		return;
 	if (nseg > EN_MAX_DMASEG)
 		panic("too many DMA segments");
 
 	rx->wait = 0;
 
 	/* work on local copies; committed only after all entries fit */
 	free = sc->drq_free;
 	drq = sc->drq_us;
 	cur = slot->cur;
 
 	last_drq = 0;
 
 	/*
 	 * Local macro to add an entry to the receive DMA area. If there
 	 * are no entries left, return. Save the byte offset of the entry
 	 * in last_drq for later use.
 	 *
 	 * The ENI entry format is used when ENI is non-zero or the card is
 	 * not an adaptec; otherwise the adaptec format is used.  Note that
 	 * the macro contains a return from the enclosing function.
 	 */
 #define PUT_DRQ_ENTRY(ENI, BCODE, COUNT, ADDR)				\
 	if (free == 0) {						\
 		EN_COUNT(sc->stats.rxdrqout);				\
 		rx->wait = 1;						\
 		return;							\
 	}								\
 	last_drq = drq;							\
 	en_write(sc, drq + 0, (ENI || !sc->is_adaptec) ?		\
 	    MID_MK_RXQ_ENI(COUNT, rx->vc->vcc.vci, 0, BCODE) :		\
 	    MID_MK_RXQ_ADP(COUNT, rx->vc->vcc.vci, 0, BCODE));		\
 	en_write(sc, drq + 4, ADDR);					\
 									\
 	EN_WRAPADD(MID_DRQOFF, MID_DRQEND, drq, 8);			\
 	free--;
 
 	/*
 	 * Local macro to generate a DMA entry to DMA cnt bytes. Updates
 	 * the current buffer byte offset accordingly.
 	 *
 	 * Uses the locals cnt (bytes), count and bcode (entry fields) and
 	 * addr; rest is reduced by cnt and addr advanced by cnt.
 	 */
 #define DO_DRQ(TYPE) do {						\
 	rest -= cnt;							\
 	EN_WRAPADD(slot->start, slot->stop, cur, cnt);			\
 	DBG(sc, SERV, ("rx%td: "TYPE" %u bytes, %ju left, cur %#x",	\
 	    slot - sc->rxslot, cnt, (uintmax_t)rest, cur));		\
 									\
 	PUT_DRQ_ENTRY(1, bcode, count, addr);				\
 									\
 	addr += cnt;							\
     } while (0)
 
 	/*
 	 * Skip the RBD at the beginning
 	 */
 	if (rx->pre_skip > 0) {
 		/* update DMA address */
 		EN_WRAPADD(slot->start, slot->stop, cur, rx->pre_skip);
 
 		PUT_DRQ_ENTRY(0, MIDDMA_JK, WORD_IDX(slot->start, cur), 0);
 	}
 
 	for (i = 0; i < nseg; i++, segs++) {
 		addr = segs->ds_addr;
 		rest = segs->ds_len;
 
 		if (sc->is_adaptec) {
 			/* adaptec card - simple */
 
 			/* advance the on-card buffer pointer */
 			EN_WRAPADD(slot->start, slot->stop, cur, rest);
 			DBG(sc, SERV, ("rx%td: adp %ju bytes %#jx "
 			    "(cur now 0x%x)", slot - sc->rxslot,
 			    (uintmax_t)rest, (uintmax_t)addr, cur));
 
 			/* one entry covers the whole segment */
 			PUT_DRQ_ENTRY(0, 0, rest, addr);
 
 			continue;
 		}
 
 		/*
 		 * do we need to do a DMA op to align to the maximum
 		 * burst? Note, that we are always 32-bit aligned.
 		 */
 		if (sc->alburst &&
 		    (needalign = (addr & sc->bestburstmask)) != 0) {
 			/* compute number of bytes, words and code */
 			cnt = sc->bestburstlen - needalign;
 			if (cnt > rest)
 				cnt = rest;
 			count = cnt / sizeof(uint32_t);
 			if (sc->noalbursts) {
 				bcode = MIDDMA_WORD;
 			} else {
 				/* burst code and count taken from en_dmaplan */
 				bcode = en_dmaplan[count].bcode;
 				count = cnt >> en_dmaplan[count].divshift;
 			}
 			DO_DRQ("al_dma");
 		}
 
 		/* do we need to do a max-sized burst? */
 		if (rest >= sc->bestburstlen) {
 			/* as many whole best-size bursts as fit into rest */
 			count = rest >> sc->bestburstshift;
 			cnt = count << sc->bestburstshift;
 			bcode = sc->bestburstcode;
 			DO_DRQ("best_dma");
 		}
 
 		/* do we need to do a cleanup burst? */
 		if (rest != 0) {
 			cnt = rest;
 			count = rest / sizeof(uint32_t);
 			if (sc->noalbursts) {
 				bcode = MIDDMA_WORD;
 			} else {
 				bcode = en_dmaplan[count].bcode;
 				count = cnt >> en_dmaplan[count].divshift;
 			}
 			DO_DRQ("clean_dma");
 		}
 	}
 
 	/*
 	 * Skip stuff at the end
 	 */
 	if (rx->post_skip > 0) {
 		/* update DMA address */
 		EN_WRAPADD(slot->start, slot->stop, cur, rx->post_skip);
 
 		PUT_DRQ_ENTRY(0, MIDDMA_JK, WORD_IDX(slot->start, cur), 0);
 	}
 
 	/* record the end for the interrupt routine */
 	sc->drq[MID_DRQ_A2REG(last_drq)] =
 	    EN_DQ_MK(slot - sc->rxslot, rx->m->m_pkthdr.len);
 
 	/* set the end flag in the last descriptor */
 	en_write(sc, last_drq + 0, SETQ_END(sc, en_read(sc, last_drq + 0)));
 
 #undef PUT_DRQ_ENTRY
 #undef DO_DRQ
 
 	/* commit */
 	slot->cur = cur;
 	sc->drq_free = free;
 	sc->drq_us = drq;
 
 	/* signal to card */
 	en_write(sc, MID_DMA_WRRX, MID_DRQ_A2REG(sc->drq_us));
 }
 
 /*
  * en_service: handle a service interrupt
  *
  * Works through the software service list.  For the VC at its head the
  * packets waiting in the receive slot are processed one by one: each is
  * either DMAed into an mbuf chain (queued on the slot's indma queue until
  * the DMA completes) or skipped with a single JK entry.  When the slot
  * holds no more data the VC is removed from the list and the next one is
  * serviced.  The function returns when the list is empty or when it runs
  * out of DRQ entries (sc->need_drqs is set in that case).
  *
  * Q: why do we need a software service list?
  *
  * A: if we remove a VCI from the hardware list and we find that we are
  *    out of DRQs we must defer processing until some DRQs become free.
  *    so we must remember to look at this RX VCI/slot later, but we can't
  *    put it back on the hardware service list (since that isn't allowed).
  *    so we instead save it on the software service list.   it would be nice
  *    if we could peek at the VCI on top of the hwservice list without removing
  *    it, however this leads to a race condition: if we peek at it and
  *    decide we are done with it new data could come in before we have a
  *    chance to remove it from the hwslist.   by the time we get it out of
  *    the list the interrupt for the new data will be lost.   oops!
  *
  * LOCK: locked, needed
  */
 static void
 en_service(struct en_softc *sc)
 {
 	struct mbuf	*m, *lastm;
 	struct en_map	*map;
 	struct rxarg	rx;
 	uint32_t	cur;
 	uint32_t	dstart;		/* data start (as reported by card) */
 	uint32_t	rbd;		/* receive buffer descriptor */
 	uint32_t	pdu;		/* AAL5 trailer */
 	int		mlen;
 	int		error;
 	struct en_rxslot *slot;
 	struct en_vcc *vc;
 
 	rx.sc = sc;
 
   next_vci:
 	if (sc->swsl_size == 0) {
 		DBG(sc, SERV, ("en_service done"));
 		return;
 	}
 
 	/*
 	 * get vcc to service
 	 */
 	rx.vc = vc = sc->vccs[sc->swslist[sc->swsl_head]];
 	slot = vc->rxslot;
 	KASSERT (slot->vcc->rxslot == slot, ("en_service: rx slot/vci sync"));
 
 	/*
 	 * determine our mode and if we've got any work to do
 	 */
 	DBG(sc, SERV, ("rx%td: service vci=%d start/stop/cur=0x%x 0x%x "
 	    "0x%x", slot - sc->rxslot, vc->vcc.vci, slot->start,
 	    slot->stop, slot->cur));
 
   same_vci:
 	cur = slot->cur;
 
 	/* card's data start pointer (in words) as a byte offset */
 	dstart = MIDV_DSTART(en_read(sc, MID_DST_RP(vc->vcc.vci)));
 	dstart = (dstart * sizeof(uint32_t)) + slot->start;
 
 	/* check to see if there is any data at all */
 	if (dstart == cur) {
 		EN_WRAPADD(0, MID_SL_N, sc->swsl_head, 1);
 		/* remove from swslist */
 		vc->vflags &= ~VCC_SWSL;
 		sc->swsl_size--;
 		DBG(sc, SERV, ("rx%td: remove vci %d from swslist",
 		    slot - sc->rxslot, vc->vcc.vci));
 		goto next_vci;
 	}
 
 	/*
 	 * figure out how many bytes we need
 	 * [mlen = # bytes to go in mbufs]
 	 */
 	rbd = en_read(sc, cur);
 	if (MID_RBD_ID(rbd) != MID_RBD_STDID)
 		panic("en_service: id mismatch");
 
 	if (rbd & MID_RBD_T) {
 		mlen = 0;		/* we've got trash */
 		rx.pre_skip = MID_RBD_SIZE;
 		rx.post_skip = 0;
 		EN_COUNT(sc->stats.ttrash);
 		DBG(sc, SERV, ("RX overflow lost %d cells!", MID_RBD_CNT(rbd)));
 
 	} else if (vc->vcc.aal != ATMIO_AAL_5) {
 		/* 1 cell (ick!) */
 		mlen = MID_CHDR_SIZE + MID_ATMDATASZ;
 		rx.pre_skip = MID_RBD_SIZE;
 		rx.post_skip = 0;
 
 	} else {
 		rx.pre_skip = MID_RBD_SIZE;
 
 		/* get PDU trailer in correct byte order */
 		pdu = cur + MID_RBD_CNT(rbd) * MID_ATMDATASZ +
 		    MID_RBD_SIZE - MID_PDU_SIZE;
 		if (pdu >= slot->stop)
 			pdu -= EN_RXSZ * 1024;
 		pdu = en_read(sc, pdu);
 
 		if (MID_RBD_CNT(rbd) * MID_ATMDATASZ <
 		    MID_PDU_LEN(pdu)) {
 			/* trailer claims more data than the cells hold */
 			device_printf(sc->dev, "invalid AAL5 length\n");
 			rx.post_skip = MID_RBD_CNT(rbd) * MID_ATMDATASZ;
 			mlen = 0;
 			if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1);
 
 		} else if (rbd & MID_RBD_CRCERR) {
 			device_printf(sc->dev, "CRC error\n");
 			rx.post_skip = MID_RBD_CNT(rbd) * MID_ATMDATASZ;
 			mlen = 0;
 			if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1);
 
 		} else {
 			/* good PDU: payload goes to mbufs, rest is skipped */
 			mlen = MID_PDU_LEN(pdu);
 			rx.post_skip = MID_RBD_CNT(rbd) * MID_ATMDATASZ - mlen;
 		}
 	}
 
 	/*
 	 * now allocate mbufs for mlen bytes of data, if out of mbufs, trash all
 	 *
 	 * notes:
 	 *  1. it is possible that we've already allocated an mbuf for this pkt
 	 *     but ran out of DRQs, in which case we saved the allocated mbuf
 	 *     on "q".
 	 *  2. if we save an buf in "q" we store the "cur" (pointer) in the
 	 *     buf as an identity (that we can check later).
 	 *  3. after this block of code, if m is still NULL then we ran out of
 	 *     mbufs
 	 */
 	_IF_DEQUEUE(&slot->q, m);
 	if (m != NULL) {
 		if (m->m_pkthdr.csum_data != cur) {
 			/* wasn't ours */
 			DBG(sc, SERV, ("rx%td: q'ed buf %p not ours",
 			    slot - sc->rxslot, m));
 			_IF_PREPEND(&slot->q, m);
 			m = NULL;
 			EN_COUNT(sc->stats.rxqnotus);
 		} else {
 			EN_COUNT(sc->stats.rxqus);
 			DBG(sc, SERV, ("rx%td: recovered q'ed buf %p",
 			    slot - sc->rxslot, m));
 		}
 	}
 	if (mlen == 0 && m != NULL) {
 		/* should not happen */
 		m_freem(m);
 		m = NULL;
 	}
 
 	if (mlen != 0 && m == NULL) {
 		m = en_mget(sc, mlen);
 		if (m == NULL) {
 			/* no buffer: the payload is skipped as well */
 			rx.post_skip += mlen;
 			mlen = 0;
 			EN_COUNT(sc->stats.rxmbufout);
 			DBG(sc, SERV, ("rx%td: out of mbufs",
 			    slot - sc->rxslot));
 		}
 
 		DBG(sc, SERV, ("rx%td: allocate buf %p, mlen=%d",
 		    slot - sc->rxslot, m, mlen));
 	}
 
 	/*
 	 * The chain built by en_mget covers mlen rounded up to whole words
 	 * and the DMA transfers the padding too, so the padding has to come
 	 * off the tail skip.  This applies to a chain recovered from "q"
 	 * just as much as to a fresh one: post_skip was recomputed from the
 	 * RBD above and the recovered chain still has its rounded lengths.
 	 */
 	if (m != NULL)
 		rx.post_skip -= roundup(mlen, sizeof(uint32_t)) - mlen;
 
 	DBG(sc, SERV, ("rx%td: VCI %d, rbuf %p, mlen %d, skip %u/%u",
 	    slot - sc->rxslot, vc->vcc.vci, m, mlen, rx.pre_skip,
 	    rx.post_skip));
 
 	if (m != NULL) {
 		/* M_NOWAIT - called from interrupt context */
 		map = uma_zalloc_arg(sc->map_zone, sc, M_NOWAIT);
 		if (map == NULL) {
 			/* give back payload plus padding removed above */
 			rx.post_skip += roundup(mlen, sizeof(uint32_t));
 			m_freem(m);
 			DBG(sc, SERV, ("rx%td: out of maps",
 			    slot - sc->rxslot));
 			goto skip;
 		}
 		rx.m = m;
 		error = bus_dmamap_load_mbuf(sc->txtag, map->map, m,
 		    en_rxdma_load, &rx, BUS_DMA_NOWAIT);
 
 		if (error != 0) {
 			device_printf(sc->dev, "loading RX map failed "
 			    "%d\n", error);
 			uma_zfree(sc->map_zone, map);
 			m_freem(m);
 			/* give back payload plus padding removed above */
 			rx.post_skip += roundup(mlen, sizeof(uint32_t));
 			goto skip;
 
 		}
 		map->flags |= ENMAP_LOADED;
 
 		if (rx.wait) {
 			/* out of DRQs - wait */
 			uma_zfree(sc->map_zone, map);
 
 			/* park the chain on "q", tagged with its position */
 			m->m_pkthdr.csum_data = cur;
 			_IF_ENQUEUE(&slot->q, m);
 			EN_COUNT(sc->stats.rxdrqout);
 
 			sc->need_drqs = 1;	/* flag condition */
 			return;
 
 		}
 		/* trim the word padding off the last mbuf of the chain */
 		(void)m_length(m, &lastm);
 		lastm->m_len -= roundup(mlen, sizeof(uint32_t)) - mlen;
 
 		/* the map rides along in rcvif until the DMA completes */
 		m->m_pkthdr.rcvif = (void *)map;
 		_IF_ENQUEUE(&slot->indma, m);
 
 		/* get next packet in this slot */
 		goto same_vci;
 	}
   skip:
 	/*
 	 * We end up here if the packet is to be dropped from the receive
 	 * buffer. The number of bytes to drop is pre_skip + post_skip. We
 	 * can do this with one JK entry. If we don't even have that one -
 	 * wait.
 	 */
 	if (sc->drq_free == 0) {
 		sc->need_drqs = 1;	/* flag condition */
 		return;
 	}
 	rx.post_skip += rx.pre_skip;
 	DBG(sc, SERV, ("rx%td: skipping %u", slot - sc->rxslot, rx.post_skip));
 
 	/* advance buffer address */
 	EN_WRAPADD(slot->start, slot->stop, cur, rx.post_skip);
 
 	/* write DRQ entry */
 	if (sc->is_adaptec)
 		en_write(sc, sc->drq_us,
 		    MID_MK_RXQ_ADP(WORD_IDX(slot->start, cur),
 		    vc->vcc.vci, MID_DMA_END, MIDDMA_JK));
 	else
 	  	en_write(sc, sc->drq_us,
 		    MID_MK_RXQ_ENI(WORD_IDX(slot->start, cur),
 		    vc->vcc.vci, MID_DMA_END, MIDDMA_JK));
 	en_write(sc, sc->drq_us + 4, 0);
 
 	/*
 	 * signal to RX interrupt.  The record must sit at the index of the
 	 * entry just written (as en_rxdma_load does with last_drq), so it
 	 * has to be stored before drq_us is advanced; en_intr_rx_dma looks
 	 * at sc->drq[] for each entry the chip has moved past.
 	 */
 	sc->drq[MID_DRQ_A2REG(sc->drq_us)] = EN_DQ_MK(slot - sc->rxslot, 0);
 
 	EN_WRAPADD(MID_DRQOFF, MID_DRQEND, sc->drq_us, 8);
 	sc->drq_free--;
 
 	slot->cur = cur;
 
 	/* signal to card */
 	en_write(sc, MID_DMA_WRRX, MID_DRQ_A2REG(sc->drq_us));
 
 	goto same_vci;
 }
 
 /*
  * Interrupt handler.
  *
  * Reads and acknowledges the interrupt status, resets the board on
  * unexpected error interrupts, and otherwise dispatches to the SUNI, TX,
  * TX-DMA, RX-DMA and service handlers before updating the trash
  * statistics.
  *
  * LOCK: unlocked, needed
  */
 void
 en_intr(void *arg)
 {
 	struct en_softc *sc = arg;
 	uint32_t status, txkick, bit;
 	int chan, softserv;
 
 	EN_LOCK(sc);
 
 	status = en_read(sc, MID_INTACK);
 	DBG(sc, INTR, ("interrupt=0x%b", status, MID_INTBITS));
 
 	/* nothing pending for us */
 	if ((status & MID_INT_ANY) == 0)
 		goto out;
 
 	/*
 	 * unexpected errors that need a reset
 	 */
 	if ((status & (MID_INT_IDENT | MID_INT_LERR | MID_INT_DMA_ERR)) != 0) {
 		device_printf(sc->dev, "unexpected interrupt=0x%b, "
 		    "resetting\n", status, MID_INTBITS);
 #ifdef EN_DEBUG
 		panic("en: unexpected error");
 #else
 		en_reset_ul(sc);
 		en_init(sc);
 #endif
 		goto out;
 	}
 
 	if (status & MID_INT_SUNI)
 		utopia_intr(&sc->utopia);
 
 	/* collect the TX channels that may have work to restart */
 	txkick = 0;
 	if (status & MID_INT_TX)
 		txkick |= en_intr_tx(sc, status);
 
 	if (status & MID_INT_DMA_TX)
 		txkick |= en_intr_tx_dma(sc);
 
 	/*
 	 * kick xmit channels as needed.
 	 */
 	if (txkick) {
 		DBG(sc, INTR, ("tx kick mask = 0x%x", txkick));
 		for (chan = 0, bit = 1; chan < EN_NTX; chan++, bit <<= 1) {
 			if ((txkick & bit) == 0)
 				continue;
 			if (_IF_QLEN(&sc->txslot[chan].q) != 0)
 				en_txdma(sc, &sc->txslot[chan]);
 		}
 	}
 
 	/* receive side: completed DMAs first, then new service requests */
 	softserv = 0;
 	if (status & MID_INT_DMA_RX)
 		softserv |= en_intr_rx_dma(sc);
 
 	if (status & MID_INT_SERVICE)
 		softserv |= en_intr_service(sc);
 
 	if (softserv)
 		en_service(sc);
 
 	/*
 	 * keep our stats
 	 */
 	if (status & MID_INT_DMA_OVR) {
 		EN_COUNT(sc->stats.dmaovr);
 		DBG(sc, INTR, ("MID_INT_DMA_OVR"));
 	}
 	status = en_read(sc, MID_STAT);
 	sc->stats.otrash += MID_OTRASH(status);
 	sc->stats.vtrash += MID_VTRASH(status);
 
   out:
 	EN_UNLOCK(sc);
 }
 
 /*
  * Read at most *n SUNI regs starting at reg into val.
  *
  * On entry *n is the number of registers requested; if the request would
  * run past register MID_NSUNI - 1 it is clamped and *n is updated to the
  * number of registers actually read. Returns EINVAL when reg is not a
  * valid register index, 0 otherwise.
  */
 static int
 en_utopia_readregs(struct ifatm *ifatm, u_int reg, uint8_t *val, u_int *n)
 {
 	struct en_softc *sc = ifatm->ifp->if_softc;
 	u_int i;
 
 	EN_CHECKLOCK(sc);
 	if (reg >= MID_NSUNI)
 		return (EINVAL);
 	/*
 	 * Compare against the remaining register count rather than
 	 * computing reg + *n, which could wrap around for a huge *n and
 	 * skip the clamp. reg < MID_NSUNI holds here, so the subtraction
 	 * cannot underflow.
 	 */
 	if (*n > MID_NSUNI - reg)
 		*n = MID_NSUNI - reg;
 
 	for (i = 0; i < *n; i++)
 		val[i] = en_read(sc, MID_SUNIOFF + 4 * (reg + i));
 
 	return (0);
 }
 
 /*
  * Read-modify-write of SUNI register reg: the bits selected by mask are
  * replaced by the corresponding bits of val, all other bits are kept.
  * Returns EINVAL for an out-of-range register index, 0 otherwise.
  */
 static int
 en_utopia_writereg(struct ifatm *ifatm, u_int reg, u_int mask, u_int val)
 {
 	struct en_softc *sc = ifatm->ifp->if_softc;
 	uint32_t old, merged;
 
 	EN_CHECKLOCK(sc);
 	if (reg >= MID_NSUNI)
 		return (EINVAL);
 
 	old = en_read(sc, MID_SUNIOFF + 4 * reg);
 	merged = (val & mask) | (old & ~mask);
 	en_write(sc, MID_SUNIOFF + 4 * reg, merged);
 
 	return (0);
 }
 
 /*
  * Register access callbacks handed to utopia_attach() in en_attach().
  * The initializer is positional: the read function first, then the
  * write function.
  */
 static const struct utopia_methods en_utopia_methods = {
 	en_utopia_readregs,
 	en_utopia_writereg
 };
 
 /*********************************************************************/
 /*
  * Probing the DMA brokenness of the card
  */
 
 /*
  * Callback for bus_dmamap_load() used by the DMA probe: on success store
  * the bus address of the first segment in the bus_addr_t that uarg points
  * to. On error the target is left untouched.
  *
  * LOCK: unlocked, not needed
  */
 static void
 en_dmaprobe_load(void *uarg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *physp = uarg;
 
 	if (error != 0)
 		return;
 	*physp = segs[0].ds_addr;
 }
 
 /*
  * en_dmaprobe: helper function for en_attach.
  *
  * see how the card handles DMA by running a few DMA tests.   we need
  * to figure out the largest number of bytes we can DMA in one burst
  * ("bestburstlen"), and if the starting address for a burst needs to
  * be aligned on any sort of boundary or not ("alburst").
  *
  * Things turn out more complex than that, because on my (harti) brand
  * new motherboard (2.4GHz) we can do 64byte aligned DMAs, but everything
  * larger than 4 bytes fails (with an RX DMA timeout) for physical
  * addresses that end with 0xc. Therefore we search not only for the largest
  * burst that is supported (hopefully 64) but also check what the largest
  * supported unaligned size is. If that appears to be less than 4 words,
  * set the noalbursts flag. That will be set only if also alburst is set.
  */
 
 /*
  * en_dmaprobe_doit: do actual testing for the DMA test.
  * Cycle through all burst sizes from 8 up to MIDDMA_MAXBURST (doubling
  * each time) and try whether it works: the sample data at sp is DMAed
  * into the card's buffer and then DMAed back to dp (sp + MIDDMA_MAXBURST),
  * and both are compared.
  *
  * sp/psp are the virtual and bus address of the test buffer; the buffer
  * must provide at least 2 * MIDDMA_MAXBURST bytes starting at sp.
  *
  * Returns the largest burst size that worked, or 4 if not even the
  * smallest tested size succeeded.
  *
  * LOCK: unlocked, not needed
  */
 static int
 en_dmaprobe_doit(struct en_softc *sc, uint8_t *sp, bus_addr_t psp)
 {
 	uint8_t *dp = sp + MIDDMA_MAXBURST;
 	bus_addr_t pdp = psp + MIDDMA_MAXBURST;
 	int lcv, retval = 4, cnt;
 	uint32_t reg, bcode, midvloc;
 
 	if (sc->en_busreset)
 		sc->en_busreset(sc);
 	en_write(sc, MID_RESID, 0x0);	/* reset card before touching RAM */
 
 	/*
 	 * set up a 1k buffer at MID_BUFOFF
 	 */
 	midvloc = ((MID_BUFOFF - MID_RAMOFF) / sizeof(uint32_t))
 	    >> MIDV_LOCTOPSHFT;
 	en_write(sc, MIDX_PLACE(0), MIDX_MKPLACE(en_k2sz(1), midvloc));
 	en_write(sc, MID_VC(0), (midvloc << MIDV_LOCSHIFT)
 	    | (en_k2sz(1) << MIDV_SZSHIFT) | MIDV_TRASH);
 	en_write(sc, MID_DST_RP(0), 0);
 	en_write(sc, MID_WP_ST_CNT(0), 0);
 
 	/* set up sample data */
 	for (lcv = 0 ; lcv < MIDDMA_MAXBURST; lcv++)
 		sp[lcv] = lcv + 1;
 
 	/* enable DMA (only) */
 	en_write(sc, MID_MAST_CSR, MID_MCSR_ENDMA);
 
 	sc->drq_chip = MID_DRQ_REG2A(en_read(sc, MID_DMA_RDRX));
 	sc->dtq_chip = MID_DTQ_REG2A(en_read(sc, MID_DMA_RDTX));
 
 	/*
 	 * try it now . . .  DMA it out, then DMA it back in and compare
 	 *
 	 * note: in order to get the dma stuff to reverse directions it wants
 	 * the "end" flag set!   since we are not dma'ing valid data we may
 	 * get an ident mismatch interrupt (which we will ignore).
 	 */
 	DBG(sc, DMA, ("test sp=%p/%#lx, dp=%p/%#lx",
 	    sp, (u_long)psp, dp, (u_long)pdp));
 	for (lcv = 8 ; lcv <= MIDDMA_MAXBURST ; lcv = lcv * 2) {
 		DBG(sc, DMA, ("test lcv=%d", lcv));
 
 		/* zero SRAM and dest buffer */
 		bus_space_set_region_4(sc->en_memt, sc->en_base,
 		    MID_BUFOFF, 0, 1024 / 4);
 		bzero(dp, MIDDMA_MAXBURST);
 
 		bcode = en_sz2b(lcv);
 
 		/* build lcv-byte-DMA x NBURSTS */
 		if (sc->is_adaptec)
 			en_write(sc, sc->dtq_chip,
 			    MID_MK_TXQ_ADP(lcv, 0, MID_DMA_END, 0));
 		else
 			en_write(sc, sc->dtq_chip,
 			    MID_MK_TXQ_ENI(1, 0, MID_DMA_END, bcode));
 		en_write(sc, sc->dtq_chip + 4, psp);
 		EN_WRAPADD(MID_DTQOFF, MID_DTQEND, sc->dtq_chip, 8);
 		en_write(sc, MID_DMA_WRTX, MID_DTQ_A2REG(sc->dtq_chip));
 
 		/* poll until the chip's TX read pointer catches up with us */
 		cnt = 1000;
 		while ((reg = en_readx(sc, MID_DMA_RDTX)) !=
 		    MID_DTQ_A2REG(sc->dtq_chip)) {
 			DELAY(1);
 			if (--cnt == 0) {
 				DBG(sc, DMA, ("unexpected timeout in tx "
 				    "DMA test\n  alignment=0x%lx, burst size=%d"
 				    ", dma addr reg=%#x, rdtx=%#x, stat=%#x\n",
 				    (u_long)sp & 63, lcv,
 				    en_read(sc, MID_DMA_ADDR), reg,
 				    en_read(sc, MID_INTSTAT)));
 				return (retval);
 			}
 		}
 
 		reg = en_read(sc, MID_INTACK);
 		if ((reg & MID_INT_DMA_TX) != MID_INT_DMA_TX) {
 			DBG(sc, DMA, ("unexpected status in tx DMA test: %#x\n",
 			    reg));
 			return (retval);
 		}
 		/* re-enable DMA (only) */
 		en_write(sc, MID_MAST_CSR, MID_MCSR_ENDMA);
 
 		/* "return to sender..."  address is known ... */
 
 		/* build lcv-byte-DMA x NBURSTS */
 		if (sc->is_adaptec)
 			en_write(sc, sc->drq_chip,
 			    MID_MK_RXQ_ADP(lcv, 0, MID_DMA_END, 0));
 		else
 			en_write(sc, sc->drq_chip,
 			    MID_MK_RXQ_ENI(1, 0, MID_DMA_END, bcode));
 		en_write(sc, sc->drq_chip + 4, pdp);
 		EN_WRAPADD(MID_DRQOFF, MID_DRQEND, sc->drq_chip, 8);
 		en_write(sc, MID_DMA_WRRX, MID_DRQ_A2REG(sc->drq_chip));
 
 		/*
 		 * Same polling scheme as for TX above. cnt is decremented
 		 * exactly once per iteration so that both directions use
 		 * the same timeout.
 		 */
 		cnt = 1000;
 		while ((reg = en_readx(sc, MID_DMA_RDRX)) !=
 		    MID_DRQ_A2REG(sc->drq_chip)) {
 			DELAY(1);
 			if (--cnt == 0) {
 				DBG(sc, DMA, ("unexpected timeout in rx "
 				    "DMA test, rdrx=%#x\n", reg));
 				return (retval);
 			}
 		}
 		reg = en_read(sc, MID_INTACK);
 		if ((reg & MID_INT_DMA_RX) != MID_INT_DMA_RX) {
 			DBG(sc, DMA, ("unexpected status in rx DMA "
 			    "test: 0x%x\n", reg));
 			return (retval);
 		}
 		if (bcmp(sp, dp, lcv)) {
 			DBG(sc, DMA, ("DMA test failed! lcv=%d, sp=%p, "
 			    "dp=%p", lcv, sp, dp));
 			return (retval);
 		}
 
 		/* this burst size works; remember it and try the next one */
 		retval = lcv;
 	}
 	return (retval);	/* studly 64 byte DMA present!  oh baby!! */
 }
 
 /*
  * Find the best DMA parameters
  *
  * Allocates a temporary DMA test buffer, runs en_dmaprobe_doit() once on
  * the aligned buffer and then at every 4 byte offset below MIDDMA_MAXBURST,
  * and stores the outcome in sc->alburst, sc->noalbursts and the
  * sc->bestburst* fields. Panics if the test buffer cannot be set up.
  *
  * LOCK: unlocked, not needed
  */
 static void
 en_dmaprobe(struct en_softc *sc)
 {
 	bus_dma_tag_t tag;
 	bus_dmamap_t map;
 	int err;
 	void *buffer;
 	int bestalgn, lcv, try, bestnoalgn;
 	bus_addr_t phys;
 	uint8_t *addr;
 
 	sc->alburst = 0;
 	sc->noalbursts = 0;
 
 	/*
 	 * Allocate some DMA-able memory.
 	 * We need 3 times the max burst size aligned to the max burst size.
 	 */
 	err = bus_dma_tag_create(bus_get_dma_tag(sc->dev), MIDDMA_MAXBURST, 0,
 	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
 	    3 * MIDDMA_MAXBURST, 1, 3 * MIDDMA_MAXBURST, 0,
 	    NULL, NULL, &tag);
 	if (err)
 		panic("%s: cannot create test DMA tag %d", __func__, err);
 
 	err = bus_dmamem_alloc(tag, &buffer, 0, &map);
 	if (err)
 		panic("%s: cannot allocate test DMA memory %d", __func__, err);
 
 	/* en_dmaprobe_load() stores the bus address of the buffer in phys */
 	err = bus_dmamap_load(tag, map, buffer, 3 * MIDDMA_MAXBURST,
 	    en_dmaprobe_load, &phys, BUS_DMA_NOWAIT);
 	if (err)
 		panic("%s: cannot load test DMA map %d", __func__, err);
 	addr = buffer;
 	DBG(sc, DMA, ("phys=%#lx addr=%p", (u_long)phys, addr));
 
 	/*
 	 * Now get the best burst size of the aligned case.
 	 */
 	bestalgn = bestnoalgn = en_dmaprobe_doit(sc, addr, phys);
 
 	/*
 	 * Now try unaligned: every 4 byte offset into the buffer. Keep the
 	 * smallest result, i.e. the size that works for all offsets.
 	 */
 	for (lcv = 4; lcv < MIDDMA_MAXBURST; lcv += 4) {
 		try = en_dmaprobe_doit(sc, addr + lcv, phys + lcv);
 
 		if (try < bestnoalgn)
 			bestnoalgn = try;
 	}
 
 	/*
 	 * NOTE(review): the threshold here is 32, while the commentary
 	 * earlier in this file speaks of "4 words" - confirm which one
 	 * is intended.
 	 */
 	if (bestnoalgn < bestalgn) {
 		sc->alburst = 1;
 		if (bestnoalgn < 32)
 			sc->noalbursts = 1;
 	}
 
 	sc->bestburstlen = bestalgn;
 	sc->bestburstshift = en_log2(bestalgn);
 	sc->bestburstmask = sc->bestburstlen - 1; /* must be power of 2 */
 	sc->bestburstcode = en_sz2b(bestalgn);
 
 	/*
 	 * Reset the chip before freeing the buffer. It may still be trying
 	 * to DMA.
 	 */
 	if (sc->en_busreset)
 		sc->en_busreset(sc);
 	en_write(sc, MID_RESID, 0x0);	/* reset card before touching RAM */
 
 	DELAY(10000);			/* may still do DMA */
 
 	/*
 	 * Free the DMA stuff
 	 */
 	bus_dmamap_unload(tag, map);
 	bus_dmamem_free(tag, buffer, map);
 	bus_dma_tag_destroy(tag);
 }
 
 /*********************************************************************/
 /*
  * Attach/detach.
  */
 
 /*
  * Attach to the card.
  *
  * Probes the on-board memory size and the DMA capabilities, fills in the
  * ATM MIB of the interface, creates the lock, sysctl nodes, DMA tag and
  * map zone, partitions the on-board buffer memory into transmit and
  * receive slots and finally attaches the interface.
  *
  * Returns 0 on success. On failure en_destroy() is called and -1 is
  * returned.
  *
  * LOCK: unlocked, not needed (but initialized)
  */
 int
 en_attach(struct en_softc *sc)
 {
 	struct ifnet *ifp = sc->ifp;
 	int sz;
 	uint32_t reg, lcv, check, ptr, sav, midvloc;
 
 #ifdef EN_DEBUG
 	sc->debug = EN_DEBUG;
 #endif
 
 	/*
 	 * Probe card to determine memory size.
 	 *
 	 * The stupid ENI card always reports to PCI that it needs 4MB of
 	 * space (2MB regs and 2MB RAM). If it has less than 2MB RAM the
 	 * addresses wrap in the RAM address space (i.e. on a 512KB card
 	 * addresses 0x3ffffc, 0x37fffc, and 0x2ffffc are aliases for
 	 * 0x27fffc  [note that RAM starts at offset 0x200000]).
 	 */
 
 	/* reset card before touching RAM */
 	if (sc->en_busreset)
 		sc->en_busreset(sc);
 	en_write(sc, MID_RESID, 0x0);
 
 	/*
 	 * Write each probe location with its own address and re-check all
 	 * previously written locations; a mismatch means the latest write
 	 * landed on an alias of an earlier location.
 	 */
 	for (lcv = MID_PROBEOFF; lcv <= MID_MAXOFF ; lcv += MID_PROBSIZE) {
 		en_write(sc, lcv, lcv);	/* data[address] = address */
 		for (check = MID_PROBEOFF; check < lcv ;check += MID_PROBSIZE) {
 			reg = en_read(sc, check);
 			if (reg != check)
 				/* found an alias! - quit */
 				goto done_probe;
 		}
 	}
   done_probe:
 	lcv -= MID_PROBSIZE;			/* take one step back */
 	sc->en_obmemsz = (lcv + 4) - MID_RAMOFF;
 
 	/*
 	 * determine the largest DMA burst supported
 	 */
 	en_dmaprobe(sc);
 
 	/*
 	 * "hello world"
 	 */
 
 	/* reset */
 	if (sc->en_busreset)
 		sc->en_busreset(sc);
 	en_write(sc, MID_RESID, 0x0);		/* reset */
 
 	/* zero memory */
 	bus_space_set_region_4(sc->en_memt, sc->en_base,
 	    MID_RAMOFF, 0, sc->en_obmemsz / 4);
 
 	reg = en_read(sc, MID_RESID);
 
 	device_printf(sc->dev, "ATM midway v%d, board IDs %d.%d, %s%s%s, "
 	    "%ldKB on-board RAM\n", MID_VER(reg), MID_MID(reg), MID_DID(reg), 
 	    (MID_IS_SABRE(reg)) ? "sabre controller, " : "",
 	    (MID_IS_SUNI(reg)) ? "SUNI" : "Utopia",
 	    (!MID_IS_SUNI(reg) && MID_IS_UPIPE(reg)) ? " (pipelined)" : "",
 	    (long)sc->en_obmemsz / 1024);
 
 	/*
 	 * fill in common ATM interface stuff
 	 */
 	IFP2IFATM(sc->ifp)->mib.hw_version = (MID_VER(reg) << 16) |
 	    (MID_MID(reg) << 8) | MID_DID(reg);
 	if (MID_DID(reg) & 0x4)
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_UTP_155;
 	else
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_MM_155;
 
 	IFP2IFATM(sc->ifp)->mib.pcr = ATM_RATE_155M;
 	IFP2IFATM(sc->ifp)->mib.vpi_bits = 0;
 	IFP2IFATM(sc->ifp)->mib.vci_bits = MID_VCI_BITS;
 	IFP2IFATM(sc->ifp)->mib.max_vccs = MID_N_VC;
 	IFP2IFATM(sc->ifp)->mib.max_vpcs = 0;
 
 	/* report the result of en_dmaprobe() */
 	if (sc->is_adaptec) {
 		IFP2IFATM(sc->ifp)->mib.device = ATM_DEVICE_ADP155P;
 		if (sc->bestburstlen == 64 && sc->alburst == 0)
 			device_printf(sc->dev,
 			    "passed 64 byte DMA test\n");
 		else
 			device_printf(sc->dev, "FAILED DMA TEST: "
 			    "burst=%d, alburst=%d\n", sc->bestburstlen,
 			    sc->alburst);
 	} else {
 		IFP2IFATM(sc->ifp)->mib.device = ATM_DEVICE_ENI155P;
 		device_printf(sc->dev, "maximum DMA burst length = %d "
 		    "bytes%s\n", sc->bestburstlen, sc->alburst ?
 		    sc->noalbursts ?  " (no large bursts)" : " (must align)" :
 		    "");
 	}
 
 	/*
 	 * link into network subsystem and prepare card
 	 */
 	sc->ifp->if_softc = sc;
 	ifp->if_flags = IFF_SIMPLEX;
 	ifp->if_ioctl = en_ioctl;
 	ifp->if_start = en_start;
 
 	/*
 	 * The lock and condition variable are set up before the first
 	 * "goto fail" because en_destroy() destroys both unconditionally.
 	 */
 	mtx_init(&sc->en_mtx, device_get_nameunit(sc->dev),
 	    MTX_NETWORK_LOCK, MTX_DEF);
 	cv_init(&sc->cv_close, "VC close");
 
 	/*
 	 * Make the sysctl tree
 	 */
 	sysctl_ctx_init(&sc->sysctl_ctx);
 
 	if ((sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_hw_atm), OID_AUTO,
 	    device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "")) == NULL)
 		goto fail;
 
 	if (SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "istats", CTLTYPE_OPAQUE | CTLFLAG_RD, sc, 0,
 	    en_sysctl_istats, "S", "internal statistics") == NULL)
 		goto fail;
 
 #ifdef EN_DEBUG
 	if (SYSCTL_ADD_UINT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "debug", CTLFLAG_RW , &sc->debug, 0, "") == NULL)
 		goto fail;
 #endif
 
 	IFP2IFATM(sc->ifp)->phy = &sc->utopia;
 	utopia_attach(&sc->utopia, IFP2IFATM(sc->ifp), &sc->media, &sc->en_mtx,
 	    &sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    &en_utopia_methods);
 	utopia_init_media(&sc->utopia);
 
 	/* zero-filled mbuf kept in the softc as sc->padbuf */
 	MGET(sc->padbuf, M_WAITOK, MT_DATA);
 	bzero(sc->padbuf->m_data, MLEN);
 
 	if (bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
 	    EN_TXSZ * 1024, EN_MAX_DMASEG, EN_TXSZ * 1024, 0,
 	    NULL, NULL, &sc->txtag))
 		goto fail;
 
 	sc->map_zone = uma_zcreate("en dma maps", sizeof(struct en_map),
 	    en_map_ctor, en_map_dtor, NULL, en_map_fini, UMA_ALIGN_PTR,
 	    UMA_ZONE_ZINIT);
 	if (sc->map_zone == NULL)
 		goto fail;
 	uma_zone_set_max(sc->map_zone, EN_MAX_MAPS);
 
 	/*
 	 * init softc
 	 */
 	sc->vccs = malloc(MID_N_VC * sizeof(sc->vccs[0]),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 
 	/*
 	 * Partition the on-board memory starting at MID_BUFOFF: first the
 	 * EN_NTX transmit slots of EN_TXSZ KB each, then as many receive
 	 * slots of EN_RXSZ KB as fit. sz tracks the bytes still available.
 	 */
 	sz = sc->en_obmemsz - (MID_BUFOFF - MID_RAMOFF);
 	ptr = sav = MID_BUFOFF;
 	ptr = roundup(ptr, EN_TXSZ * 1024);	/* align */
 	sz = sz - (ptr - sav);
 	if (EN_TXSZ*1024 * EN_NTX > sz) {
 		device_printf(sc->dev, "EN_NTX/EN_TXSZ too big\n");
 		goto fail;
 	}
 	for (lcv = 0 ;lcv < EN_NTX ;lcv++) {
 		sc->txslot[lcv].mbsize = 0;
 		sc->txslot[lcv].start = ptr;
 		ptr += (EN_TXSZ * 1024);
 		sz -= (EN_TXSZ * 1024);
 		sc->txslot[lcv].stop = ptr;
 		sc->txslot[lcv].nref = 0;
 		DBG(sc, INIT, ("tx%d: start 0x%x, stop 0x%x", lcv,
 		    sc->txslot[lcv].start, sc->txslot[lcv].stop));
 	}
 
 	sav = ptr;
 	ptr = roundup(ptr, EN_RXSZ * 1024);	/* align */
 	sz = sz - (ptr - sav);
 	sc->en_nrx = sz / (EN_RXSZ * 1024);
 	if (sc->en_nrx <= 0) {
 		device_printf(sc->dev, "EN_NTX/EN_TXSZ/EN_RXSZ too big\n");
 		goto fail;
 	}
 
 	/* 
 	 * ensure that there is always one VC slot on the service list free
 	 * so that we can tell the difference between a full and empty list.
 	 */
 	if (sc->en_nrx >= MID_N_VC)
 		sc->en_nrx = MID_N_VC - 1;
 
 	for (lcv = 0 ; lcv < sc->en_nrx ; lcv++) {
 		sc->rxslot[lcv].vcc = NULL;
 		midvloc = sc->rxslot[lcv].start = ptr;
 		ptr += (EN_RXSZ * 1024);
 		sz -= (EN_RXSZ * 1024);
 		sc->rxslot[lcv].stop = ptr;
 		/* precompute the mode word for this slot from its location */
 		midvloc = midvloc - MID_RAMOFF;
 		/* mask, cvt to words */
 		midvloc = (midvloc & ~((EN_RXSZ*1024) - 1)) >> 2;
 		/* we only want the top 11 bits */
 		midvloc = midvloc >> MIDV_LOCTOPSHFT;
 		midvloc = (midvloc & MIDV_LOCMASK) << MIDV_LOCSHIFT;
 		sc->rxslot[lcv].mode = midvloc | 
 		    (en_k2sz(EN_RXSZ) << MIDV_SZSHIFT) | MIDV_TRASH;
 
 		DBG(sc, INIT, ("rx%d: start 0x%x, stop 0x%x, mode 0x%x", lcv,
 		    sc->rxslot[lcv].start, sc->rxslot[lcv].stop,
 		    sc->rxslot[lcv].mode));
 	}
 
 	device_printf(sc->dev, "%d %dKB receive buffers, %d %dKB transmit "
 	    "buffers\n", sc->en_nrx, EN_RXSZ, EN_NTX, EN_TXSZ);
 	device_printf(sc->dev, "end station identifier (mac address) "
 	    "%6D\n", IFP2IFATM(sc->ifp)->mib.esi, ":");
 
 	/*
 	 * Start SUNI stuff. This will call our readregs/writeregs
 	 * functions and these assume the lock to be held so we must get it
 	 * here.
 	 */
 	EN_LOCK(sc);
 	utopia_start(&sc->utopia);
 	utopia_reset(&sc->utopia);
 	EN_UNLOCK(sc);
 
 	/*
 	 * final commit
 	 */
 	atm_ifattach(ifp); 
 
 #ifdef ENABLE_BPF
 	bpfattach(ifp, DLT_ATM_RFC1483, sizeof(struct atmllc));
 #endif
 
 	return (0);
 
  fail:
 	/* release whatever has been set up so far */
 	en_destroy(sc);
 	return (-1);
 }
 
 /*
  * Release everything en_attach() set up in the softc: the utopia state,
  * the VCC table and its entries, the pad mbuf, the DMA map zone and tag,
  * the sysctl context and finally the condition variable and the mutex.
  * No access to bus resources here.
  * No locking required here (interrupt is already disabled).
  *
  * LOCK: unlocked, needed (but destroyed)
  */
 void
 en_destroy(struct en_softc *sc)
 {
 	u_int vci;
 
 	/* the utopia functions assume the lock to be held */
 	if ((sc->utopia.state & UTP_ST_ATTACHED) != 0) {
 		EN_LOCK(sc);
 		utopia_stop(&sc->utopia);
 		utopia_detach(&sc->utopia);
 		EN_UNLOCK(sc);
 	}
 
 	/* get rid of sticky VCCs, then of the table itself */
 	if (sc->vccs != NULL) {
 		for (vci = 0; vci < MID_N_VC; vci++) {
 			if (sc->vccs[vci] == NULL)
 				continue;
 			uma_zfree(en_vcc_zone, sc->vccs[vci]);
 		}
 		free(sc->vccs, M_DEVBUF);
 	}
 
 	if (sc->padbuf != NULL) {
 		m_free(sc->padbuf);
 	}
 
 	/*
 	 * The map zone has to go before the tag: the zone's fini function
 	 * destroys the DMA maps using the tag.
 	 */
 	if (sc->map_zone != NULL) {
 		uma_zdestroy(sc->map_zone);
 	}
 	if (sc->txtag != NULL) {
 		bus_dma_tag_destroy(sc->txtag);
 	}
 
 	(void)sysctl_ctx_free(&sc->sysctl_ctx);
 
 	cv_destroy(&sc->cv_close);
 	mtx_destroy(&sc->en_mtx);
 }
 
 /*
  * Module event handler: create the VCC zone when the module is loaded
  * and destroy it again on unload. All other events are ignored.
  * Returns ENOMEM if the zone cannot be created, 0 otherwise.
  */
 int
 en_modevent(module_t mod __unused, int event, void *arg __unused)
 {
 
 	if (event == MOD_LOAD) {
 		en_vcc_zone = uma_zcreate("EN vccs", sizeof(struct en_vcc),
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 		if (en_vcc_zone == NULL)
 			return (ENOMEM);
 	} else if (event == MOD_UNLOAD) {
 		uma_zdestroy(en_vcc_zone);
 	}
 
 	return (0);
 }
 
 /*********************************************************************/
 /*
  * Debugging support
  */
 
 #ifdef EN_DDBHOOK
 /*
  * functions we can call from ddb
  */
 
 /*
  * en_dump: dump the state
  *
  * The END_* values are bit flags. en_dump() tests its level argument
  * against them to select which parts of the state get printed.
  */
 #define END_SWSL	0x00000040		/* swsl state */
 #define END_DRQ		0x00000020		/* drq state */
 #define END_DTQ		0x00000010		/* dtq state */
 #define END_RX		0x00000008		/* rx state */
 #define END_TX		0x00000004		/* tx state */
 #define END_MREGS	0x00000002		/* registers */
 #define END_STATS	0x00000001		/* dump stats */
 
 /*
  * Bit name string for the flags above; en_dump() passes it together with
  * level to a "%b" conversion.
  */
 #define END_BITS "\20\7SWSL\6DRQ\5DTQ\4RX\3TX\2MREGS\1STATS"
 
 /*
  * Print the counters of the statistics structure s with printf(), one
  * line per counter (the three mfix* counters share the first line).
  * With both NATM and NATM_STAT defined the natm_so* counters are
  * printed as well.
  */
 static void
 en_dump_stats(const struct en_stats *s)
 {
 	printf("en_stats:\n");
 	printf("\t%d/%d mfix (%d failed)\n", s->mfixaddr, s->mfixlen,
 	    s->mfixfail);
 	printf("\t%d rx dma overflow interrupts\n", s->dmaovr);
 	printf("\t%d times out of TX space and stalled\n", s->txoutspace);
 	printf("\t%d times out of DTQs\n", s->txdtqout);
 	printf("\t%d times launched a packet\n", s->launch);
 	printf("\t%d times pulled the hw service list\n", s->hwpull);
 	printf("\t%d times pushed a vci on the sw service list\n", s->swadd);
 	printf("\t%d times RX pulled an mbuf from Q that wasn't ours\n",
 	    s->rxqnotus);
 	printf("\t%d times RX pulled a good mbuf from Q\n", s->rxqus);
 	printf("\t%d times ran out of DRQs\n", s->rxdrqout);
 	printf("\t%d transmit packets dropped due to mbsize\n", s->txmbovr);
 	printf("\t%d cells trashed due to turned off rxvc\n", s->vtrash);
 	printf("\t%d cells trashed due to totally full buffer\n", s->otrash);
 	printf("\t%d cells trashed due almost full buffer\n", s->ttrash);
 	printf("\t%d rx mbuf allocation failures\n", s->rxmbufout);
 	printf("\t%d times out of tx maps\n", s->txnomap);
 #ifdef NATM
 #ifdef NATM_STAT
 	printf("\tnatmintr so_rcv: ok/drop cnt: %d/%d, ok/drop bytes: %d/%d\n",
 	    natm_sookcnt, natm_sodropcnt, natm_sookbytes, natm_sodropbytes);
 #endif
 #endif
 }
 
 /*
  * Print the midway registers together with the driver's view of the
  * DMA queue pointers, followed by the VCs that have a txspeed set and
  * the VCs that are bound to a receive slot.
  *
  * Entries of sc->vccs may be NULL (the table is allocated zeroed and
  * en_destroy() checks each entry before freeing it), so such entries
  * are skipped instead of being dereferenced.
  */
 static void
 en_dump_mregs(struct en_softc *sc)
 {
 	u_int cnt;
 
 	printf("mregs:\n");
 	printf("resid = 0x%x\n", en_read(sc, MID_RESID));
 	printf("interrupt status = 0x%b\n",
 	    (int)en_read(sc, MID_INTSTAT), MID_INTBITS);
 	printf("interrupt enable = 0x%b\n",
 	     (int)en_read(sc, MID_INTENA), MID_INTBITS);
 	printf("mcsr = 0x%b\n", (int)en_read(sc, MID_MAST_CSR), MID_MCSRBITS);
 	printf("serv_write = [chip=%u] [us=%u]\n", en_read(sc, MID_SERV_WRITE),
 	     MID_SL_A2REG(sc->hwslistp));
 	printf("dma addr = 0x%x\n", en_read(sc, MID_DMA_ADDR));
 	printf("DRQ: chip[rd=0x%x,wr=0x%x], sc[chip=0x%x,us=0x%x]\n",
 	    MID_DRQ_REG2A(en_read(sc, MID_DMA_RDRX)),
 	    MID_DRQ_REG2A(en_read(sc, MID_DMA_WRRX)), sc->drq_chip, sc->drq_us);
 	printf("DTQ: chip[rd=0x%x,wr=0x%x], sc[chip=0x%x,us=0x%x]\n",
 	    MID_DTQ_REG2A(en_read(sc, MID_DMA_RDTX)),
 	    MID_DTQ_REG2A(en_read(sc, MID_DMA_WRTX)), sc->dtq_chip, sc->dtq_us);
 
 	printf("  unusal txspeeds:");
 	for (cnt = 0 ; cnt < MID_N_VC ; cnt++)
 		if (sc->vccs[cnt] != NULL && sc->vccs[cnt]->txspeed)
 			printf(" vci%d=0x%x", cnt, sc->vccs[cnt]->txspeed);
 	printf("\n");
 
 	printf("  rxvc slot mappings:");
 	for (cnt = 0 ; cnt < MID_N_VC ; cnt++)
 		if (sc->vccs[cnt] != NULL && sc->vccs[cnt]->rxslot != NULL)
 			printf("  %d->%td", cnt,
 			    sc->vccs[cnt]->rxslot - sc->rxslot);
 	printf("\n");
 }
 
 /*
  * For each of the EN_NTX transmit slots print the software state kept in
  * sc->txslot[] followed by the values read from the card's MIDX_PLACE,
  * MIDX_READPTR and MIDX_DESCSTART registers of that slot.
  */
 static void
 en_dump_tx(struct en_softc *sc)
 {
 	u_int slot;
 
 	printf("tx:\n");
 	for (slot = 0 ; slot < EN_NTX; slot++) {
 		/* the value in brackets is cur's offset from start, divided by 4 */
 		printf("tx%d: start/stop/cur=0x%x/0x%x/0x%x [%d]  ", slot,
 		    sc->txslot[slot].start, sc->txslot[slot].stop,
 		    sc->txslot[slot].cur,
 		    (sc->txslot[slot].cur - sc->txslot[slot].start) / 4);
 		printf("mbsize=%d, bfree=%d\n", sc->txslot[slot].mbsize,
 		    sc->txslot[slot].bfree);
 		printf("txhw: base_address=0x%x, size=%u, read=%u, "
 		    "descstart=%u\n",
 		    (u_int)MIDX_BASE(en_read(sc, MIDX_PLACE(slot))), 
 		    MIDX_SZ(en_read(sc, MIDX_PLACE(slot))),
 		    en_read(sc, MIDX_READPTR(slot)),
 		    en_read(sc, MIDX_DESCSTART(slot)));
 	}
 }
 
 /*
  * Print the state of all sc->en_nrx receive slots. For a slot that is
  * bound to a VCC the VCI and the card's per-VC registers (MID_VC,
  * MID_DST_RP, MID_WP_ST_CNT) are printed as well; for an unbound slot
  * the line is simply terminated so that the output of consecutive slots
  * does not run together.
  */
 static void
 en_dump_rx(struct en_softc *sc)
 {
 	struct en_rxslot *slot;
 
 	printf("  recv slots:\n");
 	for (slot = sc->rxslot ; slot < &sc->rxslot[sc->en_nrx]; slot++) {
 		printf("rx%td: start/stop/cur=0x%x/0x%x/0x%x mode=0x%x ",
 		    slot - sc->rxslot, slot->start, slot->stop, slot->cur,
 		    slot->mode);
 		if (slot->vcc == NULL) {
 			/* no VC bound: just finish the line */
 			printf("\n");
 			continue;
 		}
 		printf("vci=%u\n", slot->vcc->vcc.vci);
 		printf("RXHW: mode=0x%x, DST_RP=0x%x, WP_ST_CNT=0x%x\n",
 		    en_read(sc, MID_VC(slot->vcc->vcc.vci)),
 		    en_read(sc, MID_DST_RP(slot->vcc->vcc.vci)),
 		    en_read(sc, MID_WP_ST_CNT(slot->vcc->vcc.vci)));
 	}
 }
 
 /*
  * Print the DTQ entries from sc->dtq_chip up to (but not including)
  * sc->dtq_us: for each entry the driver's shadow value from sc->dtq[],
  * the first word read from the card with its decoded fields, and the
  * second word at offset 4.
  *
  * This is only correct for non-adaptec adapters
  */
 static void
 en_dump_dtqs(struct en_softc *sc)
 {
 	uint32_t ptr, reg;
 
 	printf("  dtq [need_dtqs=%d,dtq_free=%d]:\n", sc->need_dtqs,
 	    sc->dtq_free);
 	ptr = sc->dtq_chip;
 	while (ptr != sc->dtq_us) {
 		reg = en_read(sc, ptr);
 		printf("\t0x%x=[%#x cnt=%d, chan=%d, end=%d, type=%d @ 0x%x]\n", 
 		    sc->dtq[MID_DTQ_A2REG(ptr)], reg, MID_DMA_CNT(reg),
 		    MID_DMA_TXCHAN(reg), (reg & MID_DMA_END) != 0,
 		    MID_DMA_TYPE(reg), en_read(sc, ptr + 4));
 		/* step to the next 8 byte entry within MID_DTQOFF..MID_DTQEND */
 		EN_WRAPADD(MID_DTQOFF, MID_DTQEND, ptr, 8);
 	}
 }
 
 /*
  * Print the DRQ entries from sc->drq_chip up to (but not including)
  * sc->drq_us: for each entry the driver's shadow value from sc->drq[],
  * the decoded fields of the first word read from the card, and the
  * second word at offset 4.
  */
 static void
 en_dump_drqs(struct en_softc *sc)
 {
 	uint32_t ptr, reg;
 
 	printf("  drq [need_drqs=%d,drq_free=%d]:\n", sc->need_drqs,
 	    sc->drq_free);
 	ptr = sc->drq_chip;
 	while (ptr != sc->drq_us) {
 		reg = en_read(sc, ptr);
 		printf("\t0x%x=[cnt=%d, chan=%d, end=%d, type=%d @ 0x%x]\n", 
 		    sc->drq[MID_DRQ_A2REG(ptr)], MID_DMA_CNT(reg),
 		    MID_DMA_RXVCI(reg), (reg & MID_DMA_END) != 0,
 		    MID_DMA_TYPE(reg), en_read(sc, ptr + 4));
 		/* step to the next 8 byte entry within MID_DRQOFF..MID_DRQEND */
 		EN_WRAPADD(MID_DRQOFF, MID_DRQEND, ptr, 8);
 	}
 }
 
 /*
  * Dump the state of one "en" device (unit >= 0) or of all of them
  * (unit == -1). level is a mask of END_* bits selecting what to print.
  * Always returns 0.
  *
  * Do not staticize - meant for calling from DDB!
  */
 int
 en_dump(int unit, int level)
 {
 	struct en_softc *sc;
 	devclass_t dc;
 	int u, nunits, pos;
 
 	dc = devclass_find("en");
 	if (dc == NULL) {
 		printf("%s: can't find devclass!\n", __func__);
 		return (0);
 	}
 
 	nunits = devclass_get_maxunit(dc);
 	for (u = 0; u < nunits; u++) {
 		sc = devclass_get_softc(dc, u);
 		/* skip missing units and units that were not asked for */
 		if (sc == NULL || (unit != -1 && unit != u))
 			continue;
 
 		device_printf(sc->dev, "dumping device at level 0x%b\n",
 		    level, END_BITS);
 
 		if (sc->dtq_us == 0) {
 			printf("<hasn't been en_init'd yet>\n");
 			continue;
 		}
 
 		if ((level & END_STATS) != 0)
 			en_dump_stats(&sc->stats);
 		if ((level & END_MREGS) != 0)
 			en_dump_mregs(sc);
 		if ((level & END_TX) != 0)
 			en_dump_tx(sc);
 		if ((level & END_RX) != 0)
 			en_dump_rx(sc);
 		if ((level & END_DTQ) != 0)
 			en_dump_dtqs(sc);
 		if ((level & END_DRQ) != 0)
 			en_dump_drqs(sc);
 
 		if ((level & END_SWSL) == 0)
 			continue;
 
 		/* walk the software service list from head to tail */
 		printf(" swslist [size=%d]: ", sc->swsl_size);
 		pos = sc->swsl_head;
 		while (pos != sc->swsl_tail) {
 			printf("0x%x ", sc->swslist[pos]);
 			pos = (pos + 1) % MID_SL_N;
 		}
 		printf("\n");
 	}
 	return (0);
 }
 
 /*
  * en_dumpmem: dump the memory
  *
  * Prints len 32 bit words read from the card of unit "unit", starting at
  * offset addr (rounded down to a multiple of 4). The range must lie
  * between MID_RAMOFF and MID_MAXOFF. Always returns 0.
  *
  * Do not staticize - meant for calling from DDB!
  */
 int
 en_dumpmem(int unit, int addr, int len)
 {
 	struct en_softc *sc;
 	uint32_t reg;
 	devclass_t dc;
 
 	dc = devclass_find("en");
 	if (dc == NULL) {
 		printf("%s: can't find devclass\n", __func__);
 		return (0);
 	}
 	sc = devclass_get_softc(dc, unit);
 	if (sc == NULL) {
 		printf("%s: invalid unit number: %d\n", __func__, unit);
 		return (0);
 	}
 
 	addr = addr & ~3;
 	/*
 	 * Validate len first and compare it against the room left above
 	 * addr, instead of computing addr + len * 4, which can overflow
 	 * for large len. For all non-overflowing inputs this accepts
 	 * exactly the same ranges as "addr + len * 4 <= MID_MAXOFF".
 	 */
 	if (len <= 0 || addr < MID_RAMOFF || addr > MID_MAXOFF ||
 	    len > (MID_MAXOFF - addr) / 4) {
 		printf("invalid addr/len number: %d, %d\n", addr, len);
 		return (0);
 	}
 	printf("dumping %d words starting at offset 0x%x\n", len, addr);
 	while (len--) {
 		reg = en_read(sc, addr);
 		printf("mem[0x%x] = 0x%x\n", addr, reg);
 		addr += 4;
 	}
 	return (0);
 }
 #endif
Index: head/sys/dev/fatm/if_fatm.c
===================================================================
--- head/sys/dev/fatm/if_fatm.c	(revision 276691)
+++ head/sys/dev/fatm/if_fatm.c	(revision 276692)
@@ -1,3091 +1,3091 @@
 /*-
  * Copyright (c) 2001-2003
  *	Fraunhofer Institute for Open Communication Systems (FhG Fokus).
  * 	All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Author: Hartmut Brandt <harti@freebsd.org>
  *
  * Fore PCA200E driver for NATM
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_natm.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/errno.h>
 #include <sys/conf.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/syslog.h>
 #include <sys/endian.h>
 #include <sys/sysctl.h>
 #include <sys/condvar.h>
 #include <vm/uma.h>
 
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_atm.h>
 #include <net/route.h>
 #ifdef ENABLE_BPF
 #include <net/bpf.h>
 #endif
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/if_atm.h>
 #endif
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include <dev/utopia/utopia.h>
 
 #include <dev/fatm/if_fatmreg.h>
 #include <dev/fatm/if_fatmvar.h>
 
 #include <dev/fatm/firmware.h>
 
 /* Device class handle for the fatm driver. */
 devclass_t fatm_devclass;
 
 /*
  * PCI vendor/device id pairs recognized by fatm_probe() together with the
  * description string set for a match. The list ends at the entry whose
  * name is NULL.
  */
 static const struct {
 	uint16_t	vid;
 	uint16_t	did;
 	const char	*name;
 } fatm_devs[] = {
 	{ 0x1127, 0x300,
 	  "FORE PCA200E" },
 	{ 0, 0, NULL }
 };
 
 /*
  * Table of (ratio, cell_rate) pairs. The entries themselves are pulled in
  * from if_fatm_rate.h.
  */
 static const struct rate {
 	uint32_t	ratio;
 	uint32_t	cell_rate;
 } rate_table[] = {
 #include <dev/fatm/if_fatm_rate.h>
 };
 /* Number of entries in rate_table. */
 #define RATE_TABLE_SIZE (sizeof(rate_table) / sizeof(rate_table[0]))
 
 SYSCTL_DECL(_hw_atm);
 
 MODULE_DEPEND(fatm, utopia, 1, 1, 1);
 
 /* Forward declarations needed by the method table below. */
 static int	fatm_utopia_readregs(struct ifatm *, u_int, uint8_t *, u_int *);
 static int	fatm_utopia_writereg(struct ifatm *, u_int, u_int, u_int);
 
 /* Register read/write methods, in the order struct utopia_methods lists them. */
 static const struct utopia_methods fatm_utopia_methods = {
 	fatm_utopia_readregs,
 	fatm_utopia_writereg
 };
 
 /*
  * True if VPI has no bits set above mib.vpi_bits and VCI is non-zero and
  * has no bits set above mib.vci_bits of the interface belonging to SC.
  */
 #define VC_OK(SC, VPI, VCI)						\
 	(((VPI) & ~((1 << IFP2IFATM((SC)->ifp)->mib.vpi_bits) - 1)) == 0 &&	\
 	 (VCI) != 0 && ((VCI) & ~((1 << IFP2IFATM((SC)->ifp)->mib.vci_bits) - 1)) == 0)
 
 static int fatm_load_vc(struct fatm_softc *sc, struct card_vcc *vc);
 
 /*
  * Probe: walk the table of known vendor/device ids and compare each entry
  * with the ids of the device. On a match set the device description and
  * claim the device with BUS_PROBE_DEFAULT, otherwise return ENXIO.
  */
 static int
 fatm_probe(device_t dev)
 {
 	int idx = 0;
 
 	while (fatm_devs[idx].name != NULL) {
 		if (pci_get_vendor(dev) == fatm_devs[idx].vid &&
 		    pci_get_device(dev) == fatm_devs[idx].did) {
 			device_set_desc(dev, fatm_devs[idx].name);
 			return (BUS_PROBE_DEFAULT);
 		}
 		idx++;
 	}
 	return (ENXIO);
 }
 
 /*
  * Completion callback for a SUNI write register command.
  * This is called from the interrupt handler while holding the softc lock.
  * The queue entry is the rendezvous point: the thread that issued the
  * command sleeps on it and is woken up here. If the card flagged an error,
  * it is counted and EIO is left in the entry for the sleeper.
  */
 static void
 fatm_utopia_writeregs_complete(struct fatm_softc *sc, struct cmdqueue *q)
 {
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if ((H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) != 0) {
 		q->error = EIO;
 		sc->istats.suni_reg_errors++;
 	}
 	wakeup(q);
 }
 
 /*
  * Write a SUNI register. The bits that are 1 in mask are written from val
  * into register reg. We wait for the command to complete by sleeping on
  * the command queue entry, which the completion callback wakes up.
  *
  * Returns 0 on success. Returns EIO if the interface is not running, the
  * next command queue entry is not free, the sleep timed out or the card
  * reported an error. A restartable interruption of the sleep is reported
  * as EINTR; any other msleep() error is passed through unchanged.
  *
  * We assume, that we already hold the softc mutex.
  */
 static int
 fatm_utopia_writereg(struct ifatm *ifatm, u_int reg, u_int mask, u_int val)
 {
 	int error;
 	struct cmdqueue *q;
 	struct fatm_softc *sc;
 
 	sc = ifatm->ifp->if_softc;
 	FATM_CHECKLOCK(sc);
 	if (!(ifatm->ifp->if_drv_flags & IFF_DRV_RUNNING))
 		return (EIO);
 
 	/* get queue element and fill it */
 	q = GET_QUEUE(sc->cmdqueue, struct cmdqueue, sc->cmdqueue.head);
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if (!(H_GETSTAT(q->q.statp) & FATM_STAT_FREE)) {
 		sc->istats.cmd_queue_full++;
 		return (EIO);
 	}
 	NEXT_QUEUE_ENTRY(sc->cmdqueue.head, FATM_CMD_QLEN);
 
 	/* mark the entry pending before the card gets to see the command */
 	q->error = 0;
 	q->cb = fatm_utopia_writeregs_complete;
 	H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 	H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 
 	/* the opcode is written last, after the buffer word */
 	WRITE4(sc, q->q.card + FATMOC_GETOC3_BUF, 0);
 	BARRIER_W(sc);
 	WRITE4(sc, q->q.card + FATMOC_OP,
 	    FATM_MAKE_SETOC3(reg, val, mask) | FATM_OP_INTERRUPT_SEL);
 	BARRIER_W(sc);
 
 	/*
 	 * Wait for the command to complete
 	 */
 	error = msleep(q, &sc->mtx, PZERO | PCATCH, "fatm_setreg", hz);
 
 	switch(error) {
 
 	  case EWOULDBLOCK:
 		/* timed out */
 		error = EIO;
 		break;
 
 	  case ERESTART:
 		error = EINTR;
 		break;
 
 	  case 0:
 		/* woken by the completion callback; pick up its verdict */
 		error = q->error;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Completion callback for a SUNI read registers command.
  * This is called from the interrupt handler while holding the softc lock.
  * reg_mem is the rendezvous point: the thread that issued the command
  * sleeps on its address and is woken up here. If the card flagged an
  * error, it is counted and EIO is left in the queue entry.
  */
 static void
 fatm_utopia_readregs_complete(struct fatm_softc *sc, struct cmdqueue *q)
 {
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if ((H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) != 0) {
 		q->error = EIO;
 		sc->istats.suni_reg_errors++;
 	}
 	wakeup(&sc->reg_mem);
 }
 
 /*
  * Read SUNI registers
  *
  * We use a preallocated buffer to read the registers. Therefore we need
  * to protect against multiple threads trying to read registers. We do this
  * with a condition variable and a flag. We wait for the command to complete
  * by sleeping on the register memory.
  *
  * On success 0 is returned, the buffer holds the register values (each
  * converted to host byte order and masked to 8 bits) and is still marked
  * FATM_REGS_INUSE: the caller has to clear the flag and signal cv_regs
  * when done with the data. On any failure after the buffer was claimed it
  * is released here before the error is returned.
  *
  * We assume, that we already hold the softc mutex.
  */
 static int
 fatm_utopia_readregs_internal(struct fatm_softc *sc)
 {
 	int error, i;
 	uint32_t *ptr;
 	struct cmdqueue *q;
 
 	/* get the buffer */
 	for (;;) {
 		if (!(sc->ifp->if_drv_flags & IFF_DRV_RUNNING))
 			return (EIO);
 		if (!(sc->flags & FATM_REGS_INUSE))
 			break;
 		cv_wait(&sc->cv_regs, &sc->mtx);
 	}
 	sc->flags |= FATM_REGS_INUSE;
 
 	q = GET_QUEUE(sc->cmdqueue, struct cmdqueue, sc->cmdqueue.head);
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if (!(H_GETSTAT(q->q.statp) & FATM_STAT_FREE)) {
 		sc->istats.cmd_queue_full++;
 		/*
 		 * Give the buffer back. Leaving the flag set here would
 		 * block every later reader in the cv_wait() above forever.
 		 */
 		sc->flags &= ~FATM_REGS_INUSE;
 		cv_signal(&sc->cv_regs);
 		return (EIO);
 	}
 	NEXT_QUEUE_ENTRY(sc->cmdqueue.head, FATM_CMD_QLEN);
 
 	q->error = 0;
 	q->cb = fatm_utopia_readregs_complete;
 	H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 	H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 
 	bus_dmamap_sync(sc->reg_mem.dmat, sc->reg_mem.map, BUS_DMASYNC_PREREAD);
 
 	WRITE4(sc, q->q.card + FATMOC_GETOC3_BUF, sc->reg_mem.paddr);
 	BARRIER_W(sc);
 	WRITE4(sc, q->q.card + FATMOC_OP,
 	    FATM_OP_OC3_GET_REG | FATM_OP_INTERRUPT_SEL);
 	BARRIER_W(sc);
 
 	/*
 	 * Wait for the command to complete
 	 */
 	error = msleep(&sc->reg_mem, &sc->mtx, PZERO | PCATCH,
 	    "fatm_getreg", hz);
 
 	switch(error) {
 
 	  case EWOULDBLOCK:
 		/* timed out */
 		error = EIO;
 		break;
 
 	  case ERESTART:
 		error = EINTR;
 		break;
 
 	  case 0:
 		bus_dmamap_sync(sc->reg_mem.dmat, sc->reg_mem.map,
 		    BUS_DMASYNC_POSTREAD);
 		error = q->error;
 		break;
 	}
 
 	if (error != 0) {
 		/* declare buffer to be free */
 		sc->flags &= ~FATM_REGS_INUSE;
 		cv_signal(&sc->cv_regs);
 		return (error);
 	}
 
 	/* swap if needed */
 	ptr = (uint32_t *)sc->reg_mem.mem;
 	for (i = 0; i < FATM_NREGS; i++)
 		ptr[i] = le32toh(ptr[i]) & 0xff;
 
 	return (0);
 }
 
 /*
  * Read SUNI registers for the SUNI module.
  *
  * Copies up to *np register values starting with register reg into valp,
  * one byte per register. If the request runs past the last register, *np
  * is reduced to the number of registers actually returned. Returns EINVAL
  * if reg is out of range, the error of the register read command if that
  * fails and 0 otherwise.
  *
  * We assume, that we already hold the mutex.
  */
 static int
 fatm_utopia_readregs(struct ifatm *ifatm, u_int reg, uint8_t *valp, u_int *np)
 {
 	int err;
 	u_int i;
 	struct fatm_softc *sc;
 
 	if (reg >= FATM_NREGS)
 		return (EINVAL);
 	/*
 	 * Compare against the remaining room; reg + *np could wrap around
 	 * for a huge *np and slip past the clamp.
 	 */
 	if (*np > FATM_NREGS - reg)
 		*np = FATM_NREGS - reg;
 	sc = ifatm->ifp->if_softc;
 	FATM_CHECKLOCK(sc);
 
 	err = fatm_utopia_readregs_internal(sc);
 	if (err != 0)
 		return (err);
 
 	for (i = 0; i < *np; i++)
 		valp[i] = ((uint32_t *)sc->reg_mem.mem)[reg + i];
 
 	/* declare buffer to be free */
 	sc->flags &= ~FATM_REGS_INUSE;
 	cv_signal(&sc->cv_regs);
 
 	return (0);
 }
 
 /*
  * Check whether the heart is beating. We remember the last heart beat and
  * compare it to the current one. Once it has appeared stuck for 10 checks
  * in a row, we log the problem and write 1 to the HIMR register; after
  * that no further checking is done.
  *
  * Assume we hold the lock.
  */
 static void
 fatm_check_heartbeat(struct fatm_softc *sc)
 {
 	uint32_t beat;
 
 	FATM_CHECKLOCK(sc);
 
 	beat = READ4(sc, FATMO_HEARTBEAT);
 	DBG(sc, BEAT, ("heartbeat %08x", beat));
 
 	/* already given up on this card */
 	if (sc->stop_cnt == 10)
 		return;
 
 	if (beat != sc->heartbeat) {
 		/* still alive: remember the new value, start counting anew */
 		sc->heartbeat = beat;
 		sc->stop_cnt = 0;
 		return;
 	}
 
 	sc->stop_cnt++;
 	if (sc->stop_cnt == 10) {
 		log(LOG_ERR, "i960 stopped???\n");
 		WRITE4(sc, FATMO_HIMR, 1);
 	}
 }
 
 /*
  * Watchdog callout: check the heartbeat of the card and re-arm the
  * callout so that we are called again hz * 5 ticks from now.
  */
 static void
 fatm_watchdog(void *arg)
 {
 	struct fatm_softc *sc = arg;
 
 	FATM_CHECKLOCK(sc);
 	fatm_check_heartbeat(sc);
 	callout_reset(&sc->watchdog_timer, hz * 5, fatm_watchdog, sc);
 }
 
 /*
  * Hard reset the i960 on the board. This is done by initializing registers,
  * clearing interrupts and waiting for the selftest to finish. Not sure,
  * whether all these barriers are actually needed.
  *
  * Returns 0 as soon as the boot status register reads SELF_TEST_OK and EIO
  * if it reads SELF_TEST_FAIL or shows neither value within 100 polls that
  * are DELAY(1000) apart.
  *
  * Assumes that we hold the lock.
  */
 static int
 fatm_reset(struct fatm_softc *sc)
 {
 	int w;
 	uint32_t val;
 
 	FATM_CHECKLOCK(sc);
 
 	WRITE4(sc, FATMO_APP_BASE, FATMO_COMMON_ORIGIN);
 	BARRIER_W(sc);
 
 	WRITE4(sc, FATMO_UART_TO_960, XMIT_READY);
 	BARRIER_W(sc);
 
 	WRITE4(sc, FATMO_UART_TO_HOST, XMIT_READY);
 	BARRIER_W(sc);
 
 	WRITE4(sc, FATMO_BOOT_STATUS, COLD_START);
 	BARRIER_W(sc);
 
 	/* set the reset bit in the HCR, wait, then clear it again */
 	WRITE1(sc, FATMO_HCR, FATM_HCR_RESET);
 	BARRIER_W(sc);
 
 	DELAY(1000);
 
 	WRITE1(sc, FATMO_HCR, 0);
 	BARRIER_RW(sc);
 
 	DELAY(1000);
 
 	/* poll the boot status for the result of the self test */
 	for (w = 100; w; w--) {
 		BARRIER_R(sc);
 		val = READ4(sc, FATMO_BOOT_STATUS);
 		switch (val) {
 		  case SELF_TEST_OK:
 			return (0);
 		  case SELF_TEST_FAIL:
 			return (EIO);
 		}
 		DELAY(1000);
 	}
 	/* no verdict in time */
 	return (EIO);
 }
 
 /*
  * Stop the card. Must be called WITH the lock held
  * Reset, free transmit and receive buffers. Wakeup everybody who may sleep.
  *
  * The card is reset and the watchdog stopped unconditionally. Buffers are
  * collected and sleepers woken up only if the interface was marked running.
  * The buffer counters and the per-VCC state are reset in any case.
  */
 static void
 fatm_stop(struct fatm_softc *sc)
 {
 	int i;
 	struct cmdqueue *q;
 	struct rbuf *rb;
 	struct txqueue *tx;
 	uint32_t stat;
 
 	FATM_CHECKLOCK(sc);
 
 	/* Stop the board */
 	utopia_stop(&sc->utopia);
 	(void)fatm_reset(sc);
 
 	/* stop watchdog */
 	callout_stop(&sc->watchdog_timer);
 
 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 		ATMEV_SEND_IFSTATE_CHANGED(IFP2IFATM(sc->ifp),
 		    sc->utopia.carrier == UTP_CARR_OK);
 
 		/*
 		 * Collect transmit mbufs, partial receive mbufs and
 		 * supplied mbufs
 		 */
 		for (i = 0; i < FATM_TX_QLEN; i++) {
 			tx = GET_QUEUE(sc->txqueue, struct txqueue, i);
 			if (tx->m) {
 				bus_dmamap_unload(sc->tx_tag, tx->map);
 				m_freem(tx->m);
 				tx->m = NULL;
 			}
 		}
 
 		/* Collect supplied mbufs */
 		while ((rb = LIST_FIRST(&sc->rbuf_used)) != NULL) {
 			LIST_REMOVE(rb, link);
 			bus_dmamap_unload(sc->rbuf_tag, rb->map);
 			m_free(rb->m);
 			rb->m = NULL;
 			LIST_INSERT_HEAD(&sc->rbuf_free, rb, link);
 		}
 
 		/* Unwait any waiters */
 		wakeup(&sc->sadi_mem);
 
 		/* wakeup all threads waiting for STAT or REG buffers */
 		cv_broadcast(&sc->cv_stat);
 		cv_broadcast(&sc->cv_regs);
 
 		sc->flags &= ~(FATM_STAT_INUSE | FATM_REGS_INUSE);
 
 		/* wakeup all threads waiting on commands */
 		for (i = 0; i < FATM_CMD_QLEN; i++) {
 			q = GET_QUEUE(sc->cmdqueue, struct cmdqueue, i);
 
 			H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 			/* flag every entry that is not free as failed */
 			if ((stat = H_GETSTAT(q->q.statp)) != FATM_STAT_FREE) {
 				H_SETSTAT(q->q.statp, stat | FATM_STAT_ERROR);
 				H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 				wakeup(q);
 			}
 		}
 		utopia_reset_media(&sc->utopia);
 	}
 	sc->small_cnt = sc->large_cnt = 0;
 
 	/* Reset vcc info */
 	if (sc->vccs != NULL) {
 		sc->open_vccs = 0;
 		for (i = 0; i < FORE_MAX_VCC + 1; i++) {
 			if (sc->vccs[i] != NULL) {
 				/*
 				 * Entries that are neither open nor being
 				 * opened are freed. The others are kept with
 				 * their flags cleared and are counted as open.
 				 */
 				if ((sc->vccs[i]->vflags & (FATM_VCC_OPEN |
 				    FATM_VCC_TRY_OPEN)) == 0) {
 					uma_zfree(sc->vcc_zone, sc->vccs[i]);
 					sc->vccs[i] = NULL;
 				} else {
 					sc->vccs[i]->vflags = 0;
 					sc->open_vccs++;
 				}
 			}
 		}
 	}
 
 }
 
 /*
  * Copy the firmware image word by word into the memory of the board at
  * the offset named in the image and return the entry point named there.
  */
 static uint32_t
 firmware_load(struct fatm_softc *sc)
 {
 	struct firmware *image;
 
 	image = (struct firmware *)firmware;
 	DBG(sc, INIT, ("loading - entry=%x", image->entry));
 
 	bus_space_write_region_4(sc->memt, sc->memh, image->offset, firmware,
 	    sizeof(firmware) / sizeof(firmware[0]));
 	BARRIER_RW(sc);
 
 	return (image->entry);
 }
 
 /*
  * Read a character from the virtual UART. The card signals that a
  * character is available by a non-zero value in the 32 bit register; we
  * tell the card that we have consumed it by setting the register to zero.
  * The register is polled up to 50 times with a DELAY(1000) after each
  * miss. Returns the low 8 bits of the register value or -1 if nothing
  * showed up.
  */
 static int
 rx_getc(struct fatm_softc *sc)
 {
 	int tries;
 	int c;
 
 	for (tries = 50; tries > 0; tries--) {
 		c = READ4(sc, FATMO_UART_TO_HOST);
 		BARRIER_RW(sc);
 		if (c == 0) {
 			DELAY(1000);
 			continue;
 		}
 		WRITE4(sc, FATMO_UART_TO_HOST, 0);
 		DBGC(sc, UART, ("%c", c & 0xff));
 		return (c & 0xff);
 	}
 	return (-1);
 }
 
 /*
  * Read characters from the board and throw them away until rx_getc()
  * reports that nothing is pending or 10000 characters have been dropped.
  */
 static void
 rx_flush(struct fatm_softc *sc)
 {
 	int left;
 
 	for (left = 10000; left > 0; left--) {
 		if (rx_getc(sc) < 0)
 			break;
 	}
 }
 
 /*
  * Write a character to the card. The UART can take a character when the
  * register reads zero. The register is polled up to 10 times with a
  * DELAY(1000) after each busy reading. Returns 0 if the character was
  * written and -1 if the register never became free.
  */
 static int
 tx_putc(struct fatm_softc *sc, u_char c)
 {
 	int attempts;
 	int busy;
 
 	for (attempts = 10; attempts > 0; attempts--) {
 		busy = READ4(sc, FATMO_UART_TO_960);
 		BARRIER_RW(sc);
 		if (busy != 0) {
 			DELAY(1000);
 			continue;
 		}
 		WRITE4(sc, FATMO_UART_TO_960, c | CHAR_AVAIL);
 		DBGC(sc, UART, ("%c", c & 0xff));
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Start the firmware. This is done by issuing a 'go' command with
  * the hex entry address of the firmware. Then we wait for the self-test to
  * succeed.
  *
  * Only the low 16 bits of start are sent, as four hex digits. Returns 0
  * once the boot status register reads CP_RUNNING and EIO if it reads
  * SELF_TEST_FAIL or does not change to CP_RUNNING within 100 polls that
  * are DELAY(1000) apart.
  */
 static int
 fatm_start_firmware(struct fatm_softc *sc, uint32_t start)
 {
 	static char hex[] = "0123456789abcdef";
 	u_int w, val;
 
 	DBG(sc, INIT, ("starting"));
 	/* discard pending output, send an empty line, discard the response */
 	rx_flush(sc);
 	tx_putc(sc, '\r');
 	DELAY(1000);
 
 	rx_flush(sc);
 
 	/* each character sent is followed by reading one character back */
 	tx_putc(sc, 'g');
 	(void)rx_getc(sc);
 	tx_putc(sc, 'o');
 	(void)rx_getc(sc);
 	tx_putc(sc, ' ');
 	(void)rx_getc(sc);
 
 	/* entry address, most significant nibble first */
 	tx_putc(sc, hex[(start >> 12) & 0xf]);
 	(void)rx_getc(sc);
 	tx_putc(sc, hex[(start >>  8) & 0xf]);
 	(void)rx_getc(sc);
 	tx_putc(sc, hex[(start >>  4) & 0xf]);
 	(void)rx_getc(sc);
 	tx_putc(sc, hex[(start >>  0) & 0xf]);
 	(void)rx_getc(sc);
 
 	tx_putc(sc, '\r');
 	rx_flush(sc);
 
 	/* wait for the firmware to report that it is running */
 	for (w = 100; w; w--) {
 		BARRIER_R(sc);
 		val = READ4(sc, FATMO_BOOT_STATUS);
 		switch (val) {
 		  case CP_RUNNING:
 			return (0);
 		  case SELF_TEST_FAIL:
 			return (EIO);
 		}
 		DELAY(1000);
 	}
 	return (EIO);
 }
 
 /*
  * Initialize one card and host queue.
  *
  * sc		softc of the card
  * queue	host queue; its chunk holds qlen elements of qel_size bytes
  *		that each start with a struct fqelem
  * qlen	number of queue elements
  * qel_size	size of one host queue element
  * desc_size	size of one descriptor (I/O block)
  * off		card offset of the first card queue entry; entries are
  *		taken to be 8 bytes apart
  * statpp	in/out: host address of the next unused status word
  * cardstat	in/out: card-visible address of the next unused status word
  * descp	host address of the first descriptor
  * carddesc	card-visible address of the first descriptor
  *
  * Every element gets the next status word, which is set to FATM_STAT_FREE
  * and whose card-visible address is written into the card entry, and the
  * next descriptor. Head and tail of the queue are reset to 0.
  */
 static void
 init_card_queue(struct fatm_softc *sc, struct fqueue *queue, int qlen,
     size_t qel_size, size_t desc_size, cardoff_t off,
     u_char **statpp, uint32_t *cardstat, u_char *descp, uint32_t carddesc)
 {
 	struct fqelem *el = queue->chunk;
 
 	while (qlen--) {
 		el->card = off;
 		off += 8;	/* size of card entry */
 
 		/* hand out the next status word to this element */
 		el->statp = (uint32_t *)(*statpp);
 		(*statpp) += sizeof(uint32_t);
 		H_SETSTAT(el->statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, el->statp);
 
 		WRITE4(sc, el->card + FATMOS_STATP, (*cardstat));
 		(*cardstat) += sizeof(uint32_t);
 
 		/* hand out the next descriptor to this element */
 		el->ioblk = descp;
 		descp += desc_size;
 		el->card_ioblk = carddesc;
 		carddesc += desc_size;
 
 		/* elements are qel_size bytes apart, not sizeof(*el) */
 		el = (struct fqelem *)((u_char *)el + qel_size);
 	}
 	queue->tail = queue->head = 0;
 }
 
 /*
  * Issue the initialize operation to the card, wait for completion and
  * initialize the on-board and host queue structures with offsets and
  * addresses.
  */
 static int
 fatm_init_cmd(struct fatm_softc *sc)
 {
 	int w, c;
 	u_char *statp;
 	uint32_t card_stat;
 	u_int cnt;
 	struct fqelem *el;
 	cardoff_t off;
 
 	DBG(sc, INIT, ("command"));
 	WRITE4(sc, FATMO_ISTAT, 0);
 	WRITE4(sc, FATMO_IMASK, 1);
 	WRITE4(sc, FATMO_HLOGGER, 0);
 
 	WRITE4(sc, FATMO_INIT + FATMOI_RECEIVE_TRESHOLD, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_NUM_CONNECT, FORE_MAX_VCC);
 	WRITE4(sc, FATMO_INIT + FATMOI_CQUEUE_LEN, FATM_CMD_QLEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_TQUEUE_LEN, FATM_TX_QLEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_RQUEUE_LEN, FATM_RX_QLEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_RPD_EXTENSION, RPD_EXTENSIONS);
 	WRITE4(sc, FATMO_INIT + FATMOI_TPD_EXTENSION, TPD_EXTENSIONS);
 
 	/*
 	 * initialize buffer descriptors
 	 */
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B1 + FATMOB_QUEUE_LENGTH,
 	    SMALL_SUPPLY_QLEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B1 + FATMOB_BUFFER_SIZE,
 	    SMALL_BUFFER_LEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B1 + FATMOB_POOL_SIZE,
 	    SMALL_POOL_SIZE);
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B1 + FATMOB_SUPPLY_BLKSIZE,
 	    SMALL_SUPPLY_BLKSIZE);
 
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B1 + FATMOB_QUEUE_LENGTH,
 	    LARGE_SUPPLY_QLEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B1 + FATMOB_BUFFER_SIZE,
 	    LARGE_BUFFER_LEN);
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B1 + FATMOB_POOL_SIZE,
 	    LARGE_POOL_SIZE);
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B1 + FATMOB_SUPPLY_BLKSIZE,
 	    LARGE_SUPPLY_BLKSIZE);
 
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B2 + FATMOB_QUEUE_LENGTH, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B2 + FATMOB_BUFFER_SIZE, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B2 + FATMOB_POOL_SIZE, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_SMALL_B2 + FATMOB_SUPPLY_BLKSIZE, 0);
 
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B2 + FATMOB_QUEUE_LENGTH, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B2 + FATMOB_BUFFER_SIZE, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B2 + FATMOB_POOL_SIZE, 0);
 	WRITE4(sc, FATMO_INIT + FATMOI_LARGE_B2 + FATMOB_SUPPLY_BLKSIZE, 0);
 
 	/*
 	 * Start the command
 	 */
 	BARRIER_W(sc);
 	WRITE4(sc, FATMO_INIT + FATMOI_STATUS, FATM_STAT_PENDING);
 	BARRIER_W(sc);
 	WRITE4(sc, FATMO_INIT + FATMOI_OP, FATM_OP_INITIALIZE);
 	BARRIER_W(sc);
 
 	/*
 	 * Busy wait for completion
 	 */
 	w = 100;
 	while (w--) {
 		c = READ4(sc, FATMO_INIT + FATMOI_STATUS);
 		BARRIER_R(sc);
 		if (c & FATM_STAT_COMPLETE)
 			break;
 		DELAY(1000);
 	}
 
 	if (c & FATM_STAT_ERROR)
 		return (EIO);
 
 	/*
 	 * Initialize the queues
 	 */
 	statp = sc->stat_mem.mem;
 	card_stat = sc->stat_mem.paddr;
 
 	/*
 	 * Command queue. This is special in that it's on the card.
 	 */
 	el = sc->cmdqueue.chunk;
 	off = READ4(sc, FATMO_COMMAND_QUEUE);
 	DBG(sc, INIT, ("cmd queue=%x", off));
 	for (cnt = 0; cnt < FATM_CMD_QLEN; cnt++) {
 		el = &((struct cmdqueue *)sc->cmdqueue.chunk + cnt)->q;
 
 		el->card = off;
 		off += 32;		/* size of card structure */
 
 		el->statp = (uint32_t *)statp;
 		statp += sizeof(uint32_t);
 		H_SETSTAT(el->statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, el->statp);
 
 		WRITE4(sc, el->card + FATMOC_STATP, card_stat);
 		card_stat += sizeof(uint32_t);
 	}
 	sc->cmdqueue.tail = sc->cmdqueue.head = 0;
 
 	/*
 	 * Now the other queues. These are in memory
 	 */
 	init_card_queue(sc, &sc->txqueue, FATM_TX_QLEN,
 	    sizeof(struct txqueue), TPD_SIZE,
 	    READ4(sc, FATMO_TRANSMIT_QUEUE),
 	    &statp, &card_stat, sc->txq_mem.mem, sc->txq_mem.paddr);
 
 	init_card_queue(sc, &sc->rxqueue, FATM_RX_QLEN,
 	    sizeof(struct rxqueue), RPD_SIZE,
 	    READ4(sc, FATMO_RECEIVE_QUEUE),
 	    &statp, &card_stat, sc->rxq_mem.mem, sc->rxq_mem.paddr);
 
 	init_card_queue(sc, &sc->s1queue, SMALL_SUPPLY_QLEN,
 	    sizeof(struct supqueue), BSUP_BLK2SIZE(SMALL_SUPPLY_BLKSIZE),
 	    READ4(sc, FATMO_SMALL_B1_QUEUE),
 	    &statp, &card_stat, sc->s1q_mem.mem, sc->s1q_mem.paddr);
 
 	init_card_queue(sc, &sc->l1queue, LARGE_SUPPLY_QLEN,
 	    sizeof(struct supqueue), BSUP_BLK2SIZE(LARGE_SUPPLY_BLKSIZE),
 	    READ4(sc, FATMO_LARGE_B1_QUEUE),
 	    &statp, &card_stat, sc->l1q_mem.mem, sc->l1q_mem.paddr);
 
 	sc->txcnt = 0;
 
 	return (0);
 }
 
 /*
  * Read PROM. Called only from attach code. Here we spin because the interrupt
  * handler is not yet set up.
  *
  * On success the ESI, serial number and hardware version from the PROM and
  * the firmware release register are stored in the MIB of the interface and
  * printed, and 0 is returned. Returns EIO if the card does not answer
  * within 1000 polls that are DELAY(1000) apart or reports an error.
  *
  * NOTE(review): on both EIO paths the command queue entry is left marked
  * pending and the queue tail is not advanced - confirm that the callers
  * always reset the card afterwards.
  */
 static int
 fatm_getprom(struct fatm_softc *sc)
 {
 	int i;
 	struct prom *prom;
 	struct cmdqueue *q;
 
 	DBG(sc, INIT, ("reading prom"));
 	q = GET_QUEUE(sc->cmdqueue, struct cmdqueue, sc->cmdqueue.head);
 	NEXT_QUEUE_ENTRY(sc->cmdqueue.head, FATM_CMD_QLEN);
 
 	/* no completion callback: we poll the status word ourselves */
 	q->error = 0;
 	q->cb = NULL;
 	H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 	H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 
 	bus_dmamap_sync(sc->prom_mem.dmat, sc->prom_mem.map,
 	    BUS_DMASYNC_PREREAD);
 
 	WRITE4(sc, q->q.card + FATMOC_GPROM_BUF, sc->prom_mem.paddr);
 	BARRIER_W(sc);
 	WRITE4(sc, q->q.card + FATMOC_OP, FATM_OP_GET_PROM_DATA);
 	BARRIER_W(sc);
 
 	for (i = 0; i < 1000; i++) {
 		H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 		if (H_GETSTAT(q->q.statp) &
 		    (FATM_STAT_COMPLETE | FATM_STAT_ERROR))
 			break;
 		DELAY(1000);
 	}
 	if (i == 1000) {
 		if_printf(sc->ifp, "getprom timeout\n");
 		return (EIO);
 	}
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if (H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) {
 		if_printf(sc->ifp, "getprom error\n");
 		return (EIO);
 	}
 	/* give the queue entry back */
 	H_SETSTAT(q->q.statp, FATM_STAT_FREE);
 	H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 	NEXT_QUEUE_ENTRY(sc->cmdqueue.tail, FATM_CMD_QLEN);
 
 	bus_dmamap_sync(sc->prom_mem.dmat, sc->prom_mem.map,
 	    BUS_DMASYNC_POSTREAD);
 
 
 #ifdef notdef
 	{
 		u_int i;
 
 		printf("PROM: ");
 		u_char *ptr = (u_char *)sc->prom_mem.mem;
 		for (i = 0; i < sizeof(struct prom); i++)
 			printf("%02x ", *ptr++);
 		printf("\n");
 	}
 #endif
 
 	prom = (struct prom *)sc->prom_mem.mem;
 
 	/* the ESI is taken from the 6 bytes that follow the first 2 of mac */
 	bcopy(prom->mac + 2, IFP2IFATM(sc->ifp)->mib.esi, 6);
 	IFP2IFATM(sc->ifp)->mib.serial = le32toh(prom->serial);
 	IFP2IFATM(sc->ifp)->mib.hw_version = le32toh(prom->version);
 	IFP2IFATM(sc->ifp)->mib.sw_version = READ4(sc, FATMO_FIRMWARE_RELEASE);
 
 	if_printf(sc->ifp, "ESI=%02x:%02x:%02x:%02x:%02x:%02x "
 	    "serial=%u hw=0x%x sw=0x%x\n", IFP2IFATM(sc->ifp)->mib.esi[0],
 	    IFP2IFATM(sc->ifp)->mib.esi[1], IFP2IFATM(sc->ifp)->mib.esi[2], IFP2IFATM(sc->ifp)->mib.esi[3],
 	    IFP2IFATM(sc->ifp)->mib.esi[4], IFP2IFATM(sc->ifp)->mib.esi[5], IFP2IFATM(sc->ifp)->mib.serial,
 	    IFP2IFATM(sc->ifp)->mib.hw_version, IFP2IFATM(sc->ifp)->mib.sw_version);
 
 	return (0);
 }
 
 /*
  * Callback for bus_dmamap_load. arg points to a bus_addr_t that receives
  * the address of the first segment. We assume a 32-bit bus and therefore
  * always exactly one segment. If the load failed, the error is printed and
  * the bus_addr_t is left untouched.
  */
 static void
 dmaload_helper(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 {
 	bus_addr_t *paddrp;
 
 	if (error == 0) {
 		KASSERT(nsegs == 1, ("too many DMA segments"));
 		KASSERT(segs[0].ds_addr <= 0xffffffff,
 		    ("DMA address too large %lx", (u_long)segs[0].ds_addr));
 
 		paddrp = (bus_addr_t *)arg;
 		*paddrp = segs[0].ds_addr;
 		return;
 	}
 	printf("%s: error=%d\n", __func__, error);
 }
 
 /*
  * Allocate a chunk of DMA-able memory and map it.
  *
  * The caller fills in mem->size and mem->align. On success 0 is returned
  * and mem->dmat, mem->map, mem->mem and mem->paddr are valid. On failure
  * everything allocated so far is released again, mem->mem is NULL and
  * ENOMEM or the error of the failing bus_dma call is returned. nm is used
  * in messages only.
  */
 static int
 alloc_dma_memory(struct fatm_softc *sc, const char *nm, struct fatm_mem *mem)
 {
 	int error;
 
 	mem->mem = NULL;
 
 	/* one segment of mem->size bytes below 4GB */
 	if (bus_dma_tag_create(sc->parent_dmat, mem->align, 0,
 	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
 	    NULL, NULL, mem->size, 1, BUS_SPACE_MAXSIZE_32BIT,
 	    BUS_DMA_ALLOCNOW, NULL, NULL, &mem->dmat)) {
 		if_printf(sc->ifp, "could not allocate %s DMA tag\n",
 		    nm);
 		return (ENOMEM);
 	}
 
 	error = bus_dmamem_alloc(mem->dmat, &mem->mem, 0, &mem->map);
 	if (error) {
 		if_printf(sc->ifp, "could not allocate %s DMA memory: "
 		    "%d\n", nm, error);
 		bus_dma_tag_destroy(mem->dmat);
 		mem->mem = NULL;
 		return (error);
 	}
 
 	/* dmaload_helper stores the bus address in mem->paddr */
 	error = bus_dmamap_load(mem->dmat, mem->map, mem->mem, mem->size,
 	    dmaload_helper, &mem->paddr, BUS_DMA_NOWAIT);
 	if (error) {
 		if_printf(sc->ifp, "could not load %s DMA memory: "
 		    "%d\n", nm, error);
 		bus_dmamem_free(mem->dmat, mem->mem, mem->map);
 		bus_dma_tag_destroy(mem->dmat);
 		mem->mem = NULL;
 		return (error);
 	}
 
 	DBG(sc, DMA, ("DMA %s V/P/S/Z %p/%lx/%x/%x", nm, mem->mem,
 	    (u_long)mem->paddr, mem->size, mem->align));
 
 	return (0);
 }
 
 #ifdef TEST_DMA_SYNC
 /*
  * Variant of alloc_dma_memory() that is compiled only with TEST_DMA_SYNC.
  * The memory comes from contigmalloc() instead of bus_dmamem_alloc() and
  * the DMA tag is created with a 24-bit low address limit and no parent
  * tag. The interface is the same as that of alloc_dma_memory().
  *
  * NOTE(review): the result of contigmalloc() is used without a NULL check
  * - confirm that it cannot fail when called with M_WAITOK.
  */
 static int
 alloc_dma_memoryX(struct fatm_softc *sc, const char *nm, struct fatm_mem *mem)
 {
 	int error;
 
 	mem->mem = NULL;
 
 	if (bus_dma_tag_create(NULL, mem->align, 0,
 	    BUS_SPACE_MAXADDR_24BIT, BUS_SPACE_MAXADDR,
 	    NULL, NULL, mem->size, 1, mem->size,
 	    BUS_DMA_ALLOCNOW, NULL, NULL, &mem->dmat)) {
 		if_printf(sc->ifp, "could not allocate %s DMA tag\n",
 		    nm);
 		return (ENOMEM);
 	}
 
 	mem->mem = contigmalloc(mem->size, M_DEVBUF, M_WAITOK,
 	    BUS_SPACE_MAXADDR_24BIT, BUS_SPACE_MAXADDR_32BIT, mem->align, 0);
 
 	error = bus_dmamap_create(mem->dmat, 0, &mem->map);
 	if (error) {
 		if_printf(sc->ifp, "could not allocate %s DMA map: "
 		    "%d\n", nm, error);
 		contigfree(mem->mem, mem->size, M_DEVBUF);
 		bus_dma_tag_destroy(mem->dmat);
 		mem->mem = NULL;
 		return (error);
 	}
 
 	/* dmaload_helper stores the bus address in mem->paddr */
 	error = bus_dmamap_load(mem->dmat, mem->map, mem->mem, mem->size,
 	    dmaload_helper, &mem->paddr, BUS_DMA_NOWAIT);
 	if (error) {
 		if_printf(sc->ifp, "could not load %s DMA memory: "
 		    "%d\n", nm, error);
 		bus_dmamap_destroy(mem->dmat, mem->map);
 		contigfree(mem->mem, mem->size, M_DEVBUF);
 		bus_dma_tag_destroy(mem->dmat);
 		mem->mem = NULL;
 		return (error);
 	}
 
 	DBG(sc, DMA, ("DMAX %s V/P/S/Z %p/%lx/%x/%x", nm, mem->mem,
 	    (u_long)mem->paddr, mem->size, mem->align));
 
 	printf("DMAX: %s V/P/S/Z %p/%lx/%x/%x", nm, mem->mem,
 	    (u_long)mem->paddr, mem->size, mem->align);
 
 	return (0);
 }
 #endif /* TEST_DMA_SYNC */
 
 /*
  * Release all resources of a DMA-able memory chunk: unload the map, free
  * the memory and destroy the tag. A chunk whose mem pointer is NULL is
  * left alone, so calling this twice is harmless.
  */
 static void
 destroy_dma_memory(struct fatm_mem *mem)
 {
 	if (mem->mem == NULL)
 		return;
 
 	bus_dmamap_unload(mem->dmat, mem->map);
 	bus_dmamem_free(mem->dmat, mem->mem, mem->map);
 	bus_dma_tag_destroy(mem->dmat);
 	mem->mem = NULL;
 }
 #ifdef TEST_DMA_SYNC
 /*
  * Counterpart of alloc_dma_memoryX(): unload and destroy the map, return
  * the memory with contigfree() and destroy the tag. A chunk whose mem
  * pointer is NULL is left alone.
  */
 static void
 destroy_dma_memoryX(struct fatm_mem *mem)
 {
 	if (mem->mem == NULL)
 		return;
 
 	bus_dmamap_unload(mem->dmat, mem->map);
 	bus_dmamap_destroy(mem->dmat, mem->map);
 	contigfree(mem->mem, mem->size, M_DEVBUF);
 	bus_dma_tag_destroy(mem->dmat);
 	mem->mem = NULL;
 }
 #endif /* TEST_DMA_SYNC */
 
 /*
  * Try to supply buffers to the card if there are free entries in the queues
  *
  * The number of small buffers aimed for is 4 per open VCC, but at least 32
  * and at most SMALL_POOL_SIZE. Buffers are handed to the card in blocks of
  * SMALL_SUPPLY_BLKSIZE. A block that cannot be filled completely is taken
  * apart again and supplying stops.
  */
 static void
 fatm_supply_small_buffers(struct fatm_softc *sc)
 {
 	int nblocks, nbufs;
 	struct supqueue *q;
 	struct rbd *bd;
 	int i, j, error, cnt;
 	struct mbuf *m;
 	struct rbuf *rb;
 	bus_addr_t phys;
 
 	nbufs = max(4 * sc->open_vccs, 32);
 	nbufs = min(nbufs, SMALL_POOL_SIZE);
 	nbufs -= sc->small_cnt;
 
 	nblocks = (nbufs + SMALL_SUPPLY_BLKSIZE - 1) / SMALL_SUPPLY_BLKSIZE;
 	for (cnt = 0; cnt < nblocks; cnt++) {
 		q = GET_QUEUE(sc->s1queue, struct supqueue, sc->s1queue.head);
 
 		H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 		if (H_GETSTAT(q->q.statp) != FATM_STAT_FREE)
 			break;
 
 		bd = (struct rbd *)q->q.ioblk;
 
 		for (i = 0; i < SMALL_SUPPLY_BLKSIZE; i++) {
 			/*
 			 * rb is only looked at here. It stays on the free
 			 * list until the buffer is mapped, so the failure
 			 * paths below must not insert it again: inserting
 			 * the current head a second time links it to itself.
 			 */
 			if ((rb = LIST_FIRST(&sc->rbuf_free)) == NULL) {
 				if_printf(sc->ifp, "out of rbufs\n");
 				break;
 			}
 			MGETHDR(m, M_NOWAIT, MT_DATA);
 			if (m == NULL)
 				break;
-			MH_ALIGN(m, SMALL_BUFFER_LEN);
+			M_ALIGN(m, SMALL_BUFFER_LEN);
 			error = bus_dmamap_load(sc->rbuf_tag, rb->map,
 			    m->m_data, SMALL_BUFFER_LEN, dmaload_helper,
 			    &phys, BUS_DMA_NOWAIT);
 			if (error) {
 				if_printf(sc->ifp,
 				    "dmamap_load mbuf failed %d", error);
 				m_freem(m);
 				break;
 			}
 			bus_dmamap_sync(sc->rbuf_tag, rb->map,
 			    BUS_DMASYNC_PREREAD);
 
 			LIST_REMOVE(rb, link);
 			LIST_INSERT_HEAD(&sc->rbuf_used, rb, link);
 
 			rb->m = m;
 			bd[i].handle = rb - sc->rbufs;
 			H_SETDESC(bd[i].buffer, phys);
 		}
 
 		if (i < SMALL_SUPPLY_BLKSIZE) {
 			/* incomplete block: undo the buffers set up so far */
 			for (j = 0; j < i; j++) {
 				rb = sc->rbufs + bd[j].handle;
 				bus_dmamap_unload(sc->rbuf_tag, rb->map);
 				m_free(rb->m);
 				rb->m = NULL;
 
 				LIST_REMOVE(rb, link);
 				LIST_INSERT_HEAD(&sc->rbuf_free, rb, link);
 			}
 			break;
 		}
 		H_SYNCQ_PREWRITE(&sc->s1q_mem, bd,
 		    sizeof(struct rbd) * SMALL_SUPPLY_BLKSIZE);
 
 		H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 		H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 
 		WRITE4(sc, q->q.card, q->q.card_ioblk);
 		BARRIER_W(sc);
 
 		sc->small_cnt += SMALL_SUPPLY_BLKSIZE;
 
 		NEXT_QUEUE_ENTRY(sc->s1queue.head, SMALL_SUPPLY_QLEN);
 	}
 }
 
 /*
  * Try to supply buffers to the card if there are free entries in the queues
  * We assume that all buffers are within the address space accessible by the
  * card (32-bit), so we don't need bounce buffers.
  *
  * The number of large buffers aimed for is 4 per open VCC, but at least 32
  * and at most LARGE_POOL_SIZE. Buffers are handed to the card in blocks of
  * LARGE_SUPPLY_BLKSIZE. A block that cannot be filled completely is taken
  * apart again and supplying stops.
  */
 static void
 fatm_supply_large_buffers(struct fatm_softc *sc)
 {
 	int nbufs, nblocks, cnt;
 	struct supqueue *q;
 	struct rbd *bd;
 	int i, j, error;
 	struct mbuf *m;
 	struct rbuf *rb;
 	bus_addr_t phys;
 
 	nbufs = max(4 * sc->open_vccs, 32);
 	nbufs = min(nbufs, LARGE_POOL_SIZE);
 	nbufs -= sc->large_cnt;
 
 	nblocks = (nbufs + LARGE_SUPPLY_BLKSIZE - 1) / LARGE_SUPPLY_BLKSIZE;
 
 	for (cnt = 0; cnt < nblocks; cnt++) {
 		q = GET_QUEUE(sc->l1queue, struct supqueue, sc->l1queue.head);
 
 		H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 		if (H_GETSTAT(q->q.statp) != FATM_STAT_FREE)
 			break;
 
 		bd = (struct rbd *)q->q.ioblk;
 
 		for (i = 0; i < LARGE_SUPPLY_BLKSIZE; i++) {
 			/*
 			 * rb is only looked at here. It stays on the free
 			 * list until the buffer is mapped, so the failure
 			 * paths below must not insert it again: inserting
 			 * the current head a second time links it to itself.
 			 */
 			if ((rb = LIST_FIRST(&sc->rbuf_free)) == NULL) {
 				if_printf(sc->ifp, "out of rbufs\n");
 				break;
 			}
 			if ((m = m_getcl(M_NOWAIT, MT_DATA,
 			    M_PKTHDR)) == NULL)
 				break;
 			/* No MEXT_ALIGN */
 			m->m_data += MCLBYTES - LARGE_BUFFER_LEN;
 			error = bus_dmamap_load(sc->rbuf_tag, rb->map,
 			    m->m_data, LARGE_BUFFER_LEN, dmaload_helper,
 			    &phys, BUS_DMA_NOWAIT);
 			if (error) {
 				if_printf(sc->ifp,
 				    "dmamap_load mbuf failed %d", error);
 				m_freem(m);
 				break;
 			}
 
 			bus_dmamap_sync(sc->rbuf_tag, rb->map,
 			    BUS_DMASYNC_PREREAD);
 
 			LIST_REMOVE(rb, link);
 			LIST_INSERT_HEAD(&sc->rbuf_used, rb, link);
 
 			rb->m = m;
 			bd[i].handle = rb - sc->rbufs;
 			H_SETDESC(bd[i].buffer, phys);
 		}
 
 		if (i < LARGE_SUPPLY_BLKSIZE) {
 			/* incomplete block: undo the buffers set up so far */
 			for (j = 0; j < i; j++) {
 				rb = sc->rbufs + bd[j].handle;
 				bus_dmamap_unload(sc->rbuf_tag, rb->map);
 				m_free(rb->m);
 				rb->m = NULL;
 
 				LIST_REMOVE(rb, link);
 				LIST_INSERT_HEAD(&sc->rbuf_free, rb, link);
 			}
 			break;
 		}
 		H_SYNCQ_PREWRITE(&sc->l1q_mem, bd,
 		    sizeof(struct rbd) * LARGE_SUPPLY_BLKSIZE);
 
 		H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 		H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 		WRITE4(sc, q->q.card, q->q.card_ioblk);
 		BARRIER_W(sc);
 
 		sc->large_cnt += LARGE_SUPPLY_BLKSIZE;
 
 		NEXT_QUEUE_ENTRY(sc->l1queue.head, LARGE_SUPPLY_QLEN);
 	}
 }
 
 
 /*
  * Actually start the card. The lock must be held here.
  * Reset, load the firmware, start it, initializes queues, read the PROM
  * and supply receive buffers to the card.
  */
 static void
 fatm_init_locked(struct fatm_softc *sc)
 {
 	struct rxqueue *q;
 	int i, c, error;
 	uint32_t start;
 
 	DBG(sc, INIT, ("initialize"));
 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
 		fatm_stop(sc);
 
 	/*
 	 * Hard reset the board
 	 */
 	if (fatm_reset(sc))
 		return;
 
 	start = firmware_load(sc);
 	if (fatm_start_firmware(sc, start) || fatm_init_cmd(sc) ||
 	    fatm_getprom(sc)) {
 		fatm_reset(sc);
 		return;
 	}
 
 	/*
 	 * Handle media
 	 */
 	c = READ4(sc, FATMO_MEDIA_TYPE);
 	switch (c) {
 
 	  case FORE_MT_TAXI_100:
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_TAXI_100;
 		IFP2IFATM(sc->ifp)->mib.pcr = 227273;
 		break;
 
 	  case FORE_MT_TAXI_140:
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_TAXI_140;
 		IFP2IFATM(sc->ifp)->mib.pcr = 318181;
 		break;
 
 	  case FORE_MT_UTP_SONET:
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_UTP_155;
 		IFP2IFATM(sc->ifp)->mib.pcr = 353207;
 		break;
 
 	  case FORE_MT_MM_OC3_ST:
 	  case FORE_MT_MM_OC3_SC:
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_MM_155;
 		IFP2IFATM(sc->ifp)->mib.pcr = 353207;
 		break;
 
 	  case FORE_MT_SM_OC3_ST:
 	  case FORE_MT_SM_OC3_SC:
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_SM_155;
 		IFP2IFATM(sc->ifp)->mib.pcr = 353207;
 		break;
 
 	  default:
 		log(LOG_ERR, "fatm: unknown media type %d\n", c);
 		IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_UNKNOWN;
 		IFP2IFATM(sc->ifp)->mib.pcr = 353207;
 		break;
 	}
 	sc->ifp->if_baudrate = 53 * 8 * IFP2IFATM(sc->ifp)->mib.pcr;
 	utopia_init_media(&sc->utopia);
 
 	/*
 	 * Initialize the RBDs
 	 */
 	for (i = 0; i < FATM_RX_QLEN; i++) {
 		q = GET_QUEUE(sc->rxqueue, struct rxqueue, i);
 		WRITE4(sc, q->q.card + 0, q->q.card_ioblk);
 	}
 	BARRIER_W(sc);
 
 	/*
 	 * Supply buffers to the card
 	 */
 	fatm_supply_small_buffers(sc);
 	fatm_supply_large_buffers(sc);
 
 	/*
 	 * Now set flags, that we are ready
 	 */
 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/*
 	 * Start the watchdog timer
 	 */
 	callout_reset(&sc->watchdog_timer, hz * 5, fatm_watchdog, sc);
 
 	/* start SUNI */
 	utopia_start(&sc->utopia);
 
 	ATMEV_SEND_IFSTATE_CHANGED(IFP2IFATM(sc->ifp),
 	    sc->utopia.carrier == UTP_CARR_OK);
 
 	/* start all channels */
 	for (i = 0; i < FORE_MAX_VCC + 1; i++)
 		if (sc->vccs[i] != NULL) {
 			sc->vccs[i]->vflags |= FATM_VCC_REOPEN;
 			error = fatm_load_vc(sc, sc->vccs[i]);
 			if (error != 0) {
 				if_printf(sc->ifp, "reopening %u "
 				    "failed: %d\n", i, error);
 				sc->vccs[i]->vflags &= ~FATM_VCC_REOPEN;
 			}
 		}
 
 	DBG(sc, INIT, ("done"));
 }
 
 /*
  * Exported initialisation function. p is the softc; the softc lock is
  * taken around the real work done by fatm_init_locked().
  */
 static void
 fatm_init(void *p)
 {
 	struct fatm_softc *softc;
 
 	softc = p;
 
 	FATM_LOCK(softc);
 	fatm_init_locked(softc);
 	FATM_UNLOCK(softc);
 }
 
 /************************************************************/
 /*
  * The INTERRUPT handling
  */
 /*
  * Walk the command queue starting at its tail. For every entry that the
  * card has finished (with or without error) call the entry's completion
  * callback, mark the entry free again and advance the tail. Stop at the
  * first entry that is not finished.
  */
 static void
 fatm_intr_drain_cmd(struct fatm_softc *sc)
 {
 	struct cmdqueue *cq;
 	int status, finished;
 
 	for (;;) {
 		cq = GET_QUEUE(sc->cmdqueue, struct cmdqueue,
 		    sc->cmdqueue.tail);
 
 		H_SYNCSTAT_POSTREAD(sc, cq->q.statp);
 		status = H_GETSTAT(cq->q.statp);
 
 		finished = (status == FATM_STAT_COMPLETE ||
 		    status == (FATM_STAT_COMPLETE | FATM_STAT_ERROR) ||
 		    status == FATM_STAT_ERROR);
 		if (!finished)
 			return;
 
 		/* let the originator of the command know */
 		(*cq->cb)(sc, cq);
 
 		/* hand the entry back to the free pool */
 		H_SETSTAT(cq->q.statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, cq->q.statp);
 
 		NEXT_QUEUE_ENTRY(sc->cmdqueue.tail, FATM_CMD_QLEN);
 	}
 }
 
 /*
  * Reclaim completed entries of the small buffer supply queue. Entries
  * that completed with the error bit set are logged. Stops at the first
  * entry that has not completed.
  */
 static void
 fatm_intr_drain_small_buffers(struct fatm_softc *sc)
 {
 	struct supqueue *sq;
 	int status;
 
 	for (;;) {
 		sq = GET_QUEUE(sc->s1queue, struct supqueue,
 		    sc->s1queue.tail);
 
 		H_SYNCSTAT_POSTREAD(sc, sq->q.statp);
 		status = H_GETSTAT(sq->q.statp);
 
 		if (!(status & FATM_STAT_COMPLETE))
 			return;
 		if ((status & FATM_STAT_ERROR) != 0)
 			log(LOG_ERR, "%s: status %x\n", __func__, status);
 
 		/* entry can be used for the next supply */
 		H_SETSTAT(sq->q.statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, sq->q.statp);
 
 		NEXT_QUEUE_ENTRY(sc->s1queue.tail, SMALL_SUPPLY_QLEN);
 	}
 }
 
 /*
  * Reclaim completed entries of the large buffer supply queue. Entries
  * that completed with the error bit set are logged. Stops at the first
  * entry that has not completed.
  */
 static void
 fatm_intr_drain_large_buffers(struct fatm_softc *sc)
 {
 	struct supqueue *sq;
 	int status;
 
 	for (;;) {
 		sq = GET_QUEUE(sc->l1queue, struct supqueue,
 		    sc->l1queue.tail);
 
 		H_SYNCSTAT_POSTREAD(sc, sq->q.statp);
 		status = H_GETSTAT(sq->q.statp);
 
 		if (!(status & FATM_STAT_COMPLETE))
 			return;
 		if ((status & FATM_STAT_ERROR) != 0)
 			log(LOG_ERR, "%s status %x\n", __func__, status);
 
 		/* entry can be used for the next supply */
 		H_SETSTAT(sq->q.statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, sq->q.statp);
 
 		NEXT_QUEUE_ENTRY(sc->l1queue.tail, LARGE_SUPPLY_QLEN);
 	}
 }
 
 /*
  * Check the receive queue. Send any received PDU up the protocol stack
  * (except when there was an error or the VCI appears to be closed. In this
  * case discard the PDU).
  *
  * For every completed receive descriptor the mbufs of its segments are
  * linked into one chain, their DMA maps are unloaded and the receive
  * buffer slots go back to the free list. The queue entry is then marked
  * free and handed back to the card. A descriptor that reports no segments
  * at all yields no mbuf chain; it is only recycled.
  */
 static void
 fatm_intr_drain_rx(struct fatm_softc *sc)
 {
 	struct rxqueue *q;
 	int stat, mlen;
 	u_int i;
 	uint32_t h;
 	struct mbuf *last, *m0;
 	struct rpd *rpd;
 	struct rbuf *rb;
 	u_int vci, vpi, pt;
 	struct atm_pseudohdr aph;
 	struct ifnet *ifp;
 	struct card_vcc *vc;
 
 	for (;;) {
 		q = GET_QUEUE(sc->rxqueue, struct rxqueue, sc->rxqueue.tail);
 
 		H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 		stat = H_GETSTAT(q->q.statp);
 
 		if ((stat & FATM_STAT_COMPLETE) == 0)
 			break;
 
 		rpd = (struct rpd *)q->q.ioblk;
 		H_SYNCQ_POSTREAD(&sc->rxq_mem, rpd, RPD_SIZE);
 
 		rpd->nseg = le32toh(rpd->nseg);
 		mlen = 0;
 		m0 = last = NULL;
 		for (i = 0; i < rpd->nseg; i++) {
 			/* the handle is the index of the rbuf we supplied */
 			rb = sc->rbufs + rpd->segment[i].handle;
 			if (m0 == NULL) {
 				m0 = last = rb->m;
 			} else {
 				last->m_next = rb->m;
 				last = rb->m;
 			}
 			last->m_next = NULL;
 			/* mbufs with external storage count as large */
 			if (last->m_flags & M_EXT)
 				sc->large_cnt--;
 			else
 				sc->small_cnt--;
 			bus_dmamap_sync(sc->rbuf_tag, rb->map,
 			    BUS_DMASYNC_POSTREAD);
 			bus_dmamap_unload(sc->rbuf_tag, rb->map);
 			rb->m = NULL;
 
 			LIST_REMOVE(rb, link);
 			LIST_INSERT_HEAD(&sc->rbuf_free, rb, link);
 
 			last->m_len = le32toh(rpd->segment[i].length);
 			mlen += last->m_len;
 		}
 
 		/*
 		 * A descriptor without any segment leaves m0 NULL. There is
 		 * nothing to deliver or free then; dereferencing m0 below
 		 * would crash, so just give the queue entry back.
 		 */
 		if (m0 == NULL)
 			goto recycle;
 
 		m0->m_pkthdr.len = mlen;
 		m0->m_pkthdr.rcvif = sc->ifp;
 
 		h = le32toh(rpd->atm_header);
 		vpi = (h >> 20) & 0xff;
 		vci = (h >> 4 ) & 0xffff;
 		pt  = (h >> 1 ) & 0x7;
 
 		/*
 		 * Locate the VCC this packet belongs to
 		 */
 		if (!VC_OK(sc, vpi, vci))
 			vc = NULL;
 		else if ((vc = sc->vccs[vci]) == NULL ||
 		    !(sc->vccs[vci]->vflags & FATM_VCC_OPEN)) {
 			sc->istats.rx_closed++;
 			vc = NULL;
 		}
 
 		DBG(sc, RCV, ("RCV: vc=%u.%u pt=%u mlen=%d %s", vpi, vci,
 		    pt, mlen, vc == NULL ? "dropped" : ""));
 
 		if (vc == NULL) {
 			m_freem(m0);
 		} else {
 #ifdef ENABLE_BPF
 			if (!(vc->param.flags & ATMIO_FLAG_NG) &&
 			    vc->param.aal == ATMIO_AAL_5 &&
 			    (vc->param.flags & ATM_PH_LLCSNAP))
 				BPF_MTAP(sc->ifp, m0);
 #endif
 
 			ATM_PH_FLAGS(&aph) = vc->param.flags;
 			ATM_PH_VPI(&aph) = vpi;
 			ATM_PH_SETVCI(&aph, vci);
 
 			ifp = sc->ifp;
 			if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 
 			vc->ipackets++;
 			vc->ibytes += m0->m_pkthdr.len;
 
 			atm_input(ifp, &aph, m0, vc->rxhand);
 		}
 
   recycle:
 		/* mark the entry free and give it back to the card */
 		H_SETSTAT(q->q.statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 
 		WRITE4(sc, q->q.card, q->q.card_ioblk);
 		BARRIER_W(sc);
 
 		NEXT_QUEUE_ENTRY(sc->rxqueue.tail, FATM_RX_QLEN);
 	}
 }
 
 /*
  * Reclaim finished entries of the transmit queue: mark the entry free,
  * unload its DMA map, free the mbuf chain that was transmitted and
  * decrement the count of outstanding transmissions. Stops at the first
  * entry the card has not finished.
  */
 static void
 fatm_intr_drain_tx(struct fatm_softc *sc)
 {
 	struct txqueue *tq;
 	int status, finished;
 
 	for (;;) {
 		tq = GET_QUEUE(sc->txqueue, struct txqueue,
 		    sc->txqueue.tail);
 
 		H_SYNCSTAT_POSTREAD(sc, tq->q.statp);
 		status = H_GETSTAT(tq->q.statp);
 
 		finished = (status == FATM_STAT_COMPLETE ||
 		    status == (FATM_STAT_COMPLETE | FATM_STAT_ERROR) ||
 		    status == FATM_STAT_ERROR);
 		if (!finished)
 			return;
 
 		H_SETSTAT(tq->q.statp, FATM_STAT_FREE);
 		H_SYNCSTAT_PREWRITE(sc, tq->q.statp);
 
 		/* the card is done with the data; release the mapping */
 		bus_dmamap_sync(sc->tx_tag, tq->map, BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(sc->tx_tag, tq->map);
 
 		m_freem(tq->m);
 		tq->m = NULL;
 		sc->txcnt--;
 
 		NEXT_QUEUE_ENTRY(sc->txqueue.tail, FATM_TX_QLEN);
 	}
 }
 
 /*
  * Interrupt handler. p is the softc.
  *
  * If the PSR register reads zero the handler returns without doing
  * anything. Otherwise the interrupt is cleared and, provided the interface
  * is running, all queues are drained and the receive buffer pools are
  * refilled under the softc lock. After the lock is dropped the transmit
  * start routine is called when retry_tx is set and the send queue is not
  * empty.
  */
 static void
 fatm_intr(void *p)
 {
 	struct fatm_softc *sc = p;
 	int serviced = 0;
 
 	FATM_LOCK(sc);
 	if (READ4(sc, FATMO_PSR) != 0) {
 		WRITE4(sc, FATMO_HCR, FATM_HCR_CLRIRQ);
 
 		if ((sc->ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 			fatm_intr_drain_cmd(sc);
 			fatm_intr_drain_rx(sc);
 			fatm_intr_drain_tx(sc);
 			fatm_intr_drain_small_buffers(sc);
 			fatm_intr_drain_large_buffers(sc);
 			fatm_supply_small_buffers(sc);
 			fatm_supply_large_buffers(sc);
 			serviced = 1;
 		}
 	}
 	FATM_UNLOCK(sc);
 
 	if (!serviced)
 		return;
 
 	/* kick the transmitter for packets that were put back earlier */
 	if (sc->retry_tx && _IF_QLEN(&sc->ifp->if_snd))
 		(*sc->ifp->if_start)(sc->ifp);
 }
 
 /*
  * Get device statistics. This must be called with the softc locked.
  * We use a preallocated buffer, so we need to protect this buffer.
  * We do this by using a condition variable and a flag. If the flag is set
  * the buffer is in use by one thread (one thread is executing a GETSTAT
  * card command). In this case all other threads that are trying to get
  * statistics block on that condition variable. When the thread finishes
  * using the buffer it resets the flag and signals the condition variable. This
  * will wakeup the next thread that is waiting for the buffer. If the interface
  * is stopped the stopping function will broadcast the cv. All threads will
  * find that the interface has been stopped and return.
  *
  * Acquiring the buffer is done by the fatm_getstat() function. The freeing
  * must be done by the caller when he has finished using the buffer.
  */
 static void
 fatm_getstat_complete(struct fatm_softc *sc, struct cmdqueue *q)
 {
 	int failed;
 
 	/* Completion callback of the GETSTAT command issued by fatm_getstat */
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	failed = (H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) != 0;
 	if (failed) {
 		sc->istats.get_stat_errors++;
 		q->error = EIO;
 	}
 	/* fatm_getstat() sleeps on the address of the statistics buffer */
 	wakeup(&sc->sadi_mem);
 }
 /*
  * Get the statistics buffer and ask the card to fill it. Must be called
  * with the softc lock held; the lock is released while sleeping.
  *
  * Returns 0 when the card delivered the statistics (the buffer has then
  * been converted to host byte order), EIO when the interface is not
  * running, the command queue is full, the command timed out or the card
  * reported an error, and EINTR when the sleep was interrupted.
  * FATM_STAT_INUSE is set here and not cleared on any return path after
  * that; releasing the buffer is left to the caller.
  */
 static int
 fatm_getstat(struct fatm_softc *sc)
 {
 	int error;
 	struct cmdqueue *q;
 
 	/*
 	 * Wait until either the interface is stopped or we can get the
 	 * statistics buffer
 	 */
 	for (;;) {
 		if (!(sc->ifp->if_drv_flags & IFF_DRV_RUNNING))
 			return (EIO);
 		if (!(sc->flags & FATM_STAT_INUSE))
 			break;
 		cv_wait(&sc->cv_stat, &sc->mtx);
 	}
 	sc->flags |= FATM_STAT_INUSE;
 
 	q = GET_QUEUE(sc->cmdqueue, struct cmdqueue, sc->cmdqueue.head);
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if (!(H_GETSTAT(q->q.statp) & FATM_STAT_FREE)) {
 		sc->istats.cmd_queue_full++;
 		return (EIO);
 	}
 	NEXT_QUEUE_ENTRY(sc->cmdqueue.head, FATM_CMD_QLEN);
 
 	q->error = 0;
 	q->cb = fatm_getstat_complete;
 	H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 	H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 
 	bus_dmamap_sync(sc->sadi_mem.dmat, sc->sadi_mem.map,
 	    BUS_DMASYNC_PREREAD);
 
 	/* buffer address first, then the opcode that starts the command */
 	WRITE4(sc, q->q.card + FATMOC_GSTAT_BUF,
 	    sc->sadi_mem.paddr);
 	BARRIER_W(sc);
 	WRITE4(sc, q->q.card + FATMOC_OP,
 	    FATM_OP_REQUEST_STATS | FATM_OP_INTERRUPT_SEL);
 	BARRIER_W(sc);
 
 	/*
 	 * Wait for the command to complete
 	 */
 	error = msleep(&sc->sadi_mem, &sc->mtx, PZERO | PCATCH,
 	    "fatm_stat", hz);
 
 	switch (error) {
 
 	  case EWOULDBLOCK:
 		error = EIO;
 		break;
 
 	  case ERESTART:
 		error = EINTR;
 		break;
 
 	  case 0:
 		bus_dmamap_sync(sc->sadi_mem.dmat, sc->sadi_mem.map,
 		    BUS_DMASYNC_POSTREAD);
 		error = q->error;
 		break;
 	}
 
 	/*
 	 * Swap statistics. This is done only when the command really
 	 * completed without error. q->error alone is not sufficient: it is
 	 * still zero after a timeout or an interrupted sleep, in which case
 	 * the card has not filled the buffer.
 	 */
 	if (error == 0) {
 		u_int i;
 		uint32_t *p = (uint32_t *)sc->sadi_mem.mem;
 
 		for (i = 0; i < sizeof(struct fatm_stats) / sizeof(uint32_t);
 		    i++, p++)
 			*p = be32toh(*p);
 	}
 
 	return (error);
 }
 
 /*
  * Create a copy of a single mbuf. It can have either internal or
  * external data, it may have a packet header. External data is really
  * copied, so the new buffer is writeable.
  */
 static struct mbuf *
 copy_mbuf(struct mbuf *m)
 {
 	struct mbuf *new;
 
 	MGET(new, M_NOWAIT, MT_DATA);
 	if (new == NULL)
 		return (NULL);
 
 	if (m->m_flags & M_PKTHDR) {
 		M_MOVE_PKTHDR(new, m);
 		if (m->m_len > MHLEN)
 			MCLGET(new, M_WAITOK);
 	} else {
 		if (m->m_len > MLEN)
 			MCLGET(new, M_WAITOK);
 	}
 
 	bcopy(m->m_data, new->m_data, m->m_len);
 	new->m_len = m->m_len;
 	new->m_flags &= ~M_RDONLY;
 
 	return (new);
 }
 
 /*
  * All segments must have a four byte aligned buffer address and a four
  * byte aligned length. Step through an mbuf chain and check these conditions.
  * If the buffer address is not aligned and this is a normal mbuf, move
  * the data down. Else make a copy of the mbuf with aligned data.
  * If the buffer length is not aligned steal data from the next mbuf.
  * We don't need to check whether this has more than one external reference,
  * because stealing data doesn't change the external cluster.
  * If the last mbuf is not aligned, fill with zeroes.
  *
  * Return packet length (well we should have this in the packet header),
  * but be careful not to count the zero fill at the end.
  *
  * If fixing fails free the chain and zero the pointer.
  *
  * We assume, that aligning the virtual address also aligns the mapped bus
  * address.
  */
 static u_int
 fatm_fix_chain(struct fatm_softc *sc, struct mbuf **mp)
 {
 	struct mbuf *m = *mp, *prev = NULL, *next, *new;
 	u_int mlen = 0, fill = 0;
 	int first, off;
 	u_char *d, *cp;
 
 	do {
 		next = m->m_next;
 
 		if ((uintptr_t)mtod(m, void *) % 4 != 0 ||
 		   (m->m_len % 4 != 0 && next)) {
 			/*
 			 * Needs fixing
 			 */
 			first = (m == *mp);
 
 			d = mtod(m, u_char *);
 			if ((off = (uintptr_t)(void *)d % 4) != 0) {
 				if (M_WRITABLE(m)) {
 					/* shift the data down in place */
 					sc->istats.fix_addr_copy++;
 					bcopy(d, d - off, m->m_len);
 					m->m_data = (caddr_t)(d - off);
 				} else {
 					/* replace m by a writable copy */
 					if ((new = copy_mbuf(m)) == NULL) {
 						sc->istats.fix_addr_noext++;
 						goto fail;
 					}
 					sc->istats.fix_addr_ext++;
 					if (prev)
 						prev->m_next = new;
 					new->m_next = next;
 					m_free(m);
 					m = new;
 				}
 			}
 
 			if ((off = m->m_len % 4) != 0) {
 				if (!M_WRITABLE(m)) {
 					if ((new = copy_mbuf(m)) == NULL) {
 						sc->istats.fix_len_noext++;
 						goto fail;
 					}
 					sc->istats.fix_len_copy++;
 					if (prev)
 						prev->m_next = new;
 					new->m_next = next;
 					m_free(m);
 					m = new;
 				} else
 					sc->istats.fix_len++;
 				d = mtod(m, u_char *) + m->m_len;
 				off = 4 - off;
 				while (off) {
 					if (next == NULL) {
 						/* end of chain: pad */
 						*d++ = 0;
 						fill++;
 					} else if (next->m_len == 0) {
 						sc->istats.fix_empty++;
 						next = m_free(next);
 						/*
 						 * Keep the chain linked: m
 						 * must not point to the mbuf
 						 * that was just freed.
 						 */
 						m->m_next = next;
 						continue;
 					} else {
 						/* steal one byte */
 						cp = mtod(next, u_char *);
 						*d++ = *cp++;
 						next->m_len--;
 						next->m_data = (caddr_t)cp;
 					}
 					off--;
 					m->m_len++;
 				}
 			}
 
 			/* the head of the chain may have been replaced */
 			if (first)
 				*mp = m;
 		}
 
 		mlen += m->m_len;
 		prev = m;
 	} while ((m = next) != NULL);
 
 	return (mlen - fill);
 
   fail:
 	m_freem(*mp);
 	*mp = NULL;
 	return (0);
 }
 
 /*
  * bus_dma callback for the transmit path. varg is the transmit descriptor;
  * the address and length of every DMA segment are stored into it and the
  * number of segments is left in the descriptor's spec field. Nothing is
  * touched if the load reported an error.
  */
 static void
 fatm_tpd_load(void *varg, bus_dma_segment_t *segs, int nsegs,
     bus_size_t mapsize, int error)
 {
 	struct tpd *desc;
 	int idx;
 
 	if (error != 0)
 		return;
 
 	KASSERT(nsegs <= TPD_EXTENSIONS + TXD_FIXED, ("too many segments"));
 
 	desc = varg;
 	for (idx = 0; idx < nsegs; idx++) {
 		H_SETDESC(desc->segment[idx].buffer, segs[idx].ds_addr);
 		H_SETDESC(desc->segment[idx].length, segs[idx].ds_len);
 	}
 	/* fatm_tx() picks the segment count up from here */
 	desc->spec = nsegs;
 }
 
 /*
  * Start output of one packet.
  *
  * sc   - the softc
  * m    - mbuf chain to send; it still starts with the atm_pseudohdr, which
  *        is stripped here before the chain is mapped
  * vc   - the VCC the packet is sent on (AAL, VPI/VCI and traffic
  *        parameters are taken from vc->param)
  * mlen - PDU length that is put into the transmit descriptor
  *
  * Returns 1 if no transmit queue entry was free and the packet was put
  * back at the head of the interface send queue (only when sc->retry_tx is
  * set). Returns 0 in all other cases: the packet was handed to the card or
  * it was freed (queue full without retry, or DMA load failure).
  *
  * NOTE(review): the previous comment said that the internal statistics are
  * updated without the lock here; the caller visible in this file,
  * fatm_start(), holds the softc lock across this call.
  */
 static int
 fatm_tx(struct fatm_softc *sc, struct mbuf *m, struct card_vcc *vc, u_int mlen)
 {
 	struct txqueue *q;
 	u_int nblks;
 	int error, aal, nsegs;
 	struct tpd *tpd;
 
 	/*
 	 * Get a queue element.
 	 * If there isn't one - try to drain the transmit queue
 	 * We used to sleep here if that doesn't help, but we
 	 * should not sleep here, because we are called with locks.
 	 */
 	q = GET_QUEUE(sc->txqueue, struct txqueue, sc->txqueue.head);
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if (H_GETSTAT(q->q.statp) != FATM_STAT_FREE) {
 		fatm_intr_drain_tx(sc);
 		H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 		if (H_GETSTAT(q->q.statp) != FATM_STAT_FREE) {
 			if (sc->retry_tx) {
 				/* put it back; the pseudo header is intact */
 				sc->istats.tx_retry++;
 				IF_PREPEND(&sc->ifp->if_snd, m);
 				return (1);
 			}
 			sc->istats.tx_queue_full++;
 			m_freem(m);
 			return (0);
 		}
 		sc->istats.tx_queue_almost_full++;
 	}
 
 	tpd = q->q.ioblk;
 
 	/* strip the pseudo header; it is not sent to the card */
 	m->m_data += sizeof(struct atm_pseudohdr);
 	m->m_len -= sizeof(struct atm_pseudohdr);
 
 #ifdef ENABLE_BPF
 	if (!(vc->param.flags & ATMIO_FLAG_NG) &&
 	    vc->param.aal == ATMIO_AAL_5 &&
 	    (vc->param.flags & ATM_PH_LLCSNAP))
 		BPF_MTAP(sc->ifp, m);
 #endif
 
 	/* map the mbuf */
 	error = bus_dmamap_load_mbuf(sc->tx_tag, q->map, m,
 	    fatm_tpd_load, tpd, BUS_DMA_NOWAIT);
 	if(error) {
 		if_inc_counter(sc->ifp, IFCOUNTER_OERRORS, 1);
 		if_printf(sc->ifp, "mbuf loaded error=%d\n", error);
 		m_freem(m);
 		return (0);
 	}
 	/* fatm_tpd_load() left the number of segments in tpd->spec */
 	nsegs = tpd->spec;
 
 	bus_dmamap_sync(sc->tx_tag, q->map, BUS_DMASYNC_PREWRITE);
 
 	/*
 	 * OK. Now go and do it.
 	 */
 	aal = (vc->param.aal == ATMIO_AAL_5) ? 5 : 0;
 
 	H_SETSTAT(q->q.statp, FATM_STAT_PENDING);
 	H_SYNCSTAT_PREWRITE(sc, q->q.statp);
 	/* remember the chain; fatm_intr_drain_tx() frees it */
 	q->m = m;
 
 	/*
 	 * If the transmit queue is almost full, schedule a
 	 * transmit interrupt so that transmit descriptors can
 	 * be recycled.
 	 */
 	H_SETDESC(tpd->spec, TDX_MKSPEC((sc->txcnt >=
 	    (4 * FATM_TX_QLEN) / 5), aal, nsegs, mlen));
 	H_SETDESC(tpd->atm_header, TDX_MKHDR(vc->param.vpi,
 	    vc->param.vci, 0, 0));
 
 	if (vc->param.traffic == ATMIO_TRAFFIC_UBR)
 		H_SETDESC(tpd->stream, 0);
 	else {
 		u_int i;
 
 		/*
 		 * Find the first rate table entry whose cell rate is below
 		 * the requested PCR and use the entry before it (or the
 		 * last entry if none is below).
 		 */
 		for (i = 0; i < RATE_TABLE_SIZE; i++)
 			if (rate_table[i].cell_rate < vc->param.tparam.pcr)
 				break;
 		if (i > 0)
 			i--;
 		H_SETDESC(tpd->stream, rate_table[i].ratio);
 	}
 	H_SYNCQ_PREWRITE(&sc->txq_mem, tpd, TPD_SIZE);
 
 	nblks = TDX_SEGS2BLKS(nsegs);
 
 	DBG(sc, XMIT, ("XMIT: mlen=%d spec=0x%x nsegs=%d blocks=%d",
 	    mlen, le32toh(tpd->spec), nsegs, nblks));
 
 	/* writing the descriptor address hands the entry to the card */
 	WRITE4(sc, q->q.card + 0, q->q.card_ioblk | nblks);
 	BARRIER_W(sc);
 
 	sc->txcnt++;
 	if_inc_counter(sc->ifp, IFCOUNTER_OPACKETS, 1);
 	vc->obytes += m->m_pkthdr.len;
 	vc->opackets++;
 
 	NEXT_QUEUE_ENTRY(sc->txqueue.head, FATM_TX_QLEN);
 
 	return (0);
 }
 
 static void
 fatm_start(struct ifnet *ifp)
 {
 	struct atm_pseudohdr aph;
 	struct fatm_softc *sc;
 	struct mbuf *m;
 	u_int mlen, vpi, vci;
 	struct card_vcc *vc;
 
 	sc = ifp->if_softc;
 
 	while (1) {
 		IF_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			break;
 
 		/*
 		 * Loop through the mbuf chain and compute the total length
 		 * of the packet. Check that all data pointer are
 		 * 4 byte aligned. If they are not, call fatm_mfix to
 		 * fix that problem. This comes more or less from the
 		 * en driver.
 		 */
 		mlen = fatm_fix_chain(sc, &m);
 		if (m == NULL)
 			continue;
 
 		if (m->m_len < sizeof(struct atm_pseudohdr) &&
 		    (m = m_pullup(m, sizeof(struct atm_pseudohdr))) == NULL)
 			continue;
 
 		aph = *mtod(m, struct atm_pseudohdr *);
 		mlen -= sizeof(struct atm_pseudohdr);
 
 		if (mlen == 0) {
 			m_freem(m);
 			continue;
 		}
 		if (mlen > FATM_MAXPDU) {
 			sc->istats.tx_pdu2big++;
 			m_freem(m);
 			continue;
 		}
 
 		vci = ATM_PH_VCI(&aph);
 		vpi = ATM_PH_VPI(&aph);
 
 		/*
 		 * From here on we need the softc
 		 */
 		FATM_LOCK(sc);
 		if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			FATM_UNLOCK(sc);
 			m_freem(m);
 			break;
 		}
 		if (!VC_OK(sc, vpi, vci) || (vc = sc->vccs[vci]) == NULL ||
 		    !(vc->vflags & FATM_VCC_OPEN)) {
 			FATM_UNLOCK(sc);
 			m_freem(m);
 			continue;
 		}
 		if (fatm_tx(sc, m, vc, mlen)) {
 			FATM_UNLOCK(sc);
 			break;
 		}
 		FATM_UNLOCK(sc);
 	}
 }
 
 /*
  * VCC management
  *
  * This may seem complicated. The reason for this is that we need an
  * asynchronous open/close for the NATM VCCs because our ioctl handler
  * is called with the radix node head of the routing table locked. Therefore
  * we cannot sleep there and wait for the open/close to succeed. For this
  * reason we just initiate the operation from the ioctl.
  */
 
 /*
  * Queue a command for a VC (open or close) to the card.
  *
  * The entry at the head of the command queue is used. vpi/vci and mtu are
  * written into the entry on the card, followed by the opcode cmd; func is
  * stored as the completion callback of the entry.
  *
  * Returns the queue entry (so that the caller can wait on it), or NULL
  * if the entry at the head of the queue is not free.
  */
 static struct cmdqueue *
 fatm_start_vcc(struct fatm_softc *sc, u_int vpi, u_int vci, uint32_t cmd,
     u_int mtu, void (*func)(struct fatm_softc *, struct cmdqueue *))
 {
 	struct cmdqueue *slot;
 
 	slot = GET_QUEUE(sc->cmdqueue, struct cmdqueue, sc->cmdqueue.head);
 
 	H_SYNCSTAT_POSTREAD(sc, slot->q.statp);
 	if ((H_GETSTAT(slot->q.statp) & FATM_STAT_FREE) == 0) {
 		sc->istats.cmd_queue_full++;
 		return (NULL);
 	}
 	NEXT_QUEUE_ENTRY(sc->cmdqueue.head, FATM_CMD_QLEN);
 
 	slot->error = 0;
 	slot->cb = func;
 	H_SETSTAT(slot->q.statp, FATM_STAT_PENDING);
 	H_SYNCSTAT_PREWRITE(sc, slot->q.statp);
 
 	/* arguments first, the opcode goes last */
 	WRITE4(sc, slot->q.card + FATMOC_ACTIN_VPVC, MKVPVC(vpi, vci));
 	BARRIER_W(sc);
 	WRITE4(sc, slot->q.card + FATMOC_ACTIN_MTU, mtu);
 	BARRIER_W(sc);
 	WRITE4(sc, slot->q.card + FATMOC_OP, cmd);
 	BARRIER_W(sc);
 
 	return (slot);
 }
 
 /*
  * Completion callback for VC commands somebody is sleeping on: record
  * EIO in the queue entry if the card reported an error and wake up the
  * sleeper, which waits on the address of the queue entry.
  */
 static void
 fatm_cmd_complete(struct fatm_softc *sc, struct cmdqueue *q)
 {
 	int failed;
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	failed = (H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) != 0;
 	if (failed) {
 		sc->istats.get_stat_errors++;
 		q->error = EIO;
 	}
 	wakeup(q);
 }
 
 /*
  * Final step of opening a VCC: move it from the TRY_OPEN to the OPEN
  * state. If this was a re-open, only the REOPEN flag is cleared. Otherwise
  * a VCC changed event is sent, except for NG VCCs that are not PVCs.
  */
 static void
 fatm_open_finish(struct fatm_softc *sc, struct card_vcc *vc)
 {
 	vc->vflags = (vc->vflags & ~FATM_VCC_TRY_OPEN) | FATM_VCC_OPEN;
 
 	if ((vc->vflags & FATM_VCC_REOPEN) != 0) {
 		vc->vflags &= ~FATM_VCC_REOPEN;
 		return;
 	}
 
 	/* management is not told about NG VCCs unless they are PVCs */
 	if ((vc->param.flags & ATMIO_FLAG_NG) != 0 &&
 	    (vc->param.flags & ATMIO_FLAG_PVC) == 0)
 		return;
 
 	ATMEV_SEND_VCC_CHANGED(IFP2IFATM(sc->ifp), 0, vc->param.vci, 1);
 }
 
 /*
  * The VC that we have tried to open asynchronously has been opened.
  *
  * Completion callback of the activate command for asynchronous opens. The
  * VCI is read back from the command entry on the card. If the card
  * reported an error the VCC is removed from the table and freed, and the
  * count of open VCCs, which was incremented when the open was started, is
  * taken back. Otherwise the open is finished.
  */
 static void
 fatm_open_complete(struct fatm_softc *sc, struct cmdqueue *q)
 {
 	u_int vci;
 	struct card_vcc *vc;
 
 	vci = GETVCI(READ4(sc, q->q.card + FATMOC_ACTIN_VPVC));
 	vc = sc->vccs[vci];
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if (H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) {
 		sc->istats.get_stat_errors++;
 		sc->vccs[vci] = NULL;
 		/* keep the counter in step with the table */
 		sc->open_vccs--;
 		uma_zfree(sc->vcc_zone, vc);
 		if_printf(sc->ifp, "opening VCI %u failed\n", vci);
 		return;
 	}
 	fatm_open_finish(sc, vc);
 }
 
 /*
  * Wait on the queue entry until the VCC is opened/closed.
  */
 static int
 fatm_waitvcc(struct fatm_softc *sc, struct cmdqueue *q)
 {
 	int error;
 
 	/*
 	 * Wait for the command to complete
 	 */
 	error = msleep(q, &sc->mtx, PZERO | PCATCH, "fatm_vci", hz);
 
 	if (error != 0)
 		return (error);
 	return (q->error);
 }
 
 /*
  * Open a VCC. For asynchronous VCCs this just initiates the operation.
  *
  * Returns EINVAL for bad parameters (both directions disabled, VPI/VCI
  * out of range, unsupported AAL or traffic type, bad CBR peak cell rate),
  * ENOMEM if no VCC structure could be allocated, EIO if the interface is
  * not running, EBUSY if the VCI is already in use, or the error from
  * fatm_load_vc(). The VCC structure is freed on every error return.
  */
 static int
 fatm_open_vcc(struct fatm_softc *sc, struct atmio_openvcc *op)
 {
 	struct card_vcc *vcc;
 	int error, installed;
 
 	/* parameter checks that do not need the softc state */
 	if ((op->param.flags & ATMIO_FLAG_NOTX) != 0 &&
 	    (op->param.flags & ATMIO_FLAG_NORX) != 0)
 		return (EINVAL);
 	if (!VC_OK(sc, op->param.vpi, op->param.vci))
 		return (EINVAL);
 	if (op->param.aal != ATMIO_AAL_0 && op->param.aal != ATMIO_AAL_5)
 		return (EINVAL);
 
 	vcc = uma_zalloc(sc->vcc_zone, M_NOWAIT | M_ZERO);
 	if (vcc == NULL)
 		return (ENOMEM);
 
 	installed = 0;
 	error = 0;
 
 	FATM_LOCK(sc);
 	if ((sc->ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		error = EIO;
 		goto done;
 	}
 	if (sc->vccs[op->param.vci] != NULL) {
 		error = EBUSY;
 		goto done;
 	}
 	vcc->param = op->param;
 	vcc->rxhand = op->rxhand;
 
 	/* only UBR and CBR are accepted; CBR needs a usable PCR */
 	if (op->param.traffic == ATMIO_TRAFFIC_CBR) {
 		if (op->param.tparam.pcr == 0 ||
 		    op->param.tparam.pcr > IFP2IFATM(sc->ifp)->mib.pcr) {
 			error = EINVAL;
 			goto done;
 		}
 	} else if (op->param.traffic != ATMIO_TRAFFIC_UBR) {
 		error = EINVAL;
 		goto done;
 	}
 
 	vcc->ibytes = vcc->obytes = 0;
 	vcc->ipackets = vcc->opackets = 0;
 
 	/* enter the VCC into the table before talking to the card */
 	vcc->vflags = FATM_VCC_TRY_OPEN;
 	sc->vccs[op->param.vci] = vcc;
 	sc->open_vccs++;
 
 	error = fatm_load_vc(sc, vcc);
 	if (error == 0) {
 		/* the table owns the VCC now */
 		installed = 1;
 	} else {
 		sc->vccs[op->param.vci] = NULL;
 		sc->open_vccs--;
 	}
 
   done:
 	FATM_UNLOCK(sc);
 	if (!installed)
 		uma_zfree(sc->vcc_zone, vcc);
 	return (error);
 }
 
 /*
  * Send the activate command for the given VC to the card. For VCCs
  * without ATMIO_FLAG_ASYNC this waits for the command and finishes the
  * open; for asynchronous VCCs fatm_open_complete() does that later.
  * Returns EIO if the command could not be queued, the error from waiting,
  * or 0.
  */
 static int
 fatm_load_vc(struct fatm_softc *sc, struct card_vcc *vc)
 {
 	void (*done)(struct fatm_softc *, struct cmdqueue *);
 	struct cmdqueue *slot;
 	uint32_t cmd;
 	int is_async, rc;
 
 	/* Command and buffer strategy; the AAL goes into bits 8 and up */
 	cmd = FATM_OP_ACTIVATE_VCIN | FATM_OP_INTERRUPT_SEL | (0 << 16);
 	cmd |= ((vc->param.aal == ATMIO_AAL_0) ? 0 : 5) << 8;
 
 	is_async = (vc->param.flags & ATMIO_FLAG_ASYNC) != 0;
 	done = is_async ? fatm_open_complete : fatm_cmd_complete;
 
 	slot = fatm_start_vcc(sc, vc->param.vpi, vc->param.vci, cmd, 1, done);
 	if (slot == NULL)
 		return (EIO);
 
 	if (is_async)
 		return (0);
 
 	rc = fatm_waitvcc(sc, slot);
 	if (rc != 0)
 		return (rc);
 	fatm_open_finish(sc, vc);
 	return (0);
 }
 
 /*
  * Final step of closing a VCC: send the VCC changed event (not for NG
  * VCCs unless they are PVCs), remove the VCC from the table, decrement
  * the count of open VCCs and free the structure.
  */
 static void
 fatm_close_finish(struct fatm_softc *sc, struct card_vcc *vc)
 {
 	int notify;
 
 	notify = (vc->param.flags & ATMIO_FLAG_NG) == 0 ||
 	    (vc->param.flags & ATMIO_FLAG_PVC) != 0;
 	if (notify)
 		ATMEV_SEND_VCC_CHANGED(IFP2IFATM(sc->ifp), 0, vc->param.vci, 0);
 
 	sc->vccs[vc->param.vci] = NULL;
 	sc->open_vccs--;
 
 	uma_zfree(sc->vcc_zone, vc);
 }
 
 /*
  * Completion callback of the deactivate command for asynchronous closes.
  * The VCI is read back from the command entry on the card. On error the
  * VCC is left as it is and a message is printed; otherwise the close is
  * finished.
  */
 static void
 fatm_close_complete(struct fatm_softc *sc, struct cmdqueue *q)
 {
 	struct card_vcc *vcc;
 	u_int vci;
 
 	vci = GETVCI(READ4(sc, q->q.card + FATMOC_ACTIN_VPVC));
 	vcc = sc->vccs[vci];
 
 	H_SYNCSTAT_POSTREAD(sc, q->q.statp);
 	if ((H_GETSTAT(q->q.statp) & FATM_STAT_ERROR) == 0) {
 		fatm_close_finish(sc, vcc);
 		return;
 	}
 
 	/* keep the VCC in that state */
 	sc->istats.get_stat_errors++;
 	if_printf(sc->ifp, "closing VCI %u failed\n", vci);
 }
 
 /*
  * Initiate closing a VCC
  */
 static int
 fatm_close_vcc(struct fatm_softc *sc, struct atmio_closevcc *cl)
 {
 	int error;
 	struct cmdqueue *q;
 	struct card_vcc *vc;
 
 	if (!VC_OK(sc, cl->vpi, cl->vci))
 		return (EINVAL);
 
 	error = 0;
 
 	FATM_LOCK(sc);
 	if (!(sc->ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		error = EIO;
 		goto done;
 	}
 	vc = sc->vccs[cl->vci];
 	if (vc == NULL || !(vc->vflags & (FATM_VCC_OPEN | FATM_VCC_TRY_OPEN))) {
 		error = ENOENT;
 		goto done;
 	}
 
 	q = fatm_start_vcc(sc, cl->vpi, cl->vci, 
 	    FATM_OP_DEACTIVATE_VCIN | FATM_OP_INTERRUPT_SEL, 1,
 	    (vc->param.flags & ATMIO_FLAG_ASYNC) ?
 	    fatm_close_complete : fatm_cmd_complete);
 	if (q == NULL) {
 		error = EIO;
 		goto done;
 	}
 
 	vc->vflags &= ~(FATM_VCC_OPEN | FATM_VCC_TRY_OPEN);
 	vc->vflags |= FATM_VCC_TRY_CLOSE;
 
 	if (!(vc->param.flags & ATMIO_FLAG_ASYNC)) {
 		error = fatm_waitvcc(sc, q);
 		if (error != 0)
 			goto done;
 
 		fatm_close_finish(sc, vc);
 	}
 
   done:
 	FATM_UNLOCK(sc);
 	return (error);
 }
 
 /*
  * IOCTL handler of the interface. arg is interpreted according to cmd.
  * Commands not listed here are answered with EINVAL.
  */
 static int
 fatm_ioctl(struct ifnet *ifp, u_long cmd, caddr_t arg)
 {
 	struct fatm_softc *sc = ifp->if_softc;
 	struct atmio_vcctable *table;
 	int error = 0;
 
 	switch (cmd) {
 
 	  case SIOCATMOPENVCC:		/* kernel internal use */
 		error = fatm_open_vcc(sc, (struct atmio_openvcc *)arg);
 		break;
 
 	  case SIOCATMCLOSEVCC:		/* kernel internal use */
 		error = fatm_close_vcc(sc, (struct atmio_closevcc *)arg);
 		break;
 
 	  case SIOCSIFADDR: {
 		struct ifaddr *ifa = (struct ifaddr *)arg;
 
 		/* setting an address brings the interface up */
 		FATM_LOCK(sc);
 		ifp->if_flags |= IFF_UP;
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			fatm_init_locked(sc);
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		  case AF_INET:
 		  case AF_INET6:
 			ifa->ifa_rtrequest = atm_rtrequest;
 			break;
 #endif
 		  default:
 			break;
 		}
 		FATM_UNLOCK(sc);
 		break;
 	  }
 
 	  case SIOCSIFFLAGS: {
 		int up, running;
 
 		/* make the running state follow IFF_UP */
 		FATM_LOCK(sc);
 		up = (ifp->if_flags & IFF_UP) != 0;
 		running = (ifp->if_drv_flags & IFF_DRV_RUNNING) != 0;
 		if (up && !running)
 			fatm_init_locked(sc);
 		else if (!up && running)
 			fatm_stop(sc);
 		FATM_UNLOCK(sc);
 		break;
 	  }
 
 	  case SIOCGIFMEDIA:
 	  case SIOCSIFMEDIA:
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 			error = EINVAL;
 			break;
 		}
 		error = ifmedia_ioctl(ifp, (struct ifreq *)arg, &sc->media,
 		    cmd);
 		break;
 
 	  case SIOCATMGVCCS:
 		/* return vcc table */
 		table = atm_getvccs((struct atmio_vcc **)sc->vccs,
 		    FORE_MAX_VCC + 1, sc->open_vccs, &sc->mtx, 1);
 		error = copyout(table, ((struct ifreq *)arg)->ifr_data,
 		    sizeof(*table) + table->count * sizeof(table->vccs[0]));
 		free(table, M_DEVBUF);
 		break;
 
 	  case SIOCATMGETVCCS:	/* internal netgraph use */
 		table = atm_getvccs((struct atmio_vcc **)sc->vccs,
 		    FORE_MAX_VCC + 1, sc->open_vccs, &sc->mtx, 0);
 		if (table == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		/* the caller gets the table itself */
 		*(void **)arg = table;
 		break;
 
 	  default:
 		DBG(sc, IOCTL, ("+++ cmd=%08lx arg=%p", cmd, arg));
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Detach from the interface and free all resources allocated during
  * initialisation and later.
  *
  * Most resources are released only if their pointer in the softc is not
  * NULL. Always returns 0.
  */
 static int
 fatm_detach(device_t dev)
 {
 	u_int i;
 	struct rbuf *rb;
 	struct fatm_softc *sc;
 	struct txqueue *tx;
 
 	sc = device_get_softc(dev);
 
 	/* stop the card and detach the PHY and interface if attached */
 	if (device_is_alive(dev)) {
 		FATM_LOCK(sc);
 		fatm_stop(sc);
 		utopia_detach(&sc->utopia);
 		FATM_UNLOCK(sc);
 		atm_ifdetach(sc->ifp);		/* XXX race */
 	}
 	/* wait for a watchdog callout that may still be running */
 	callout_drain(&sc->watchdog_timer);
 
 	if (sc->ih != NULL)
 		bus_teardown_intr(dev, sc->irqres, sc->ih);
 
 	/*
 	 * Receive buffers that are still on the used list: complain, then
 	 * unload and free them and move them to the free list so that the
 	 * loop over the free list below destroys their maps too.
 	 */
 	while ((rb = LIST_FIRST(&sc->rbuf_used)) != NULL) {
 		if_printf(sc->ifp, "rbuf %p still in use!\n", rb);
 		bus_dmamap_unload(sc->rbuf_tag, rb->map);
 		m_freem(rb->m);
 		LIST_REMOVE(rb, link);
 		LIST_INSERT_HEAD(&sc->rbuf_free, rb, link);
 	}
 
 	/* DMA maps of the transmit queue entries */
 	if (sc->txqueue.chunk != NULL) {
 		for (i = 0; i < FATM_TX_QLEN; i++) {
 			tx = GET_QUEUE(sc->txqueue, struct txqueue, i);
 			bus_dmamap_destroy(sc->tx_tag, tx->map);
 		}
 	}
 
 	/* DMA maps of all receive buffers */
 	while ((rb = LIST_FIRST(&sc->rbuf_free)) != NULL) {
 		bus_dmamap_destroy(sc->rbuf_tag, rb->map);
 		LIST_REMOVE(rb, link);
 	}
 
 	if (sc->rbufs != NULL)
 		free(sc->rbufs, M_DEVBUF);
 	/* VCCs still in the table, then the table and the zone */
 	if (sc->vccs != NULL) {
 		for (i = 0; i < FORE_MAX_VCC + 1; i++)
 			if (sc->vccs[i] != NULL) {
 				uma_zfree(sc->vcc_zone, sc->vccs[i]);
 				sc->vccs[i] = NULL;
 			}
 		free(sc->vccs, M_DEVBUF);
 	}
 	if (sc->vcc_zone != NULL)
 		uma_zdestroy(sc->vcc_zone);
 
 	/* host side memory of the queues */
 	if (sc->l1queue.chunk != NULL)
 		free(sc->l1queue.chunk, M_DEVBUF);
 	if (sc->s1queue.chunk != NULL)
 		free(sc->s1queue.chunk, M_DEVBUF);
 	if (sc->rxqueue.chunk != NULL)
 		free(sc->rxqueue.chunk, M_DEVBUF);
 	if (sc->txqueue.chunk != NULL)
 		free(sc->txqueue.chunk, M_DEVBUF);
 	if (sc->cmdqueue.chunk != NULL)
 		free(sc->cmdqueue.chunk, M_DEVBUF);
 
 	/* DMA memory areas */
 	destroy_dma_memory(&sc->reg_mem);
 	destroy_dma_memory(&sc->sadi_mem);
 	destroy_dma_memory(&sc->prom_mem);
 #ifdef TEST_DMA_SYNC
 	destroy_dma_memoryX(&sc->s1q_mem);
 	destroy_dma_memoryX(&sc->l1q_mem);
 	destroy_dma_memoryX(&sc->rxq_mem);
 	destroy_dma_memoryX(&sc->txq_mem);
 	destroy_dma_memoryX(&sc->stat_mem);
 #endif
 
 	/* DMA tags; the parent tag goes last */
 	if (sc->tx_tag != NULL)
 		if (bus_dma_tag_destroy(sc->tx_tag))
 			printf("tx DMA tag busy!\n");
 
 	if (sc->rbuf_tag != NULL)
 		if (bus_dma_tag_destroy(sc->rbuf_tag))
 			printf("rbuf DMA tag busy!\n");
 
 	if (sc->parent_dmat != NULL)
 		if (bus_dma_tag_destroy(sc->parent_dmat))
 			printf("parent DMA tag busy!\n");
 
 	/* bus resources */
 	if (sc->irqres != NULL)
 		bus_release_resource(dev, SYS_RES_IRQ, sc->irqid, sc->irqres);
 
 	if (sc->memres != NULL)
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    sc->memid, sc->memres);
 
 	(void)sysctl_ctx_free(&sc->sysctl_ctx);
 
 	cv_destroy(&sc->cv_stat);
 	cv_destroy(&sc->cv_regs);
 
 	mtx_destroy(&sc->mtx);
 
 	if_free(sc->ifp);
 
 	return (0);
 }
 
 /*
  * Sysctl handler
  */
 static int
 fatm_sysctl_istats(SYSCTL_HANDLER_ARGS)
 {
 	struct fatm_softc *sc = arg1;
 	u_long *ret;
 	int error;
 
 	ret = malloc(sizeof(sc->istats), M_TEMP, M_WAITOK);
 
 	FATM_LOCK(sc);
 	bcopy(&sc->istats, ret, sizeof(sc->istats));
 	FATM_UNLOCK(sc);
 
 	error = SYSCTL_OUT(req, ret, sizeof(sc->istats));
 	free(ret, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Sysctl handler for card statistics
  * This is disable because it destroys the PHY statistics.
  */
 static int
 fatm_sysctl_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct fatm_softc *sc = arg1;
 	int error;
 	const struct fatm_stats *s;
 	u_long *ret;
 	u_int i;
 
 	ret = malloc(sizeof(u_long) * FATM_NSTATS, M_TEMP, M_WAITOK);
 
 	FATM_LOCK(sc);
 
 	if ((error = fatm_getstat(sc)) == 0) {
 		s = sc->sadi_mem.mem;
 		i = 0;
 		ret[i++] = s->phy_4b5b.crc_header_errors;
 		ret[i++] = s->phy_4b5b.framing_errors;
 		ret[i++] = s->phy_oc3.section_bip8_errors;
 		ret[i++] = s->phy_oc3.path_bip8_errors;
 		ret[i++] = s->phy_oc3.line_bip24_errors;
 		ret[i++] = s->phy_oc3.line_febe_errors;
 		ret[i++] = s->phy_oc3.path_febe_errors;
 		ret[i++] = s->phy_oc3.corr_hcs_errors;
 		ret[i++] = s->phy_oc3.ucorr_hcs_errors;
 		ret[i++] = s->atm.cells_transmitted;
 		ret[i++] = s->atm.cells_received;
 		ret[i++] = s->atm.vpi_bad_range;
 		ret[i++] = s->atm.vpi_no_conn;
 		ret[i++] = s->atm.vci_bad_range;
 		ret[i++] = s->atm.vci_no_conn;
 		ret[i++] = s->aal0.cells_transmitted;
 		ret[i++] = s->aal0.cells_received;
 		ret[i++] = s->aal0.cells_dropped;
 		ret[i++] = s->aal4.cells_transmitted;
 		ret[i++] = s->aal4.cells_received;
 		ret[i++] = s->aal4.cells_crc_errors;
 		ret[i++] = s->aal4.cels_protocol_errors;
 		ret[i++] = s->aal4.cells_dropped;
 		ret[i++] = s->aal4.cspdus_transmitted;
 		ret[i++] = s->aal4.cspdus_received;
 		ret[i++] = s->aal4.cspdus_protocol_errors;
 		ret[i++] = s->aal4.cspdus_dropped;
 		ret[i++] = s->aal5.cells_transmitted;
 		ret[i++] = s->aal5.cells_received;
 		ret[i++] = s->aal5.congestion_experienced;
 		ret[i++] = s->aal5.cells_dropped;
 		ret[i++] = s->aal5.cspdus_transmitted;
 		ret[i++] = s->aal5.cspdus_received;
 		ret[i++] = s->aal5.cspdus_crc_errors;
 		ret[i++] = s->aal5.cspdus_protocol_errors;
 		ret[i++] = s->aal5.cspdus_dropped;
 		ret[i++] = s->aux.small_b1_failed;
 		ret[i++] = s->aux.large_b1_failed;
 		ret[i++] = s->aux.small_b2_failed;
 		ret[i++] = s->aux.large_b2_failed;
 		ret[i++] = s->aux.rpd_alloc_failed;
 		ret[i++] = s->aux.receive_carrier;
 	}
 	/* declare the buffer free */
 	sc->flags &= ~FATM_STAT_INUSE;
 	cv_signal(&sc->cv_stat);
 
 	FATM_UNLOCK(sc);
 
 	if (error == 0)
 		error = SYSCTL_OUT(req, ret, sizeof(u_long) * FATM_NSTATS);
 	free(ret, M_TEMP);
 
 	return (error);
 }
 
 #define MAXDMASEGS 32		/* maximum number of receive descriptors */
 
 /*
  * Attach to the device.
  *
  * Allocates the ifnet, creates the sysctl nodes, maps the card's memory,
  * allocates the interrupt, creates the DMA tags, DMA memory, queues and
  * DMA maps, attaches the interface and finally installs the interrupt
  * handler.
  *
  * Returns 0 on success or an errno value.  Once the locks have been
  * initialized every failure is unwound through fatm_detach().
  *
  * We assume that there is a global lock (Giant in this case) that protects
  * multiple threads from entering this function. This makes sense, doesn't it?
  */
 static int
 fatm_attach(device_t dev)
 {
 	struct ifnet *ifp;
 	struct fatm_softc *sc;
 	uint16_t cfg;
 	int error = 0;
 	struct rbuf *rb;
 	u_int i;
 	struct txqueue *tx;
 
 	sc = device_get_softc(dev);
 
 	ifp = sc->ifp = if_alloc(IFT_ATM);
 	if (ifp == NULL) {
 		/*
 		 * Nothing has been set up yet.  fatm_detach() must not be
 		 * used here: it destroys the mutex and condition variables
 		 * (not initialized yet) and calls if_free() on sc->ifp
 		 * (NULL at this point).
 		 */
 		return (ENOSPC);
 	}
 
 	IFP2IFATM(sc->ifp)->mib.device = ATM_DEVICE_PCA200E;
 	IFP2IFATM(sc->ifp)->mib.serial = 0;
 	IFP2IFATM(sc->ifp)->mib.hw_version = 0;
 	IFP2IFATM(sc->ifp)->mib.sw_version = 0;
 	IFP2IFATM(sc->ifp)->mib.vpi_bits = 0;
 	IFP2IFATM(sc->ifp)->mib.vci_bits = FORE_VCIBITS;
 	IFP2IFATM(sc->ifp)->mib.max_vpcs = 0;
 	IFP2IFATM(sc->ifp)->mib.max_vccs = FORE_MAX_VCC;
 	IFP2IFATM(sc->ifp)->mib.media = IFM_ATM_UNKNOWN;
 	IFP2IFATM(sc->ifp)->phy = &sc->utopia;
 
 	LIST_INIT(&sc->rbuf_free);
 	LIST_INIT(&sc->rbuf_used);
 
 	/*
 	 * Initialize mutex and condition variables.
 	 */
 	mtx_init(&sc->mtx, device_get_nameunit(dev),
 	    MTX_NETWORK_LOCK, MTX_DEF);
 
 	cv_init(&sc->cv_stat, "fatm_stat");
 	cv_init(&sc->cv_regs, "fatm_regs");
 
 	sysctl_ctx_init(&sc->sysctl_ctx);
 	callout_init_mtx(&sc->watchdog_timer, &sc->mtx, 0);
 
 	/*
 	 * Make the sysctl tree.  Each failure must set an error code:
 	 * the fail label only cleans up (and reports failure) when error
 	 * is non-zero.
 	 */
 	if ((sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_hw_atm), OID_AUTO,
 	    device_get_nameunit(dev), CTLFLAG_RD, 0, "")) == NULL) {
 		error = ENXIO;
 		goto fail;
 	}
 
 	if (SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "istats", CTLTYPE_ULONG | CTLFLAG_RD, sc, 0,
 	    fatm_sysctl_istats, "LU", "internal statistics") == NULL) {
 		error = ENXIO;
 		goto fail;
 	}
 
 	if (SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "stats", CTLTYPE_ULONG | CTLFLAG_RD, sc, 0,
 	    fatm_sysctl_stats, "LU", "card statistics") == NULL) {
 		error = ENXIO;
 		goto fail;
 	}
 
 	if (SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "retry_tx", CTLFLAG_RW, &sc->retry_tx, 0,
 	    "retry flag") == NULL) {
 		error = ENXIO;
 		goto fail;
 	}
 
 #ifdef FATM_DEBUG
 	if (SYSCTL_ADD_UINT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "debug", CTLFLAG_RW, &sc->debug, 0, "debug flags")
 	    == NULL) {
 		error = ENXIO;
 		goto fail;
 	}
 	sc->debug = FATM_DEBUG;
 #endif
 
 	/*
 	 * Network subsystem stuff
 	 */
 	ifp->if_softc = sc;
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	ifp->if_flags = IFF_SIMPLEX;
 	ifp->if_ioctl = fatm_ioctl;
 	ifp->if_start = fatm_start;
 	ifp->if_init = fatm_init;
 	ifp->if_linkmib = &IFP2IFATM(sc->ifp)->mib;
 	ifp->if_linkmiblen = sizeof(IFP2IFATM(sc->ifp)->mib);
 
 	/*
 	 * Enable busmaster
 	 */
 	pci_enable_busmaster(dev);
 
 	/*
 	 * Map memory
 	 */
 	sc->memid = 0x10;
 	sc->memres = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->memid,
 	    RF_ACTIVE);
 	if (sc->memres == NULL) {
 		if_printf(ifp, "could not map memory\n");
 		error = ENXIO;
 		goto fail;
 	}
 	sc->memh = rman_get_bushandle(sc->memres);
 	sc->memt = rman_get_bustag(sc->memres);
 
 	/*
 	 * Convert endianness of slave access
 	 */
 	cfg = pci_read_config(dev, FATM_PCIR_MCTL, 1);
 	cfg |= FATM_PCIM_SWAB;
 	pci_write_config(dev, FATM_PCIR_MCTL, cfg, 1);
 
 	/*
 	 * Allocate interrupt (activate at the end)
 	 */
 	sc->irqid = 0;
 	sc->irqres = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->irqid,
 	    RF_SHAREABLE | RF_ACTIVE);
 	if (sc->irqres == NULL) {
 		if_printf(ifp, "could not allocate irq\n");
 		error = ENXIO;
 		goto fail;
 	}
 
 	/*
 	 * Allocate the parent DMA tag. This is used simply to hold overall
 	 * restrictions for the controller (and PCI bus) and is never used
 	 * to do anything.
 	 */
 	if (bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
 	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
 	    NULL, NULL, BUS_SPACE_MAXSIZE_32BIT, MAXDMASEGS,
 	    BUS_SPACE_MAXSIZE_32BIT, 0, NULL, NULL,
 	    &sc->parent_dmat)) {
 		if_printf(ifp, "could not allocate parent DMA tag\n");
 		error = ENOMEM;
 		goto fail;
 	}
 
 	/*
 	 * Allocate the receive buffer DMA tag. This tag must map a maximum of
 	 * a mbuf cluster.
 	 */
 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0,
 	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
 	    NULL, NULL, MCLBYTES, 1, MCLBYTES, 0,
 	    NULL, NULL, &sc->rbuf_tag)) {
 		if_printf(ifp, "could not allocate rbuf DMA tag\n");
 		error = ENOMEM;
 		goto fail;
 	}
 
 	/*
 	 * Allocate the transmission DMA tag. Must add 1, because
 	 * rounded up PDU will be 65536 bytes long.
 	 */
 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0,
 	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR,
 	    NULL, NULL,
 	    FATM_MAXPDU + 1, TPD_EXTENSIONS + TXD_FIXED, MCLBYTES, 0,
 	    NULL, NULL, &sc->tx_tag)) {
 		if_printf(ifp, "could not allocate tx DMA tag\n");
 		error = ENOMEM;
 		goto fail;
 	}
 
 	/*
 	 * Allocate DMAable memory.
 	 */
 	sc->stat_mem.size = sizeof(uint32_t) * (FATM_CMD_QLEN + FATM_TX_QLEN
 	    + FATM_RX_QLEN + SMALL_SUPPLY_QLEN + LARGE_SUPPLY_QLEN);
 	sc->stat_mem.align = 4;
 
 	sc->txq_mem.size = FATM_TX_QLEN * TPD_SIZE;
 	sc->txq_mem.align = 32;
 
 	sc->rxq_mem.size = FATM_RX_QLEN * RPD_SIZE;
 	sc->rxq_mem.align = 32;
 
 	sc->s1q_mem.size = SMALL_SUPPLY_QLEN *
 	    BSUP_BLK2SIZE(SMALL_SUPPLY_BLKSIZE);
 	sc->s1q_mem.align = 32;
 
 	sc->l1q_mem.size = LARGE_SUPPLY_QLEN *
 	    BSUP_BLK2SIZE(LARGE_SUPPLY_BLKSIZE);
 	sc->l1q_mem.align = 32;
 
 #ifdef TEST_DMA_SYNC
 	if ((error = alloc_dma_memoryX(sc, "STATUS", &sc->stat_mem)) != 0 ||
 	    (error = alloc_dma_memoryX(sc, "TXQ", &sc->txq_mem)) != 0 ||
 	    (error = alloc_dma_memoryX(sc, "RXQ", &sc->rxq_mem)) != 0 ||
 	    (error = alloc_dma_memoryX(sc, "S1Q", &sc->s1q_mem)) != 0 ||
 	    (error = alloc_dma_memoryX(sc, "L1Q", &sc->l1q_mem)) != 0)
 		goto fail;
 #else
 	if ((error = alloc_dma_memory(sc, "STATUS", &sc->stat_mem)) != 0 ||
 	    (error = alloc_dma_memory(sc, "TXQ", &sc->txq_mem)) != 0 ||
 	    (error = alloc_dma_memory(sc, "RXQ", &sc->rxq_mem)) != 0 ||
 	    (error = alloc_dma_memory(sc, "S1Q", &sc->s1q_mem)) != 0 ||
 	    (error = alloc_dma_memory(sc, "L1Q", &sc->l1q_mem)) != 0)
 		goto fail;
 #endif
 
 	sc->prom_mem.size = sizeof(struct prom);
 	sc->prom_mem.align = 32;
 	if ((error = alloc_dma_memory(sc, "PROM", &sc->prom_mem)) != 0)
 		goto fail;
 
 	sc->sadi_mem.size = sizeof(struct fatm_stats);
 	sc->sadi_mem.align = 32;
 	if ((error = alloc_dma_memory(sc, "STATISTICS", &sc->sadi_mem)) != 0)
 		goto fail;
 
 	sc->reg_mem.size = sizeof(uint32_t) * FATM_NREGS;
 	sc->reg_mem.align = 32;
 	if ((error = alloc_dma_memory(sc, "REGISTERS", &sc->reg_mem)) != 0)
 		goto fail;
 
 	/*
 	 * Allocate queues
 	 */
 	sc->cmdqueue.chunk = malloc(FATM_CMD_QLEN * sizeof(struct cmdqueue),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 	sc->txqueue.chunk = malloc(FATM_TX_QLEN * sizeof(struct txqueue),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 	sc->rxqueue.chunk = malloc(FATM_RX_QLEN * sizeof(struct rxqueue),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 	sc->s1queue.chunk = malloc(SMALL_SUPPLY_QLEN * sizeof(struct supqueue),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 	sc->l1queue.chunk = malloc(LARGE_SUPPLY_QLEN * sizeof(struct supqueue),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 
 	sc->vccs = malloc((FORE_MAX_VCC + 1) * sizeof(sc->vccs[0]),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 	sc->vcc_zone = uma_zcreate("FATM vccs", sizeof(struct card_vcc),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	if (sc->vcc_zone == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 
 	/*
 	 * Allocate memory for the receive buffer headers. The total number
 	 * of headers should probably also include the maximum number of
 	 * buffers on the receive queue.
 	 */
 	sc->rbuf_total = SMALL_POOL_SIZE + LARGE_POOL_SIZE;
 	sc->rbufs = malloc(sc->rbuf_total * sizeof(struct rbuf),
 	    M_DEVBUF, M_ZERO | M_WAITOK);
 
 	/*
 	 * Put all rbuf headers on the free list and create DMA maps.
 	 */
 	for (rb = sc->rbufs, i = 0; i < sc->rbuf_total; i++, rb++) {
 		if ((error = bus_dmamap_create(sc->rbuf_tag, 0, &rb->map))) {
 			if_printf(sc->ifp, "creating rx map: %d\n",
 			    error);
 			goto fail;
 		}
 		LIST_INSERT_HEAD(&sc->rbuf_free, rb, link);
 	}
 
 	/*
 	 * Create dma maps for transmission. In case of an error, free the
 	 * allocated DMA maps, because on some architectures maps are NULL
 	 * and we cannot distinguish between a failure and a NULL map in
 	 * the detach routine.
 	 *
 	 * NOTE(review): the visible tail of fatm_detach() destroys all
 	 * FATM_TX_QLEN tx maps whenever txqueue.chunk is non-NULL, so the
 	 * maps destroyed here look like they are destroyed a second time
 	 * on this path - confirm against the full detach routine.
 	 */
 	for (i = 0; i < FATM_TX_QLEN; i++) {
 		tx = GET_QUEUE(sc->txqueue, struct txqueue, i);
 		if ((error = bus_dmamap_create(sc->tx_tag, 0, &tx->map))) {
 			if_printf(sc->ifp, "creating tx map: %d\n",
 			    error);
 			while (i > 0) {
 				tx = GET_QUEUE(sc->txqueue, struct txqueue,
 				    i - 1);
 				bus_dmamap_destroy(sc->tx_tag, tx->map);
 				i--;
 			}
 			goto fail;
 		}
 	}
 
 	utopia_attach(&sc->utopia, IFP2IFATM(sc->ifp), &sc->media, &sc->mtx,
 	    &sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    &fatm_utopia_methods);
 	sc->utopia.flags |= UTP_FL_NORESET | UTP_FL_POLL_CARRIER;
 
 	/*
 	 * Attach the interface
 	 */
 	atm_ifattach(ifp);
 	ifp->if_snd.ifq_maxlen = 512;
 
 #ifdef ENABLE_BPF
 	bpfattach(ifp, DLT_ATM_RFC1483, sizeof(struct atmllc));
 #endif
 
 	error = bus_setup_intr(dev, sc->irqres, INTR_TYPE_NET | INTR_MPSAFE,
 	    NULL, fatm_intr, sc, &sc->ih);
 	if (error) {
 		if_printf(ifp, "couldn't setup irq\n");
 		goto fail;
 	}
 
   fail:
 	/* reached on success as well; only unwind when something failed */
 	if (error)
 		fatm_detach(dev);
 
 	return (error);
 }
 
 #if defined(FATM_DEBUG) && 0
 /*
  * Debugging aid (compiled out by the "&& 0" above): print one line for
  * every entry of the small supply queue, showing the entry's card offset,
  * the two words read from the card at that offset and the status word.
  */
 static void
 dump_s1_queue(struct fatm_softc *sc)
 {
 	struct supqueue *sq;
 	int idx;
 
 	for (idx = 0; idx < SMALL_SUPPLY_QLEN; idx++) {
 		sq = GET_QUEUE(sc->s1queue, struct supqueue, idx);
 		printf("%2d: card=%x(%x,%x) stat=%x\n", idx, sq->q.card,
 		    READ4(sc, sq->q.card), READ4(sc, sq->q.card + 4),
 		    *sq->q.statp);
 	}
 }
 #endif
 
 /*
  * Driver infrastructure.
  *
  * Method table, driver description and module registration that hook the
  * probe, attach and detach routines of this file into the pci bus.
  */
 static device_method_t fatm_methods[] = {
 	DEVMETHOD(device_probe,		fatm_probe),
 	DEVMETHOD(device_attach,	fatm_attach),
 	DEVMETHOD(device_detach,	fatm_detach),
 	{ 0, 0 }	/* terminates the table */
 };
 static driver_t fatm_driver = {
 	"fatm",				/* driver name */
 	fatm_methods,			/* methods listed above */
 	sizeof(struct fatm_softc),	/* size of the per-device softc */
 };
 
 DRIVER_MODULE(fatm, pci, fatm_driver, fatm_devclass, 0, 0);
Index: head/sys/dev/iscsi_initiator/isc_soc.c
===================================================================
--- head/sys/dev/iscsi_initiator/isc_soc.c	(revision 276691)
+++ head/sys/dev/iscsi_initiator/isc_soc.c	(revision 276692)
@@ -1,701 +1,701 @@
 /*-
  * Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /*
  | $Id: isc_soc.c 998 2009-12-20 10:32:45Z danny $
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_iscsi_initiator.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/ctype.h>
 #include <sys/errno.h>
 #include <sys/sysctl.h>
 #include <sys/file.h>
 #include <sys/uio.h>
 #include <sys/socketvar.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/ioccom.h>
 #include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/syslog.h>
 #include <sys/mbuf.h>
 #include <sys/user.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 
 #include <dev/iscsi_initiator/iscsi.h>
 #include <dev/iscsi_initiator/iscsivar.h>
 
 #ifndef NO_USE_MBUF
 #define USE_MBUF
 #endif
 
 #ifdef USE_MBUF
 static int ou_refcnt = 0;
 /*
  | callback handed to MEXTADD(): releases the buffer hanging off
  | the pduq_t given as second argument (b) and clears the pointer,
  | so that further invocations for the same pduq_t do nothing.
  */
 static void
 ext_free(struct mbuf *m, void *a, void *b)
 {
      pduq_t *owner = b;
 
      if(owner->buf == NULL)
 	  return;
 
      debug(3, "ou_refcnt=%d a=%p b=%p", ou_refcnt, a, owner->buf);
      free(owner->buf, M_ISCSIBUF);
      owner->buf = NULL;
 }
 
 /*
  | send one PDU over the session's socket as an mbuf chain:
  |  - a header mbuf holding the iSCSI header (plus the header digest
  |    and the AHS, when present),
  |  - one mbuf per chunk of at most MCLBYTES of the data segment,
  |    referencing pp->ds_addr as external storage (no copy),
  |  - a trailing mbuf with the padding to a 4 byte boundary and/or
  |    the data digest.
  | returns 0 on success (and updates the sent statistics), otherwise
  | the error returned by sosend().
  */
 int
 isc_sendPDU(isc_session_t *sp, pduq_t *pq)
 {
      struct mbuf *mh, **mp;
      pdu_t	*pp = &pq->pdu;
      int	len, error;
 
      debug_called(8);
      /* 
       | mbuf for the iSCSI header
       */
      MGETHDR(mh, M_WAITOK, MT_DATA);
      mh->m_pkthdr.rcvif = NULL;
      mh->m_next = NULL;
      mh->m_len = sizeof(union ipdu_u);
 
      if(ISOK2DIG(sp->hdrDigest, pp)) {
 	  pp->hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0);
 	  // the digest is sent along with the header (copied below)
 	  mh->m_len += sizeof(pp->hdr_dig);
 	  if(pp->ahs_len) {
 	       debug(2, "ahs_len=%d", pp->ahs_len);
 	       // NOTE(review): this digests ahs_len bytes starting at the
 	       // pointer field itself (&pp->ahs_addr); so_recv() passes
 	       // pp->ahs_addr for the same digest - confirm.
 	       pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig);
 	  }
 	  debug(3, "pp->hdr_dig=%04x", htonl(pp->hdr_dig));
      }
      if(pp->ahs_len) {
           /* 
 	   | Add any AHS to the iSCSI hdr mbuf
 	   */
 	  if((mh->m_len + pp->ahs_len) < MHLEN) {
-	       MH_ALIGN(mh, mh->m_len + pp->ahs_len);
+	       M_ALIGN(mh, mh->m_len + pp->ahs_len);
 	       // copies m_len bytes starting at pp->ipdu; with a header
 	       // digest m_len is larger than the ipdu itself.
 	       // NOTE(review): presumably relies on hdr_dig directly
 	       // following ipdu inside pdu_t - confirm.
 	       bcopy(&pp->ipdu, mh->m_data, mh->m_len);
 	       bcopy(pp->ahs_addr, mh->m_data + mh->m_len, pp->ahs_len);
 	       mh->m_len += pp->ahs_len;
 	  }
 	  else
 	       panic("len AHS=%d too big, not impleneted yet", pp->ahs_len);
      }
      else {
-	  MH_ALIGN(mh, mh->m_len);
+	  M_ALIGN(mh, mh->m_len);
 	  bcopy(&pp->ipdu, mh->m_data, mh->m_len);
      }
      mh->m_pkthdr.len = mh->m_len;
      // mp always points at the m_next link where the next mbuf goes
      mp = &mh->m_next;
      if(pp->ds_len && pq->pdu.ds_addr) {
           struct mbuf *md;
           int	off = 0;
 
           len = pp->ds_len;
           // one mbuf per chunk of at most MCLBYTES of the data segment
           while(len > 0) {
 	       int l;
 
 	       MGET(md, M_WAITOK, MT_DATA);
 	       md->m_ext.ext_cnt = &ou_refcnt;
 	       l = min(MCLBYTES, len);
 	       debug(4, "setting ext_free(arg=%p len/l=%d/%d)", pq->buf, len, l);
 	       // the data is not copied: the mbuf references ds_addr + off
 	       // and ext_free() is registered with pq as its argument
 	       MEXTADD(md, pp->ds_addr + off, l, ext_free, 
 #if __FreeBSD_version >= 800000
 		       pp->ds_addr + off,
 #endif
 		       pq, 0, EXT_EXTREF);
 	       md->m_len = l;
 	       md->m_next = NULL;
 	       mh->m_pkthdr.len += l;
 	       *mp = md;
 	       mp = &md->m_next;
 	       len -= l;
 	       off += l;
           }
 	  // trailer: padding up to a multiple of 4 and/or the data digest
 	  if(((pp->ds_len & 03) != 0) || ISOK2DIG(sp->dataDigest, pp)) {
 	       MGET(md, M_WAITOK, MT_DATA);
 	       if(pp->ds_len & 03)
 		    len = 4 - (pp->ds_len & 03);
 	       else
 		    len = 0;
 	       md->m_len = len;
 	       if(ISOK2DIG(sp->dataDigest, pp))
 		    md->m_len += sizeof(pp->ds_dig);
 	       M_ALIGN(md, md->m_len);
 	       if(ISOK2DIG(sp->dataDigest, pp)) {
 		    pp->ds_dig = sp->dataDigest(pp->ds_addr, pp->ds_len, 0);
 		    if(len) {
 			 bzero(md->m_data, len); // RFC says SHOULD be 0
 			 pp->ds_dig = sp->dataDigest(md->m_data, len, pp->ds_dig);
 		    }
 		    bcopy(&pp->ds_dig, md->m_data+len, sizeof(pp->ds_dig));
 	       }
 	       // NOTE(review): with padding but no data digest the pad
 	       // bytes are not zeroed in this function - confirm.
 	       md->m_next = NULL;
 	       mh->m_pkthdr.len += md->m_len;
 	       *mp = md;
 	  }
      }
      if((error = sosend(sp->soc, NULL, NULL, mh, 0, 0, sp->td)) != 0) {
 	  sdebug(2, "error=%d", error);
 	  return error;
      }
      sp->stats.nsent++;
      getbintime(&sp->stats.t_sent);
      return 0;
 }
 #else /* NO_USE_MBUF */
 /*
  | uio based variant of isc_sendPDU (built when USE_MBUF is not defined).
  | describes the PDU with the iovec array in pq->iov, in this order:
  | iSCSI header, AHS (if any), header digest (if enabled), data segment
  | (length rounded up to a multiple of 4), data digest (if enabled),
  | and hands it to sosend(). If sosend() took only part of the data,
  | the iovecs are advanced and the rest is sent again.
  | returns 0 on success (and updates the sent statistics), the error
  | from sosend(), or EAGAIN when sosend() made no progress without
  | reporting an error.
  */
 int
 isc_sendPDU(isc_session_t *sp, pduq_t *pq)
 {
      struct uio *uio = &pq->uio;
      struct iovec *iv;
      pdu_t	*pp = &pq->pdu;
      int	len, error;
 
      debug_called(8);
 
      bzero(uio, sizeof(struct uio));
      uio->uio_rw = UIO_WRITE;
      uio->uio_segflg = UIO_SYSSPACE;
      uio->uio_td = sp->td;
      uio->uio_iov = iv = pq->iov;
 
      iv->iov_base = &pp->ipdu;
      iv->iov_len = sizeof(union ipdu_u);
      uio->uio_resid = iv->iov_len;
      iv++;
      if(ISOK2DIG(sp->hdrDigest, pp))
 	  pq->pdu.hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0);
      if(pp->ahs_len) {
 	  iv->iov_base = pp->ahs_addr;
 	  iv->iov_len = pp->ahs_len;
 	  uio->uio_resid += iv->iov_len;
 	  iv++;
 	  // the digest has to cover the AHS bytes that are being sent
 	  // (pp->ahs_addr), not the memory of the pointer field itself
 	  if(ISOK2DIG(sp->hdrDigest, pp))
 	       pp->hdr_dig = sp->hdrDigest(pp->ahs_addr, pp->ahs_len, pp->hdr_dig);
      }
      if(ISOK2DIG(sp->hdrDigest, pp)) {
 	  debug(3, "hdr_dig=%04x", htonl(pp->hdr_dig));
 	  iv->iov_base = &pp->hdr_dig;
 	  iv->iov_len = sizeof(int);
 	  uio->uio_resid += iv->iov_len ;
 	  iv++;
      }
      if(pq->pdu.ds_addr &&  pp->ds_len) {
 	  iv->iov_base = pp->ds_addr;
 	  iv->iov_len = pp->ds_len;
 	  while(iv->iov_len & 03) // the specs say it must be int aligned
 	       iv->iov_len++;
 	  // NOTE(review): the pad bytes are taken from behind the data at
 	  // ds_addr; presumably the buffer is big enough - confirm.
 	  uio->uio_resid += iv->iov_len ;
 	  iv++;
 	  if(ISOK2DIG(sp->dataDigest, pp)) {
 	       // digest over the same buffer that is put on the wire
 	       pp->ds_dig = sp->dataDigest(pp->ds_addr, pp->ds_len, 0);
 	       iv->iov_base = &pp->ds_dig;
 	       iv->iov_len = sizeof(pp->ds_dig);
 	       uio->uio_resid += iv->iov_len ;
 	       iv++;
 	  }
      }
      uio->uio_iovcnt = iv - pq->iov;
      sdebug(4, "pq->len=%d uio->uio_resid=%d  uio->uio_iovcnt=%d", pq->len,
 	    uio->uio_resid,
 	    uio->uio_iovcnt);
 
      sdebug(4, "opcode=%x iovcnt=%d uio_resid=%d itt=%x",
 	    pp->ipdu.bhs.opcode, uio->uio_iovcnt, uio->uio_resid,
 	    ntohl(pp->ipdu.bhs.itt));
      sdebug(5, "sp=%p sp->soc=%p uio=%p sp->td=%p",
 	    sp, sp->soc, uio, sp->td);
      do {
 	  len = uio->uio_resid;
 	  error = sosend(sp->soc, NULL, uio, 0, 0, 0, sp->td);
 	  // done when everything went out, on error, or on no progress
 	  if(uio->uio_resid == 0 || error || len == uio->uio_resid) {
 	       if(uio->uio_resid) {
 		    sdebug(2, "uio->uio_resid=%d uio->uio_iovcnt=%d error=%d len=%d",
 			   uio->uio_resid, uio->uio_iovcnt, error, len);
 		    if(error == 0)
 			 error = EAGAIN; // 35
 	       }
 	       break;
 	  }
 	  /*
 	   | XXX: untested code
 	   | partial send: len becomes the number of bytes that went out,
 	   | then the iovecs are advanced past those bytes.
 	   */
 	  sdebug(1, "uio->uio_resid=%d uio->uio_iovcnt=%d",
 		 uio->uio_resid, uio->uio_iovcnt);
 	  iv = uio->uio_iov;
 	  len -= uio->uio_resid;
 	  while(uio->uio_iovcnt > 0) {
 	       if(iv->iov_len > len) {
 		    caddr_t bp = (caddr_t)iv->iov_base;
 
 		    iv->iov_len -= len;
 		    iv->iov_base = (void *)&bp[len];
 		    break;
 	       }
 	       len -= iv->iov_len;
 	       uio->uio_iovcnt--;
 	       uio->uio_iov++;
 	       iv++;
 	  }
      } while(uio->uio_resid);
 
      if(error == 0) {
 	  sp->stats.nsent++;
 	  getbintime(&sp->stats.t_sent);
      }
 
      return error;
 }
 #endif /* USE_MBUF */
 
 /*
  | wait till a PDU header is received
  | from the socket.
  */
 /*
    The format of the BHS is:
 
    Byte/     0       |       1       |       2       |       3       |
       /              |               |               |               |
      |0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|
      +---------------+---------------+---------------+---------------+
     0|.|I| Opcode    |F|  Opcode-specific fields                     |
      +---------------+---------------+---------------+---------------+
     4|TotalAHSLength | DataSegmentLength                             |
      +---------------+---------------+---------------+---------------+
     8| LUN or Opcode-specific fields                                 |
      +                                                               +
    12|                                                               |
      +---------------+---------------+---------------+---------------+
    16| Initiator Task Tag                                            |
      +---------------+---------------+---------------+---------------+
    20/ Opcode-specific fields                                        /
     +/                                                               /
      +---------------+---------------+---------------+---------------+
    48
  */
 /*
  | read exactly one BHS (sizeof(bhs_t) bytes) from the session's
  | socket into sp->bhs, using the session's own uio/iovec.
  | returns 0 on success, the error from soreceive(), or EPIPE when
  | soreceive() reported no error but left part of the header unread.
  */
 static __inline int
 so_getbhs(isc_session_t *sp)
 {
      struct uio		*uio = &sp->uio;
      struct iovec	*iov = &sp->iov;
      int		rcvflags = MSG_WAITALL;
      int		error;
 
      debug_called(8);
 
      // a single iovec covering the session's BHS buffer
      iov->iov_base	= &sp->bhs;
      iov->iov_len	= sizeof(bhs_t);
 
      uio->uio_iov	= iov;
      uio->uio_iovcnt	= 1;
      uio->uio_resid	= sizeof(bhs_t);
      uio->uio_rw	= UIO_READ;
      uio->uio_segflg	= UIO_SYSSPACE;
      uio->uio_td	= curthread; // why ...
 
      error = soreceive(sp->soc, NULL, uio, 0, 0, &rcvflags);
      if(error) {
 	  debug(2, 
 #if __FreeBSD_version > 800000
 		"error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd",
 #else
 		"error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd",
 #endif
 		error,
 		sp->soc->so_error, uio->uio_resid, iov->iov_len);
 	  return error;
      }
      if(uio->uio_resid > 0) {
 	  // short read without an error
 	  error = EPIPE; // was EAGAIN
 	  debug(2,
 #if __FreeBSD_version > 800000
 		"error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd so_state=%x",
 #else
 		"error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd so_state=%x",
 #endif
 		error,
 		sp->soc->so_error, uio->uio_resid, iov->iov_len, sp->soc->so_state);
      }
      return error;
 }
 
 /*
  | so_recv gets called when 
  | an iSCSI header has been received.
  | Note: the designers had no intentions 
  |       in making programmer's life easy.
  |
  | reads what follows the BHS stored in pq: the AHS and the header
  | digest (into the iovecs of pq), then the data segment with its
  | padding and data digest (as an mbuf chain in pq->mp). verifies the
  | digests and updates the session's command window from the header.
  |
  | returns 0 on success, pq then stays with the caller.
  | on any error pq is handed to pdu_free() and must not be used by
  | the caller any more; the error is the one from soreceive(), EIO
  | (bad digest or bad command window), E2BIG (data segment longer
  | than maxRecvDataSegmentLength) or EPIPE (short read).
  */
 static int
 so_recv(isc_session_t *sp, pduq_t *pq)
 {
      sn_t		*sn = &sp->sn;
      struct uio		*uio = &pq->uio;
      pdu_t		*pp = &pq->pdu;
      bhs_t		*bhs = &pp->ipdu.bhs;
      struct iovec	*iov = pq->iov;
      int		error;
      u_int		len;
      u_int		max, exp;
      int		flags = MSG_WAITALL;
 
      debug_called(8);
      /*
       | now calculate how much data should be in the buffer
       */
      uio->uio_iov	= iov;
      uio->uio_iovcnt	= 0;
      len = 0;
      if(bhs->AHSLength) {
 	  debug(2, "bhs->AHSLength=%d", bhs->AHSLength);
 	  // AHSLength is multiplied by 4 to get the length in bytes
 	  pp->ahs_len = bhs->AHSLength * 4;
 	  len += pp->ahs_len;
 	  pp->ahs_addr = malloc(pp->ahs_len, M_TEMP, M_WAITOK); // XXX: could get stuck here
 	  iov->iov_base = pp->ahs_addr;
 	  iov->iov_len = pp->ahs_len;
 	  uio->uio_iovcnt++;
 	  iov++;
      }
      if(ISOK2DIG(sp->hdrDigest, pp)) {
 	  len += sizeof(pp->hdr_dig);
 	  iov->iov_base = &pp->hdr_dig;
 	  iov->iov_len = sizeof(pp->hdr_dig);
 	  uio->uio_iovcnt++;
      }
      if(len) {
 	  uio->uio_rw		= UIO_READ;
 	  uio->uio_segflg	= UIO_SYSSPACE;
 	  uio->uio_resid	= len;
 	  uio->uio_td		= sp->td; // why ...
 	  error = soreceive(sp->soc, NULL, uio, NULL, NULL, &flags);
 	  //if(error == EAGAIN)
 	  // XXX: this needs work! it hangs iscontrol
 	  if(error || uio->uio_resid) {
 	       debug(2, 
 #if __FreeBSD_version > 800000
 		     "len=%d error=%d uio->uio_resid=%zd",
 #else
 		     "len=%d error=%d uio->uio_resid=%d",
 #endif
 		     len, error, uio->uio_resid);
 	       goto out;
 	  }
 	  if(ISOK2DIG(sp->hdrDigest, pp)) {
 	       bhs_t	*bhs;
 	       u_int	digest;
 	       
 	       bhs = (bhs_t *)&pp->ipdu;
 	       digest = sp->hdrDigest(bhs, sizeof(bhs_t), 0);
 	       if(pp->ahs_len)
 		    digest = sp->hdrDigest(pp->ahs_addr, pp->ahs_len, digest);
 	       if(pp->hdr_dig != digest) {
 		    debug(2, "bad header digest: received=%x calculated=%x", pp->hdr_dig, digest);
 		    // XXX: now what?
 		    error = EIO;
 		    goto out;
 	       }
 	  }
 	  if(pp->ahs_len) {
 	       debug(2, "ahs len=%x type=%x spec=%x",
 		     pp->ahs_addr->len, pp->ahs_addr->type, pp->ahs_addr->spec);
 	       // XXX: till I figure out what to do with this
 	       free(pp->ahs_addr, M_TEMP);
 	       // ahs_len stays set (it is reported below), so the pointer
 	       // has to be cleared to keep the error path from freeing
 	       // the AHS a second time.
 	       pp->ahs_addr = NULL;
 	  }
 	  pq->len += len; // XXX: who needs this?
 	  bzero(uio, sizeof(struct uio));
 	  len = 0;
      }
 
      if(bhs->DSLength) {
 	  len = bhs->DSLength;
 #if BYTE_ORDER == LITTLE_ENDIAN
 	  // swap the first and the third byte of the 3 byte length
 	  len = ((len & 0x00ff0000) >> 16)
 	       | (len & 0x0000ff00)
 	       | ((len & 0x000000ff) << 16);
 #endif
 	  pp->ds_len = len;
 	  if((sp->opt.maxRecvDataSegmentLength > 0) && (len > sp->opt.maxRecvDataSegmentLength)) {
 	       xdebug("impossible PDU length(%d) opt.maxRecvDataSegmentLength=%d",
 		      len, sp->opt.maxRecvDataSegmentLength);
 	       log(LOG_ERR,
 		   "so_recv: impossible PDU length(%d) from iSCSI %s/%s\n",
 		   len, sp->opt.targetAddress, sp->opt.targetName);
 	       /*
 		| XXX: this will really screwup the stream.
 		| should clear up the buffer till a valid header
 		| is found, or just close connection ...
 		| should read the RFC.
 	        */
 	       error = E2BIG;
 	       goto out;
 	  }
 	  // the data segment is padded to a multiple of 4 bytes
 	  while(len & 03)
 	       len++;
 	  if(ISOK2DIG(sp->dataDigest, pp))
 	       len += 4;
 	  uio->uio_resid = len;
 	  uio->uio_td = sp->td; // why ...
 	  pq->len += len; // XXX: do we need this?
 	  // the data is returned as an mbuf chain in pq->mp
 	  error = soreceive(sp->soc, NULL, uio, &pq->mp, NULL, &flags);
 	  //if(error == EAGAIN)
 	  // XXX: this needs work! it hangs iscontrol
 	  if(error || uio->uio_resid)
 	       goto out;
           if(ISOK2DIG(sp->dataDigest, pp)) {
 	       struct mbuf *m;
 	       u_int    digest, ds_len, cnt;
 
 	       // get the received digest
 	       m_copydata(pq->mp,
 			  len - sizeof(pp->ds_dig),
 			  sizeof(pp->ds_dig),
 			  (caddr_t)&pp->ds_dig);
 	       // calculate all mbufs 
 	       digest = 0;
 	       ds_len = len - sizeof(pp->ds_dig);
 	       for(m = pq->mp; m != NULL; m = m->m_next) {
 		    cnt = MIN(ds_len, m->m_len);
 		    digest = sp->dataDigest(mtod(m, char *), cnt, digest);
 		    ds_len -= cnt;
 		    if(ds_len == 0)
 			 break;
 	       }
 	       if(digest != pp->ds_dig) {
 		    sdebug(1, "bad data digest: received=%x calculated=%x", pp->ds_dig, digest);
 		    error = EIO; // XXX: find a better error
 		    goto out;
 	       }
 	       KASSERT(ds_len == 0, ("ds_len not zero"));
 	  }
      }
      sdebug(6, "len=%d] opcode=0x%x ahs_len=0x%x ds_len=0x%x",
 	    pq->len, bhs->opcode, pp->ahs_len, pp->ds_len);
 
      max = ntohl(bhs->MaxCmdSN);
      exp = ntohl(bhs->ExpStSN);
      if(max < exp - 1 &&
 	max > exp - _MAXINCR) {
 	  sdebug(2,  "bad cmd window size");
 	  error = EIO; // XXX: for now;
 	  goto out; // error
      }
      if(SNA_GT(max, sn->maxCmd))
 	  sn->maxCmd = max;
      if(SNA_GT(exp, sn->expCmd))
 	  sn->expCmd = exp;
      /*
       | remove from the holding queue packets
       | that have been acked and don't need
       | further processing.
       */
      i_acked_hld(sp, NULL);
 
      sp->cws = sn->maxCmd - sn->expCmd + 1;
 
      return 0;
 
  out:
      // XXX: need some work here
      if(pp->ahs_len && pp->ahs_addr != NULL) {
 	  // XXX: till I figure out what to do with this
 	  // (only reached when the AHS was not released above)
 	  free(pp->ahs_addr, M_TEMP);
 	  pp->ahs_addr = NULL;
      }
      xdebug("have a problem, error=%d", error);
      // uio points into pq: look at it before pq is given back
      if(!error && uio->uio_resid > 0)
 	  error = EPIPE;
      pdu_free(sp->isc, pq);
      return error;
 }
 
 /*
  | receive one complete PDU: first the header via so_getbhs(), then
  | the remainder via so_recv(), and pass the result to ism_recv().
  | returns 0 on success, the error from so_getbhs() unchanged, or
  | the error from so_recv() with 0x800 added to it.
  */
 static int
 so_input(isc_session_t *sp)
 {
      pduq_t		*pq;
      int		error;
 
      debug_called(8);
      /*
       | first read in the iSCSI header
       */
      error = so_getbhs(sp);
      if(error != 0)
 	  return error;
 
      /*
       | now read the rest: try to get a pdu without sleeping,
       | and only wait for one if that did not work.
       */
      pq = pdu_alloc(sp->isc, M_NOWAIT); 
      if(pq == NULL) { // XXX: might cause a deadlock ...
 	  debug(2, "out of pdus, wait");
 	  pq = pdu_alloc(sp->isc, M_WAITOK);  // OK to WAIT
      }
      pq->pdu.ipdu.bhs = sp->bhs;
      pq->len = sizeof(bhs_t);	// so far only the header was read
 
      error = so_recv(sp, pq);
      if(error == 0) {
 	  sp->stats.nrecv++;
 	  getbintime(&sp->stats.t_recv);
 	  ism_recv(sp, pq);
 	  return 0;
      }
      // terminal error
      // XXX: close connection and exit
      return error + 0x800; // XXX: just to see the error.
 }
 
 /*
  | one per active (connected) session.
  | this thread is responsible for reading
  | in packets from the target.
  |
  | runs so_input() in a loop for as long as both ISC_CON_RUN and
  | ISC_LINK_UP are set and the socket is connected. when the loop
  | ends, the process in sp->proc is signaled (if a signal is
  | configured), ISC_CON_RUNNING and ISC_LINK_UP are cleared, sleepers
  | on &sp->soc are woken up and the thread exits.
  */
 static void
 isc_in(void *vp)
 {
      isc_session_t	*sp = (isc_session_t *)vp;
      // NOTE(review): so is read once here; the loop tests sp->soc for
      // NULL but dereferences this copy - confirm that sp->soc cannot
      // change while this thread is running.
      struct socket	*so = sp->soc;
      int		error;
 
      debug_called(8);
 
      sp->flags |= ISC_CON_RUNNING;
      error = 0;
      while((sp->flags & (ISC_CON_RUN | ISC_LINK_UP)) == (ISC_CON_RUN | ISC_LINK_UP)) {
 	  // XXX: hunting ...
 	  if(sp->soc == NULL || !(so->so_state & SS_ISCONNECTED)) {
 	       debug(2, "sp->soc=%p", sp->soc);
 	       break;
 	  }
 	  // so_input() adds 0x800 to errors coming from so_recv(), so the
 	  // EPIPE/EAGAIN tests below only match errors from so_getbhs();
 	  // any other error simply goes around the loop again.
 	  error = so_input(sp);
 	  if(error == 0) {
 	       // a PDU came in: wake up whoever sleeps on &sp->flags
 	       mtx_lock(&sp->io_mtx);
 	       if(sp->flags & ISC_OWAITING) {
 		    wakeup(&sp->flags);
 	       }
 	       mtx_unlock(&sp->io_mtx);
 	  } else if(error == EPIPE) {
 	       break;
 	  }
 	  else if(error == EAGAIN) {
 	       if(so->so_state & SS_ISCONNECTED) 
 		    // there seems to be a problem in 6.0 ...
 		    tsleep(sp, PRIBIO, "isc_soc", 2*hz);
 	  }
      }
      sdebug(2, "terminated, flags=%x so_count=%d so_state=%x error=%d proc=%p",
 	    sp->flags, so->so_count, so->so_state, error, sp->proc);
      if((sp->proc != NULL) && sp->signal) {
 	  // tell the process in sp->proc that the receiver is gone
 	  PROC_LOCK(sp->proc);
 	  kern_psignal(sp->proc, sp->signal);
 	  PROC_UNLOCK(sp->proc);
 	  sp->flags |= ISC_SIGNALED;
 	  sdebug(2, "pid=%d signaled(%d)", sp->proc->p_pid, sp->signal);
      }
      else {
 	  // we have to do something ourselves
 	  // like closing this session ...
      }
      /*
       | we've been terminated
       */
      // do we need this mutex ...?
      mtx_lock(&sp->io_mtx);
      sp->flags &= ~(ISC_CON_RUNNING | ISC_LINK_UP);
      // isc_stop_receiver() sleeps on &sp->soc
      wakeup(&sp->soc);
      mtx_unlock(&sp->io_mtx);
 
      sdebug(2, "dropped ISC_CON_RUNNING");
 #if __FreeBSD_version >= 800000
      kproc_exit(0);
 #else
      kthread_exit(0);
 #endif
 }
 
 /*
  | stop the receiver thread of a session and let go of its socket:
  | clears ISC_LINK_UP, shuts down the read side of the socket, clears
  | ISC_CON_RUN, waits (at most two sleeps of 5*hz) for ISC_CON_RUNNING
  | to go away, then drops the file and socket references and clears
  | sp->soc and sp->fp.
  */
 void
 isc_stop_receiver(isc_session_t *sp)
 {
      int	n;
 
      debug_called(8);
      // NOTE(review): sp is tested against NULL here, but dereferenced
      // unconditionally on the next line - confirm it can never be NULL.
      sdebug(3, "sp=%p sp->soc=%p", sp, sp? sp->soc: 0);
      mtx_lock(&sp->io_mtx);
      sp->flags &= ~ISC_LINK_UP;
      // sleeps on the channel isc_in() wakes up when it terminates;
      // PDROP: io_mtx is not held when msleep() returns, hence the
      // second mtx_lock() further down.
      msleep(&sp->soc, &sp->io_mtx, PRIBIO|PDROP, "isc_stpc", 5*hz);
 
      soshutdown(sp->soc, SHUT_RD);
 
      mtx_lock(&sp->io_mtx);
      sdebug(3, "soshutdown");
      sp->flags &= ~ISC_CON_RUN;
      n = 2;
      // NOTE(review): when ISC_CON_RUNNING is still set after both
      // sleeps the teardown below goes ahead anyway - confirm that
      // this is intended.
      while(n-- && (sp->flags & ISC_CON_RUNNING)) {
 	  sdebug(3, "waiting n=%d... flags=%x", n, sp->flags);
 	  msleep(&sp->soc, &sp->io_mtx, PRIBIO, "isc_stpc", 5*hz);
      }
      mtx_unlock(&sp->io_mtx);
 
      if(sp->fp != NULL)
 	  fdrop(sp->fp, sp->td);
      fputsock(sp->soc);
      sp->soc = NULL;
      sp->fp = NULL;
 
      sdebug(3, "done");
 }
 
 /*
  | mark the session as running with the link up, and create the
  | receiver (isc_in) for it; kproc_create() or kthread_create() is
  | used depending on __FreeBSD_version.
  | the result of the create call is not looked at.
  */
 void
 isc_start_receiver(isc_session_t *sp)
 {
      debug_called(8);
 
      sp->flags |= ISC_CON_RUN | ISC_LINK_UP;
 #if __FreeBSD_version >= 800000
      kproc_create(isc_in, sp, &sp->soc_proc, 0, 0, "isc_in %d", sp->sid);
 #else
      kthread_create(isc_in, sp, &sp->soc_proc, 0, 0, "isc_in %d", sp->sid);
 #endif
 }
Index: head/sys/dev/patm/if_patm_rx.c
===================================================================
--- head/sys/dev/patm/if_patm_rx.c	(revision 276691)
+++ head/sys/dev/patm/if_patm_rx.c	(revision 276692)
@@ -1,526 +1,526 @@
 /*-
  * Copyright (c) 2003
  *	Fraunhofer Institute for Open Communication Systems (FhG Fokus).
  * 	All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Author: Hartmut Brandt <harti@freebsd.org>
  *
  * Driver for IDT77252 based cards like ProSum's.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_natm.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/errno.h>
 #include <sys/conf.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/queue.h>
 #include <sys/condvar.h>
 #include <sys/endian.h>
 #include <vm/uma.h>
 
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_atm.h>
 #include <net/route.h>
 #ifdef ENABLE_BPF
 #include <net/bpf.h>
 #endif
 #include <netinet/in.h>
 #include <netinet/if_atm.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/mbpool.h>
 
 #include <dev/utopia/utopia.h>
 #include <dev/patm/idt77252reg.h>
 #include <dev/patm/if_patmvar.h>
 
 static void *patm_rcv_handle(struct patm_softc *sc, u_int handle);
 static void patm_rcv_free(struct patm_softc *, void *, u_int handle);
 static struct mbuf *patm_rcv_mbuf(struct patm_softc *, void *, u_int, int);
 
 /*
  * Write 'val' into SRAM at offset 'w' within the RCT entry of
  * connection 'cid'.
  */
 static __inline void
 rct_write(struct patm_softc *sc, u_int cid, u_int w, u_int val)
 {
 	const u_int addr = sc->mmap->rct + cid * IDT_RCT_ENTRY_SIZE + w;
 
 	patm_sram_write(sc, addr, val);
 }
 /*
  * Read the SRAM word at offset 'w' within the RCT entry of
  * connection 'cid'.
  */
 static __inline u_int
 rct_read(struct patm_softc *sc, u_int cid, u_int w)
 {
 	const u_int addr = sc->mmap->rct + cid * IDT_RCT_ENTRY_SIZE + w;
 
 	return (patm_sram_read(sc, addr));
 }
 
 /*
  * Check whether the receive side of this VCC can be opened.
  * No condition is checked here: the function unconditionally returns 0.
  */
 int
 patm_rx_vcc_can_open(struct patm_softc *sc, struct patm_vcc *vcc)
 {
 	const int error = 0;
 
 	return (error);
 }
 
 /*
  * Open the receive side of a VCC.
  *
  * The first RCT word is built from IDT_RCT_OPEN plus the bits selected by
  * the VCC's AAL. For cid 0 no RCT entry is written; instead the capture
  * bits in the configuration register are switched on. In both cases the
  * VCC is flagged PATM_VCC_RX_OPEN.
  */
 void
 patm_rx_vcc_open(struct patm_softc *sc, struct patm_vcc *vcc)
 {
 	uint32_t aal_bits = 0;
 	uint32_t rct0;
 
 	patm_debug(sc, VCC, "%u.%u RX opening", vcc->vcc.vpi, vcc->vcc.vci);
 
 	/* AAL specific bits; an unknown AAL contributes nothing */
 	if (vcc->vcc.aal == ATMIO_AAL_0)
 		aal_bits = IDT_RCT_AAL0 | IDT_RCT_FBP2 | IDT_RCT_RCI;
 	else if (vcc->vcc.aal == ATMIO_AAL_34)
 		aal_bits = IDT_RCT_AAL34;
 	else if (vcc->vcc.aal == ATMIO_AAL_5)
 		aal_bits = IDT_RCT_AAL5;
 	else if (vcc->vcc.aal == ATMIO_AAL_RAW)
 		aal_bits = IDT_RCT_AALRAW | IDT_RCT_RCI;
 
 	rct0 = IDT_RCT_OPEN | aal_bits;
 
 	if (vcc->cid == 0) {
 		/* switch the interface into promiscuous mode */
 		patm_nor_write(sc, IDT_NOR_CFG, patm_nor_read(sc, IDT_NOR_CFG) |
 		    IDT_CFG_ICAPT | IDT_CFG_VPECA);
 	} else {
 		/* initialise all four words of the connection's RCT entry */
 		patm_sram_write4(sc, sc->mmap->rct + vcc->cid *
 		    IDT_RCT_ENTRY_SIZE, rct0, 0, 0, 0xffffffff);
 	}
 
 	vcc->vflags |= PATM_VCC_RX_OPEN;
 }
 
 /*
  * Close the receive side of the given VCC.
  *
  * For cid 0 only the capture bits set by patm_rx_vcc_open() are switched
  * off again and PATM_VCC_RX_OPEN is cleared. For every other cid the
  * OPEN bit is cleared in the first RCT word, the idle count field is set
  * to 1, a scan of the connection is started and the VCC moves from
  * PATM_VCC_RX_OPEN to PATM_VCC_RX_CLOSING. The RSQ is then polled by
  * hand (see the comment near the end of the function).
  */
 void
 patm_rx_vcc_close(struct patm_softc *sc, struct patm_vcc *vcc)
 {
 	u_int w1;
 
 	patm_debug(sc, VCC, "%u.%u RX closing", vcc->vcc.vpi, vcc->vcc.vci);
 
 	if (vcc->cid == 0) {
 		/* switch off promiscuous mode */
 		patm_nor_write(sc, IDT_NOR_CFG, patm_nor_read(sc, IDT_NOR_CFG) &
 		    ~(IDT_CFG_ICAPT | IDT_CFG_VPECA));
 		vcc->vflags &= ~PATM_VCC_RX_OPEN;
 		return;
 	}
 
 	/* close the connection but keep state */
 	w1 = rct_read(sc, vcc->cid, 0);
 	w1 &= ~IDT_RCT_OPEN;
 	rct_write(sc, vcc->cid, 0, w1);
 
 	/* minimum idle count */
 	w1 = (w1 & ~IDT_RCT_IACT_CNT_MASK) | (1 << IDT_RCT_IACT_CNT_SHIFT);
 	rct_write(sc, vcc->cid, 0, w1);
 
 	/* initialize scan */
 	patm_nor_write(sc, IDT_NOR_IRCP, vcc->cid);
 
 	vcc->vflags &= ~PATM_VCC_RX_OPEN;
 	vcc->vflags |= PATM_VCC_RX_CLOSING;
 
 	/*
 	 * check the RSQ
 	 * This is a hack. The problem is, that although an entry is written
 	 * to the RSQ, no interrupt is generated. Also we must wait 1 cell
 	 * time for the SAR to process the scan of our connection.
 	 */
 	DELAY(1);
 	patm_intr_rsq(sc);
 }
 
 /*
  * Receive side finally closed.
  * Only emits a debug message; no state is changed here.
  */
 void
 patm_rx_vcc_closed(struct patm_softc *sc, struct patm_vcc *vcc)
 {
 	patm_debug(sc, VCC, "%u.%u RX finally closed",
 	    vcc->vcc.vpi, vcc->vcc.vci);
 }
 
 /*
  * Handle the given receive status queue entry.
  *
  * sc	driver softc
  * rsqe	the RSQ entry (little endian) to process
  *
  * An IDLE entry finishes a pending receive-side close. Any other entry
  * describes a received buffer: it is dropped if the VCC is unknown or not
  * open for receive, delivered as is for AAL0, dropped for AAL3/4 and
  * appended to the per-VCC reassembly chain for AAL5. A complete AAL5 PDU
  * (EPDU set) is trimmed to the length found in its trailer and handed to
  * atm_input().
  */
 void
 patm_rx(struct patm_softc *sc, struct idt_rsqe *rsqe)
 {
 	struct mbuf *m;
 	void *buf;
 	u_int stat, rcid, cid, w, cells, len, h;
 	struct patm_vcc *vcc;
 	struct atm_pseudohdr aph;
 	u_char *trail;
 
 	/*
 	 * Keep the raw RSQE connection word: the IDT_RSQE_VPI/VCI macros
 	 * are meant for it, not for the driver's own PATM_CID() index.
 	 */
 	rcid = le32toh(rsqe->cid);
 	stat = le32toh(rsqe->stat);
 	h = le32toh(rsqe->handle);
 
 	cid = PATM_CID(sc, IDT_RSQE_VPI(rcid), IDT_RSQE_VCI(rcid));
 	vcc = sc->vccs[cid];
 
 	if (IDT_RSQE_TYPE(stat) == IDT_RSQE_IDLE) {
 		/* connection has gone idle */
 		if (stat & IDT_RSQE_BUF)
 			patm_rcv_free(sc, patm_rcv_handle(sc, h), h);
 
 		/* wipe a stale, no longer open RCT entry */
 		w = rct_read(sc, cid, 0);
 		if (w != 0 && !(w & IDT_RCT_OPEN))
 			rct_write(sc, cid, 0, 0);
 		if (vcc != NULL && (vcc->vflags & PATM_VCC_RX_CLOSING)) {
 			patm_debug(sc, VCC, "%u.%u RX closed", vcc->vcc.vpi,
 			    vcc->vcc.vci);
 			vcc->vflags &= ~PATM_VCC_RX_CLOSING;
 			if (vcc->vcc.flags & ATMIO_FLAG_ASYNC) {
 				patm_rx_vcc_closed(sc, vcc);
 				if (!(vcc->vflags & PATM_VCC_OPEN))
 					patm_vcc_closed(sc, vcc);
 			} else
 				cv_signal(&sc->vcc_cv);
 		}
 		return;
 	}
 
 	/* from here on we own the buffer behind the handle */
 	buf = patm_rcv_handle(sc, h);
 
 	if (vcc == NULL || (vcc->vflags & PATM_VCC_RX_OPEN) == 0) {
 		patm_rcv_free(sc, buf, h);
 		return;
 	}
 
 	cells = IDT_RSQE_CNT(stat);
 	KASSERT(cells > 0, ("zero cell count"));
 
 	if (vcc->vcc.aal == ATMIO_AAL_0) {
 		/* deliver this packet as it is */
 		if ((m = patm_rcv_mbuf(sc, buf, h, 1)) == NULL)
 			return;
 
 		m->m_len = cells * 48;
 		m->m_pkthdr.len = m->m_len;
 		m->m_pkthdr.rcvif = sc->ifp;
 
 	} else if (vcc->vcc.aal == ATMIO_AAL_34) {
 		/* XXX AAL3/4 */
 		patm_rcv_free(sc, buf, h);
 		return;
 
 	} else if (vcc->vcc.aal == ATMIO_AAL_5) {
 		if (stat & IDT_RSQE_CRC) {
 			if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1);
 			/* don't leak the buffer of the damaged PDU */
 			patm_rcv_free(sc, buf, h);
 			if (vcc->chain != NULL) {
 				m_freem(vcc->chain);
 				vcc->chain = vcc->last = NULL;
 			}
 			return;
 		}
 
 		/* append to current chain */
 		if (vcc->chain == NULL) {
 			if ((m = patm_rcv_mbuf(sc, buf, h, 1)) == NULL)
 				return;
 			m->m_len = cells * 48;
 			m->m_pkthdr.len = m->m_len;
 			m->m_pkthdr.rcvif = sc->ifp;
 			vcc->chain = vcc->last = m;
 		} else {
 			if ((m = patm_rcv_mbuf(sc, buf, h, 0)) == NULL)
 				return;
 			m->m_len = cells * 48;
 			vcc->last->m_next = m;
 			vcc->last = m;
 			vcc->chain->m_pkthdr.len += m->m_len;
 		}
 
 		/* PDU not yet complete: wait for more buffers */
 		if (!(stat & IDT_RSQE_EPDU))
 			return;
 
 		/* 16 bit big endian length, 6 bytes before the PDU's end */
 		trail = mtod(m, u_char *) + m->m_len - 6;
 		len = (trail[0] << 8) + trail[1];
 
 		if ((u_int)vcc->chain->m_pkthdr.len < len + 8) {
 			/*
 			 * Report the length of the whole chain; m is the
 			 * last mbuf and need not carry a packet header.
 			 */
 			patm_printf(sc, "%s: bad aal5 lengths %u %u\n",
 			    __func__, (u_int)vcc->chain->m_pkthdr.len, len);
 			m_freem(vcc->chain);
 			vcc->chain = vcc->last = NULL;
 			return;
 		}
 		/* cut padding and trailer off the last mbuf */
 		m->m_len -= vcc->chain->m_pkthdr.len - len;
 		KASSERT(m->m_len >= 0, ("bad last mbuf"));
 
 		m = vcc->chain;
 		vcc->chain = vcc->last = NULL;
 		m->m_pkthdr.len = len;
 	} else
 		panic("bad aal");
 
 #if 0
 	{
 		u_int i;
 
 		for (i = 0; i < m->m_len; i++) {
 			printf("%02x ", mtod(m, u_char *)[i]);
 		}
 		printf("\n");
 	}
 #endif
 
 	if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1);
 	/* this is in if_atmsubr.c */
 	/* if_inc_counter(sc->ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); */
 
 	vcc->ibytes += m->m_pkthdr.len;
 	vcc->ipackets++;
 
 	/* VPI/VCI as reported by the hardware in the RSQE */
 	ATM_PH_FLAGS(&aph) = vcc->vcc.flags & 0xff;
 	ATM_PH_VPI(&aph) = IDT_RSQE_VPI(rcid);
 	ATM_PH_SETVCI(&aph, IDT_RSQE_VCI(rcid));
 
 #ifdef ENABLE_BPF
 	if (!(vcc->vcc.flags & ATMIO_FLAG_NG) &&
 	    (vcc->vcc.aal == ATMIO_AAL_5) &&
 	    (vcc->vcc.flags & ATM_PH_LLCSNAP))
 		BPF_MTAP(sc->ifp, m);
 #endif
 
 	atm_input(sc->ifp, &aph, m, vcc->rxhand);
 }
 
 /*
  * Get the buffer for a receive handle. This is either an mbuf for
  * a large handle or a pool buffer for the others.
  *
  * The bits of the handle outside MBUF_HMASK select the buffer kind, the
  * bits inside MBUF_HMASK index the large buffer array. In every case the
  * buffer is synced for BUS_DMASYNC_POSTREAD before it is returned.
  */
 static void *
 patm_rcv_handle(struct patm_softc *sc, u_int handle)
 {
 	void *buf;
 	u_int c;
 
 	if ((handle & ~MBUF_HMASK) == LMBUF_HANDLE) {
 		struct lmbuf *b;
 
 		c = handle & MBUF_HMASK;
 		b = &sc->lbufs[c];
 
 		/* detach the mbuf from its lmbuf slot before freeing the slot */
 		buf = b->m;
 		b->m = NULL;
 
 		bus_dmamap_sync(sc->lbuf_tag, b->map, BUS_DMASYNC_POSTREAD);
 		patm_lbuf_free(sc, b);
 
 	} else if ((handle & ~MBUF_HMASK) == MBUF_VHANDLE) {
 		/* buffer from the vbuf pool */
 		mbp_sync(sc->vbuf_pool, handle,
 		    0, VMBUF_SIZE, BUS_DMASYNC_POSTREAD);
 		buf = mbp_get(sc->vbuf_pool, handle);
 
 	} else {
 		/* everything else comes from the sbuf pool */
 		mbp_sync(sc->sbuf_pool, handle,
 		    0, SMBUF_SIZE, BUS_DMASYNC_POSTREAD);
 		buf = mbp_get(sc->sbuf_pool, handle);
 	}
 
 	return (buf);
 }
 
 /*
  * Give a receive buffer back. The kind bits of the handle decide whether
  * 'p' is an mbuf (large handle) or belongs to the vbuf or sbuf pool.
  */
 static void
 patm_rcv_free(struct patm_softc *sc, void *p, u_int handle)
 {
 	const u_int kind = handle & ~MBUF_HMASK;
 
 	if (kind == LMBUF_HANDLE) {
 		m_free((struct mbuf *)p);
 		return;
 	}
 	if (kind == MBUF_VHANDLE) {
 		mbp_free(sc->vbuf_pool, p);
 		return;
 	}
 	mbp_free(sc->sbuf_pool, p);
 }
 
 /*
  * Make an mbuf around the buffer.
  *
  * sc	driver softc
  * buf	buffer as returned by patm_rcv_handle()
  * h	the receive handle belonging to buf
  * hdr	non-zero if a packet header mbuf is wanted
  *
  * For a large handle the buffer already is an mbuf and is returned as is.
  * Otherwise a fresh mbuf is allocated and the pool buffer is attached to
  * it as external storage. Returns NULL on failure; the buffer has then
  * been freed.
  */
 static struct mbuf *
 patm_rcv_mbuf(struct patm_softc *sc, void *buf, u_int h, int hdr)
 {
 	struct mbuf *m;
 	int extflags;
 
 	if ((h & ~MBUF_HMASK) == MBUF_LHANDLE)
 		return ((struct mbuf *)buf);
 
 	if (hdr)
 		MGETHDR(m, M_NOWAIT, MT_DATA);
 	else
 		MGET(m, M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		patm_rcv_free(sc, buf, h);
 		return (NULL);
 	}
 
 	/*
 	 * The flags given to MEXTADD are or'ed into m_flags. Only ask for
 	 * M_PKTHDR when a header mbuf was allocated: an mbuf from MGET
 	 * has no initialised packet header and must not be marked as one.
 	 */
 	extflags = hdr ? M_PKTHDR : 0;
 
 	if ((h & ~MBUF_HMASK) == MBUF_VHANDLE) {
 		MEXTADD(m, (caddr_t)buf, VMBUF_SIZE, mbp_ext_free,
 		    buf, sc->vbuf_pool, extflags, EXT_NET_DRV);
 		m->m_data += VMBUF_OFFSET;
 	} else {
 		MEXTADD(m, (caddr_t)buf, SMBUF_SIZE, mbp_ext_free,
 		    buf, sc->sbuf_pool, extflags, EXT_NET_DRV);
 		m->m_data += SMBUF_OFFSET;
 	}
 
 	/* M_EXT stays clear if attaching the external storage failed */
 	if (!(m->m_flags & M_EXT)) {
 		patm_rcv_free(sc, buf, h);
 		m_free(m);
 		return (NULL);
 	}
 	return (m);
 }
 
 /*
  * Process the raw cell at the given address.
  *
  * The cell header is byte swapped in place and the VPI/VCI are extracted
  * from it. The cell goes to the matching VCC if that is open for receive
  * with AAL_RAW, else to the VCC at index 0 if that is open for receive;
  * otherwise it is counted and dropped. The cell is copied into a new
  * header mbuf in the format selected by the VCC's PATM_RAW_FORMAT flags
  * and handed to atm_input().
  */
 void
 patm_rx_raw(struct patm_softc *sc, u_char *cell)
 {
 	u_int vpi, vci, cid;
 	struct patm_vcc *vcc;
 	struct mbuf *m;
 	u_char *dst;
 	struct timespec ts;
 	struct atm_pseudohdr aph;
 	uint64_t cts;
 
 	sc->stats.raw_cells++;
 
 	/*
 	 * For some non-apparent reason the cell header
 	 * is in the wrong endian.
 	 */
 	*(uint32_t *)cell = bswap32(*(uint32_t *)cell);
 
 	vpi = ((cell[0] & 0xf) << 4) | ((cell[1] & 0xf0) >> 4);
 	vci = ((cell[1] & 0xf) << 12) | (cell[2] << 4) | ((cell[3] & 0xf0) >> 4);
 	cid = PATM_CID(sc, vpi, vci);
 
 	vcc = sc->vccs[cid];
 	if (vcc == NULL || !(vcc->vflags & PATM_VCC_RX_OPEN) ||
 	    vcc->vcc.aal != ATMIO_AAL_RAW) {
 		/* fall back to the VCC at index 0 */
 		vcc = sc->vccs[0];
 		if (vcc == NULL || !(vcc->vflags & PATM_VCC_RX_OPEN)) {
 			sc->stats.raw_no_vcc++;
 			return;
 		}
 	}
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		sc->stats.raw_no_buf++;
 		return;
 	}
 	m->m_pkthdr.rcvif = sc->ifp;
 
 	switch (vcc->vflags & PATM_RAW_FORMAT) {
 
 	  default:
 	  case PATM_RAW_CELL:
 		/* 4 byte header, zero HEC, 48 byte payload */
 		m->m_len = m->m_pkthdr.len = 53;
-		MH_ALIGN(m, 53);
+		M_ALIGN(m, 53);
 		dst = mtod(m, u_char *);
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = 0;		/* HEC */
 		bcopy(cell + 12, dst, 48);
 		break;
 
 	  case PATM_RAW_NOHEC:
 		/* 4 byte header, 48 byte payload */
 		m->m_len = m->m_pkthdr.len = 52;
-		MH_ALIGN(m, 52);
+		M_ALIGN(m, 52);
 		dst = mtod(m, u_char *);
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		bcopy(cell + 12, dst, 48);
 		break;
 
 	  case PATM_RAW_CS:
 		/* header, HEC, flags, 2 reserved, timestamp, payload */
 		m->m_len = m->m_pkthdr.len = 64;
-		MH_ALIGN(m, 64);
+		M_ALIGN(m, 64);
 		dst = mtod(m, u_char *);
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = *cell++;
 		*dst++ = 0;		/* HEC */
 		*dst++ = 0;		/* flags */
 		*dst++ = 0;		/* reserved */
 		*dst++ = 0;		/* reserved */
 		nanotime(&ts);
 		cts = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 		/* bcopy() is (src, dst, len): store the timestamp in the mbuf */
 		bcopy(&cts, dst, 8);
 		bcopy(cell + 12, dst + 8, 48);
 		break;
 	}
 
 	if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1);
 	/* this is in if_atmsubr.c */
 	/* if_inc_counter(sc->ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); */
 
 	vcc->ibytes += m->m_pkthdr.len;
 	vcc->ipackets++;
 
 	ATM_PH_FLAGS(&aph) = vcc->vcc.flags & 0xff;
 	ATM_PH_VPI(&aph) = vcc->vcc.vpi;
 	ATM_PH_SETVCI(&aph, vcc->vcc.vci);
 
 	atm_input(sc->ifp, &aph, m, vcc->rxhand);
 }
Index: head/sys/kern/uipc_mbuf.c
===================================================================
--- head/sys/kern/uipc_mbuf.c	(revision 276691)
+++ head/sys/kern/uipc_mbuf.c	(revision 276692)
@@ -1,2193 +1,2161 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mbuf_profiling.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
 
 int	max_linkhdr;
 int	max_protohdr;
 int	max_hdr;
 int	max_datalen;
 #ifdef MBUF_STRESS_TEST
 int	m_defragpackets;
 int	m_defragbytes;
 int	m_defraguseless;
 int	m_defragfailure;
 int	m_defragrandomfailures;
 #endif
 
 /*
  * sysctl(8) exported objects
  */
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
 	   &max_linkhdr, 0, "Size of largest link layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
 	   &max_protohdr, 0, "Size of largest protocol layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
 	   &max_hdr, 0, "Size of largest link plus protocol header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
 #ifdef MBUF_STRESS_TEST
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
 	   &m_defragpackets, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
 	   &m_defragbytes, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
 	   &m_defraguseless, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
 	   &m_defragfailure, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
 	   &m_defragrandomfailures, 0, "");
 #endif
 
 /*
  * Ensure the correct size of various mbuf parameters.  It could be off due
  * to compiler-induced padding and alignment artifacts.
  */
 CTASSERT(sizeof(struct mbuf) == MSIZE);
 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
 
 /*
  * m_get2() allocates the smallest mbuf that can hold "size" bytes:
  * a plain mbuf, an mbuf from the packet zone (up to MCLBYTES) or an
  * mbuf with a page sized jumbo cluster (up to MJUMPAGESIZE).
  * Returns NULL if size exceeds MJUMPAGESIZE or an allocation fails.
  */
 struct mbuf *
 m_get2(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *mb;
 	int fits_mbuf;
 
 	args.flags = flags;
 	args.type = type;
 
 	/* Does the request fit into the mbuf's own data area? */
 	fits_mbuf = size <= MHLEN ||
 	    (size <= MLEN && (flags & M_PKTHDR) == 0);
 	if (fits_mbuf)
 		return (uma_zalloc_arg(zone_mbuf, &args, how));
 
 	if (size <= MCLBYTES)
 		return (uma_zalloc_arg(zone_pack, &args, how));
 
 	if (size > MJUMPAGESIZE)
 		return (NULL);
 
 	mb = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (mb == NULL)
 		return (NULL);
 
 	/* The mbuf itself is the argument of the cluster allocation. */
 	if (uma_zalloc_arg(zone_jumbop, mb, how) == NULL) {
 		uma_zfree(zone_mbuf, mb);
 		return (NULL);
 	}
 
 	return (mb);
 }
 
 /*
  * m_getjcl() returns an mbuf with a cluster of the specified size attached.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  * MCLBYTES is delegated to m_getcl(). Returns NULL if either the mbuf
  * or the cluster cannot be allocated.
  */
 struct mbuf *
 m_getjcl(int how, short type, int flags, int size)
 {
 	struct mb_args args;
 	struct mbuf *mb;
 	uma_zone_t clzone;
 
 	if (size == MCLBYTES)
 		return m_getcl(how, type, flags);
 
 	args.flags = flags;
 	args.type = type;
 
 	mb = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (mb == NULL)
 		return (NULL);
 
 	/* The mbuf itself is the argument of the cluster allocation. */
 	clzone = m_getzone(size);
 	if (uma_zalloc_arg(clzone, mb, how) == NULL) {
 		uma_zfree(zone_mbuf, mb);
 		return (NULL);
 	}
 	return (mb);
 }
 
 /*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
  * to the existing one but still return the top of the newly allocated
  * chain.
  *
  * Arguments:
  *    m      Optional existing chain to append to (may be NULL).
  *    len    Number of bytes of storage wanted; must not be negative.
  *    how    Allocation wait flag, passed on to the allocators.
  *    type   mbuf type for the new mbufs.
  *    flags  Only M_PKTHDR and M_EOR are honoured.
  *
  * Returns:
  *    NULL if an allocation failed (everything allocated here is freed,
  *    "m" is left untouched), otherwise "m" if it was given, else the
  *    head of the new chain.  With len == 0 nothing is allocated.
  */
 struct mbuf *
 m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
 	struct mbuf *mb, *nm = NULL, *mtail = NULL;
 
 	KASSERT(len >= 0, ("%s: len is < 0", __func__));
 
 	/* Validate flags. */
 	flags &= (M_PKTHDR | M_EOR);
 
 	/* Packet header mbuf must be first in chain. */
 	if ((flags & M_PKTHDR) && m != NULL)
 		flags &= ~M_PKTHDR;
 
 	/* Loop and append maximum sized mbufs to the chain tail. */
 	while (len > 0) {
 		if (len > MCLBYTES)
 			mb = m_getjcl(how, type, (flags & M_PKTHDR),
 			    MJUMPAGESIZE);
 		else if (len >= MINCLSIZE)
 			mb = m_getcl(how, type, (flags & M_PKTHDR));
 		else if (flags & M_PKTHDR)
 			mb = m_gethdr(how, type);
 		else
 			mb = m_get(how, type);
 
 		/* Fail the whole operation if one mbuf can't be allocated. */
 		if (mb == NULL) {
 			if (nm != NULL)
 				m_freem(nm);
 			return (NULL);
 		}
 
 		/* Book keeping. */
 		len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
 			((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
 		if (mtail != NULL)
 			mtail->m_next = mb;
 		else
 			nm = mb;
 		mtail = mb;
 		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
 	}
 	/*
 	 * Only valid on the last mbuf.  With len == 0 nothing has been
 	 * allocated and mtail is still NULL, so there is nothing to mark.
 	 */
 	if ((flags & M_EOR) && mtail != NULL)
 		mtail->m_flags |= M_EOR;
 
 	/* If mbuf was supplied, append new chain to the end of it. */
 	if (m != NULL) {
 		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
 			;
 		mtail->m_next = nm;
 		mtail->m_flags &= ~M_EOR;
 	} else
 		m = nm;
 
 	return (m);
 }
 
 /*
  * Release a whole mbuf chain: every mbuf is handed to m_free(), which
  * returns the successor to continue with.  A NULL chain is a no-op.
  */
 void
 m_freem(struct mbuf *mb)
 {
 	struct mbuf *next;
 
 	for (; mb != NULL; mb = next)
 		next = m_free(mb);
 }
 
 /*-
  * Configure a provided mbuf to refer to the provided external storage
  * buffer and setup a reference count for said buffer.  If the setting
  * up of the reference count fails, the M_EXT bit will not be set.  If
  * successful, the M_EXT bit is set in the mbuf's flags.
  *
  * Arguments:
  *    mb     The existing mbuf to which to attach the provided buffer.
  *    buf    The address of the provided external storage buffer.
  *    size   The size of the provided buffer.
  *    freef  A pointer to a routine that is responsible for freeing the
  *           provided external storage buffer.
  *    arg1   First argument stored for the freef routine (may be NULL).
  *    arg2   Second argument stored for the freef routine (may be NULL).
  *    flags  Any other flags to be or'ed into the mbuf's flags.
  *    type   The type that the external storage buffer should be
  *           labeled with.  EXT_CLUSTER is not allowed.
  *    wait   Wait flag for the allocation of the reference counter.
  *
  * Returns:
  *    0 on success, ENOMEM if no reference counter is available.
  */
 int
 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
     void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2,
     int flags, int type, int wait)
 {
 	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
 	/*
 	 * For EXT_EXTREF no counter is allocated here; ext_cnt is used as
 	 * found in the mbuf, so the caller must have set it beforehand.
 	 */
 	if (type != EXT_EXTREF)
 		mb->m_ext.ext_cnt = uma_zalloc(zone_ext_refcnt, wait);
 
 	if (mb->m_ext.ext_cnt == NULL)
 		return (ENOMEM);
 
 	*(mb->m_ext.ext_cnt) = 1;
 	mb->m_flags |= (M_EXT | flags);
 	mb->m_ext.ext_buf = buf;
 	/* the data pointer starts at the beginning of the external buffer */
 	mb->m_data = mb->m_ext.ext_buf;
 	mb->m_ext.ext_size = size;
 	mb->m_ext.ext_free = freef;
 	mb->m_ext.ext_arg1 = arg1;
 	mb->m_ext.ext_arg2 = arg2;
 	mb->m_ext.ext_type = type;
 	mb->m_ext.ext_flags = 0;
 
 	return (0);
 }
 
 /*
  * Non-directly-exported function to clean up after mbufs with M_EXT
  * storage attached to them if the reference count hits 1.
  *
  * The reference on the external storage held by "m" is dropped; if it
  * was the last one the storage is released according to its ext_type.
  * Afterwards the mbuf itself is returned to zone_mbuf unless M_NOFREE
  * is set.  EXT_PACKET is the exception: mbuf and cluster go back to the
  * packet zone together.
  */
 void
 mb_free_ext(struct mbuf *m)
 {
 	int freembuf;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 
 	/*
 	 * Check if the header is embedded in the cluster.
 	 */
 	freembuf = (m->m_flags & M_NOFREE) ? 0 : 1;
 
 	switch (m->m_ext.ext_type) {
 	case EXT_SFBUF:
 		/* sf_bufs do their own reference counting */
 		sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
 		break;
 	default:
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		/* 
 		 * Free attached storage if this mbuf is the only
 		 * reference to it.
 		 */
 		/*
 		 * A count of 1 skips the atomic operation.  Otherwise the
 		 * count is decremented and, if other references remain,
 		 * only the mbuf itself is freed below.
 		 */
 		if (*(m->m_ext.ext_cnt) != 1) {
 			if (atomic_fetchadd_int(m->m_ext.ext_cnt, -1) != 1)
 				break;
 		}
 
 		switch (m->m_ext.ext_type) {
 		case EXT_PACKET:	/* The packet zone is special. */
 			/* the zone expects a count of 1: restore it */
 			if (*(m->m_ext.ext_cnt) == 0)
 				*(m->m_ext.ext_cnt) = 1;
 			uma_zfree(zone_pack, m);
 			return;		/* Job done. */
 		case EXT_CLUSTER:
 			uma_zfree(zone_clust, m->m_ext.ext_buf);
 			break;
 		case EXT_JUMBOP:
 			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
 			break;
 		case EXT_JUMBO9:
 			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
 			break;
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			break;
 		case EXT_NET_DRV:
 		case EXT_MOD_TYPE:
 		case EXT_DISPOSABLE:
 			/* free the counter that m_extadd() allocated */
 			*(m->m_ext.ext_cnt) = 0;
 			uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
 				m->m_ext.ext_cnt));
 			/* FALLTHROUGH */
 		case EXT_EXTREF:
 			/* the owner's free routine releases the storage */
 			KASSERT(m->m_ext.ext_free != NULL,
 				("%s: ext_free not set", __func__));
 			(*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1,
 			    m->m_ext.ext_arg2);
 			break;
 		default:
 			KASSERT(m->m_ext.ext_type == 0,
 				("%s: unknown ext_type", __func__));
 		}
 	}
 
 	if (freembuf)
 		uma_zfree(zone_mbuf, m);
 }
 
 /*
  * Attach the cluster from *m to *n, set up m_ext in *n
  * and bump the refcount of the cluster.
  *
  * "m" must have M_EXT set and "n" must not.  Besides M_EXT, "n" also
  * inherits M_RDONLY from "m".
  */
 static void
 mb_dupcl(struct mbuf *n, struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
 
 	switch (m->m_ext.ext_type) {
 	case EXT_SFBUF:
 		/* sf_bufs do their own reference counting */
 		sf_ext_ref(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
 		break;
 	default:
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		/* a count of 1 is bumped without an atomic operation */
 		if (*(m->m_ext.ext_cnt) == 1)
 			*(m->m_ext.ext_cnt) += 1;
 		else
 			atomic_add_int(m->m_ext.ext_cnt, 1);
 	}
 
 	/* share the whole external storage description */
 	n->m_ext = m->m_ext;
 	n->m_flags |= M_EXT;
 	n->m_flags |= m->m_flags & M_RDONLY;
 }
 
 /*
  * Strip packet headers and tags from an mbuf chain.
  * Processing starts at the first mbuf if "all" is set, otherwise at the
  * second one.  Of each processed mbuf's flags only M_EXT, M_RDONLY,
  * M_NOFREE and those named in "flags" survive.
  */
 void
 m_demote(struct mbuf *m0, int all, int flags)
 {
 	struct mbuf *m;
 
 	m = all ? m0 : m0->m_next;
 	while (m != NULL) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
 		    __func__, m, m0));
 		if (m->m_flags & M_PKTHDR) {
 			/* drop the tags first, then wipe the header */
 			m_tag_delete_chain(m, NULL);
 			m->m_flags &= ~M_PKTHDR;
 			bzero(&m->m_pkthdr, sizeof(struct pkthdr));
 		}
 		m->m_flags &= (M_EXT | M_RDONLY | M_NOFREE | flags);
 		m = m->m_next;
 	}
 }
 
 /*
  * Sanity checks on mbuf (chain) for use in KASSERT() and general
  * debugging.
  * Always returns 1 when it returns.  A failed check runs
  * M_SANITY_ACTION, which panics with INVARIANTS and only prints a
  * message without it.
  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
  * blow up later.  The pointer range checks run M_SANITY_ACTION
  * regardless of sanitize.
  */
 int
 m_sanity(struct mbuf *m0, int sanitize)
 {
 	struct mbuf *m;
 	caddr_t a, b;
 	int pktlen = 0;
 
 #ifdef INVARIANTS
 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
 #else
 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
 #endif
 
 	for (m = m0; m != NULL; m = m->m_next) {
 		/*
 		 * Basic pointer checks.  If any of these fails then some
 		 * unrelated kernel memory before or after us is trashed.
 		 * No way to recover from that.
 		 */
 		/* a = start, b = end of this mbuf's data storage */
 		a = ((m->m_flags & M_EXT) ? m->m_ext.ext_buf :
 			((m->m_flags & M_PKTHDR) ? (caddr_t)(&m->m_pktdat) :
 			 (caddr_t)(&m->m_dat)) );
 		b = (caddr_t)(a + (m->m_flags & M_EXT ? m->m_ext.ext_size :
 			((m->m_flags & M_PKTHDR) ? MHLEN : MLEN)));
 		if ((caddr_t)m->m_data < a)
 			M_SANITY_ACTION("m_data outside mbuf data range left");
 		if ((caddr_t)m->m_data > b)
 			M_SANITY_ACTION("m_data outside mbuf data range right");
 		if ((caddr_t)m->m_data + m->m_len > b)
 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
 
 		/* m->m_nextpkt may only be set on first mbuf in chain. */
 		if (m != m0 && m->m_nextpkt != NULL) {
 			if (sanitize) {
 				m_freem(m->m_nextpkt);
 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
 			} else
 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
 		}
 
 		/* packet length (not mbuf length!) calculation */
 		if (m0->m_flags & M_PKTHDR)
 			pktlen += m->m_len;
 
 		/* m_tags may only be attached to first mbuf in chain. */
 		if (m != m0 && m->m_flags & M_PKTHDR &&
 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
 			if (sanitize) {
 				m_tag_delete_chain(m, NULL);
 				/* put in 0xDEADC0DE perhaps? */
 			} else
 				M_SANITY_ACTION("m_tags on in-chain mbuf");
 		}
 
 		/* M_PKTHDR may only be set on first mbuf in chain */
 		if (m != m0 && m->m_flags & M_PKTHDR) {
 			if (sanitize) {
 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 				m->m_flags &= ~M_PKTHDR;
 				/* put in 0xDEADCODE and leave hdr flag in */
 			} else
 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
 		}
 	}
 	/*
 	 * Compare the summed length with the packet header.  pktlen is
 	 * only accumulated for chains with M_PKTHDR, and a sum of 0
 	 * skips the check.
 	 */
 	m = m0;
 	if (pktlen && pktlen != m->m_pkthdr.len) {
 		if (sanitize)
 			m->m_pkthdr.len = 0;
 		else
 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
 	}
 	return 1;
 
 #undef	M_SANITY_ACTION
 }
 
 
 /*
  * "Move" mbuf pkthdr from "from" to "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  *
  * The tag list moves along with the header: afterwards "from" has an
  * empty tag list and no longer carries M_PKTHDR.
  */
 void
 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
 {
 
 #if 0
 	/* see below for why these are not enabled */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
 	    ("m_move_pkthdr: to has tags"));
 #endif
 #ifdef MAC
 	/*
 	 * XXXMAC: It could be this should also occur for non-MAC?
 	 */
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	/* take the copyable flags of "from", keep only M_EXT of "to" */
 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
 	/* without external storage the data now starts behind the pkthdr */
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
 	from->m_flags &= ~M_PKTHDR;
 }
 
 /*
  * Duplicate "from"'s mbuf pkthdr in "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  * In particular, this does a deep copy of the packet tags.
  *
  * "how" is the wait flag used for copying the tags.  The return value
  * is that of m_tag_copy_chain(); "from" is left unchanged.
  */
 int
 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
 {
 
 #if 0
 	/*
 	 * The mbuf allocator only initializes the pkthdr
 	 * when the mbuf is allocated with m_gethdr(). Many users
 	 * (e.g. m_copy*, m_prepend) use m_get() and then
 	 * smash the pkthdr as needed causing these
 	 * assertions to trip.  For now just disable them.
 	 */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
 #endif
 	MBUF_CHECKSLEEP(how);
 #ifdef MAC
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	/* take the copyable flags of "from", keep only M_EXT of "to" */
 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
 	/* without external storage the data now starts behind the pkthdr */
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;
 	/* the struct copy shared the tag list head: start with an empty one */
 	SLIST_INIT(&to->m_pkthdr.tags);
 	return (m_tag_copy_chain(to, from, how));
 }
 
 /*
  * Lesser-used path for M_PREPEND:
  * allocate new mbuf to prepend to chain,
  * copy junk along.
  *
  * Returns the new head of the chain, whose m_len is set to "len".
  * If no mbuf can be allocated the whole chain "m" is freed and NULL
  * is returned.
  */
 struct mbuf *
 m_prepend(struct mbuf *m, int len, int how)
 {
 	struct mbuf *mn;
 
 	/* the new head is of the same kind (header or not) as the old one */
 	if (m->m_flags & M_PKTHDR)
 		mn = m_gethdr(how, m->m_type);
 	else
 		mn = m_get(how, m->m_type);
 	if (mn == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	if (m->m_flags & M_PKTHDR)
 		m_move_pkthdr(mn, m);
 	mn->m_next = m;
 	m = mn;
 	/* align the new mbuf's data only if "len" is below its capacity */
-	if(m->m_flags & M_PKTHDR) {
-		if (len < MHLEN)
-			MH_ALIGN(m, len);
-	} else {
-		if (len < MLEN)
-			M_ALIGN(m, len);
-	}
+	if (len < M_SIZE(m))
+		M_ALIGN(m, len);
 	m->m_len = len;
 	return (m);
 }
 
 /*
  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  *
  * Returns the head of the copy, or NULL if an mbuf or the packet header
  * copy could not be allocated; a partial copy is freed in that case.
  */
 struct mbuf *
 m_copym(struct mbuf *m, int off0, int len, int wait)
 {
 	struct mbuf *n, **np;
 	int off = off0;
 	struct mbuf *top;
 	int copyhdr = 0;
 
 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
 	MBUF_CHECKSLEEP(wait);
 	/* the packet header is only copied when copying from the start */
 	if (off == 0 && m->m_flags & M_PKTHDR)
 		copyhdr = 1;
 	/* skip the mbufs that lie completely before the offset */
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	/* np always points at the link that receives the next new mbuf */
 	np = &top;
 	top = 0;
 	while (len > 0) {
 		if (m == NULL) {
 			KASSERT(len == M_COPYALL,
 			    ("m_copym, length > size of mbuf chain"));
 			break;
 		}
 		if (copyhdr)
 			n = m_gethdr(wait, m->m_type);
 		else
 			n = m_get(wait, m->m_type);
 		*np = n;
 		if (n == NULL)
 			goto nospace;
 		if (copyhdr) {
 			if (!m_dup_pkthdr(n, m, wait))
 				goto nospace;
 			if (len == M_COPYALL)
 				n->m_pkthdr.len -= off0;
 			else
 				n->m_pkthdr.len = len;
 			copyhdr = 0;
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
 			/* share the external storage instead of copying it */
 			n->m_data = m->m_data + off;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
 		/* M_COPYALL is never decremented: the loop ends at m == NULL */
 		if (len != M_COPYALL)
 			len -= n->m_len;
 		off = 0;
 		m = m->m_next;
 		np = &n->m_next;
 	}
 
 	return (top);
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Returns mbuf chain with new head for the prepending case.
  * Copies from mbuf (chain) n from off for len to mbuf (chain) m
  * either prepending or appending the data.
  * The resulting mbuf (chain) m is fully writeable.
  * m is destination (is made writeable)
  * n is source, off is offset in source, len is len from offset
  * dir, 0 append, 1 prepend
  * how, wait or nowait
  */
 
 static int
 m_bcopyxxx(void *s, void *t, u_int len)
 {
 	bcopy(s, t, (size_t)len);
 	return 0;
 }
 
 struct mbuf *
 m_copymdata(struct mbuf *m, struct mbuf *n, int off, int len,
     int prep, int how)
 {
 	struct mbuf *mm, *x, *z, *prev = NULL;
 	caddr_t p;
 	int i, nlen = 0;
 	caddr_t buf[MLEN];
 
 	KASSERT(m != NULL && n != NULL, ("m_copymdata, no target or source"));
 	KASSERT(off >= 0, ("m_copymdata, negative off %d", off));
 	KASSERT(len >= 0, ("m_copymdata, negative len %d", len));
 	KASSERT(prep == 0 || prep == 1, ("m_copymdata, unknown direction %d", prep));
 
 	mm = m;
 	if (!prep) {
 		while(mm->m_next) {
 			prev = mm;
 			mm = mm->m_next;
 		}
 	}
 	for (z = n; z != NULL; z = z->m_next)
 		nlen += z->m_len;
 	if (len == M_COPYALL)
 		len = nlen - off;
 	if (off + len > nlen || len < 1)
 		return NULL;
 
 	if (!M_WRITABLE(mm)) {
 		/* XXX: Use proper m_xxx function instead. */
 		x = m_getcl(how, MT_DATA, mm->m_flags);
 		if (x == NULL)
 			return NULL;
 		bcopy(mm->m_ext.ext_buf, x->m_ext.ext_buf, x->m_ext.ext_size);
 		p = x->m_ext.ext_buf + (mm->m_data - mm->m_ext.ext_buf);
 		x->m_data = p;
 		mm->m_next = NULL;
 		if (mm != m)
 			prev->m_next = x;
 		m_free(mm);
 		mm = x;
 	}
 
 	/*
 	 * Append/prepend the data.  Allocating mbufs as necessary.
 	 */
 	/* Shortcut if enough free space in first/last mbuf. */
 	if (!prep && M_TRAILINGSPACE(mm) >= len) {
 		m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t) +
 			 mm->m_len);
 		mm->m_len += len;
 		mm->m_pkthdr.len += len;
 		return m;
 	}
 	if (prep && M_LEADINGSPACE(mm) >= len) {
 		mm->m_data = mtod(mm, caddr_t) - len;
 		m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t));
 		mm->m_len += len;
 		mm->m_pkthdr.len += len;
 		return mm;
 	}
 
 	/* Expand first/last mbuf to cluster if possible. */
 	if (!prep && !(mm->m_flags & M_EXT) && len > M_TRAILINGSPACE(mm)) {
 		bcopy(mm->m_data, &buf, mm->m_len);
 		m_clget(mm, how);
 		if (!(mm->m_flags & M_EXT))
 			return NULL;
 		bcopy(&buf, mm->m_ext.ext_buf, mm->m_len);
 		mm->m_data = mm->m_ext.ext_buf;
 	}
 	if (prep && !(mm->m_flags & M_EXT) && len > M_LEADINGSPACE(mm)) {
 		bcopy(mm->m_data, &buf, mm->m_len);
 		m_clget(mm, how);
 		if (!(mm->m_flags & M_EXT))
 			return NULL;
 		bcopy(&buf, (caddr_t *)mm->m_ext.ext_buf +
 		    mm->m_ext.ext_size - mm->m_len, mm->m_len);
 		mm->m_data = (caddr_t)mm->m_ext.ext_buf +
 		    mm->m_ext.ext_size - mm->m_len;
 	}
 
 	/* Append/prepend as many mbuf (clusters) as necessary to fit len. */
 	if (!prep && len > M_TRAILINGSPACE(mm)) {
 		if (!m_getm(mm, len - M_TRAILINGSPACE(mm), how, MT_DATA))
 			return NULL;
 	}
 	if (prep && len > M_LEADINGSPACE(mm)) {
 		if (!(z = m_getm(NULL, len - M_LEADINGSPACE(mm), how, MT_DATA)))
 			return NULL;
 		i = 0;
 		for (x = z; x != NULL; x = x->m_next) {
 			i += x->m_flags & M_EXT ? x->m_ext.ext_size :
 			    (x->m_flags & M_PKTHDR ? MHLEN : MLEN);
 			if (!x->m_next)
 				break;
 		}
 		z->m_data += i - len;
 		m_move_pkthdr(mm, z);
 		x->m_next = mm;
 		mm = z;
 	}
 
 	/* Seek to start position in source mbuf. Optimization for long chains. */
 	while (off > 0) {
 		if (off < n->m_len)
 			break;
 		off -= n->m_len;
 		n = n->m_next;
 	}
 
 	/* Copy data into target mbuf. */
 	z = mm;
 	while (len > 0) {
 		KASSERT(z != NULL, ("m_copymdata, falling off target edge"));
 		i = M_TRAILINGSPACE(z);
 		m_apply(n, off, i, m_bcopyxxx, mtod(z, caddr_t) + z->m_len);
 		z->m_len += i;
 		/* fixup pkthdr.len if necessary */
 		if ((prep ? mm : m)->m_flags & M_PKTHDR)
 			(prep ? mm : m)->m_pkthdr.len += i;
 		off += i;
 		len -= i;
 		z = z->m_next;
 	}
 	return (prep ? mm : m);
 }
 
 /*
  * Duplicate a whole packet, header included (the first mbuf must carry
  * one).  This is the fast path for `m_copym(m, 0, M_COPYALL, how)'.
  * External storage is shared through mb_dupcl() rather than copied, so
  * the duplicate is read-only.  The first mbuf keeps the same data offset
  * as the original, so room the creator reserved in front of the data
  * (e.g. for protocol headers) is also available in the copy.
  */
 struct mbuf *
 m_copypacket(struct mbuf *m, int how)
 {
 	struct mbuf *head, *tail, *fresh;

 	MBUF_CHECKSLEEP(how);
 	head = tail = m_get(how, m->m_type);
 	if (tail == NULL)
 		goto nospace;

 	if (!m_dup_pkthdr(tail, m, how))
 		goto nospace;
 	tail->m_len = m->m_len;
 	if ((m->m_flags & M_EXT) == 0) {
 		/* Mirror the original's offset inside the inline buffer. */
 		tail->m_data = tail->m_pktdat + (m->m_data - m->m_pktdat );
 		bcopy(mtod(m, char *), mtod(tail, char *), tail->m_len);
 	} else {
 		tail->m_data = m->m_data;
 		mb_dupcl(tail, m);
 	}

 	/* Remaining mbufs: one new mbuf per original. */
 	for (m = m->m_next; m != NULL; m = m->m_next) {
 		fresh = m_get(how, m->m_type);
 		if (fresh == NULL)
 			goto nospace;

 		tail->m_next = fresh;
 		tail = fresh;

 		tail->m_len = m->m_len;
 		if ((m->m_flags & M_EXT) == 0) {
 			bcopy(mtod(m, char *), mtod(tail, char *), tail->m_len);
 		} else {
 			tail->m_data = m->m_data;
 			mb_dupcl(tail, m);
 		}
 	}
 	return head;

 nospace:
 	m_freem(head);
 	return (NULL);
 }
 
 /*
  * Flatten part of an mbuf chain into a caller-supplied buffer: "len"
  * bytes, beginning "off" bytes from the start of the chain, are copied
  * to "cp".  The chain must be long enough to satisfy the request.
  */
 void
 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	u_int count;

 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));

 	/* Find the mbuf that holds the first requested byte. */
 	for (; off > 0; off -= m->m_len, m = m->m_next) {
 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 	}

 	/* Only the first mbuf visited is read at a non-zero offset. */
 	for (; len > 0; off = 0, m = m->m_next) {
 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		bcopy(mtod(m, caddr_t) + off, cp, count);
 		cp += count;
 		len -= count;
 	}
 }
 
 /*
  * Copy a packet header mbuf chain into a completely new chain, including
  * copying any mbuf clusters.  Use this instead of m_copypacket() when
  * you need a writable copy of an mbuf chain.
  *
  * Returns NULL when m is NULL or when an allocation fails; in the latter
  * case whatever was built so far is freed.  The original chain is never
  * modified.
  */
 struct mbuf *
 m_dup(struct mbuf *m, int how)
 {
 	struct mbuf **p, *top = NULL;
 	int remain, moff, nsize;

 	MBUF_CHECKSLEEP(how);
 	/* Sanity check */
 	if (m == NULL)
 		return (NULL);
 	M_ASSERTPKTHDR(m);

 	/* While there's more data, get a new mbuf, tack it on, and fill it */
 	remain = m->m_pkthdr.len;	/* bytes still to be copied */
 	moff = 0;			/* read offset inside the current source mbuf */
 	p = &top;			/* where the next new mbuf gets linked */
 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
 		struct mbuf *n;

 		/* Get the next new mbuf */
 		/* nsize tracks how many bytes the new mbuf can hold. */
 		if (remain >= MINCLSIZE) {
 			n = m_getcl(how, m->m_type, 0);
 			nsize = MCLBYTES;
 		} else {
 			n = m_get(how, m->m_type);
 			nsize = MLEN;
 		}
 		if (n == NULL)
 			goto nospace;

 		if (top == NULL) {		/* First one, must be PKTHDR */
 			/* n is not linked into top yet, so free it by hand. */
 			if (!m_dup_pkthdr(n, m, how)) {
 				m_free(n);
 				goto nospace;
 			}
 			/* A header mbuf without a cluster holds only MHLEN. */
 			if ((n->m_flags & M_EXT) == 0)
 				nsize = MHLEN;
 		}
 		n->m_len = 0;

 		/* Link it into the new chain */
 		*p = n;
 		p = &n->m_next;

 		/* Copy data from original mbuf(s) into new mbuf */
 		while (n->m_len < nsize && m != NULL) {
 			int chunk = min(nsize - n->m_len, m->m_len - moff);

 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
 			moff += chunk;
 			n->m_len += chunk;
 			remain -= chunk;
 			/* Source mbuf exhausted: move on to the next one. */
 			if (moff == m->m_len) {
 				m = m->m_next;
 				moff = 0;
 			}
 		}

 		/* Check correct total mbuf length */
 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
 		    	("%s: bogus m_pkthdr.len", __func__));
 	}
 	return (top);

 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Concatenate mbuf chain n to m.
  * Both chains must be of the same type (e.g. MT_DATA).
  * Any m_pkthdr is not updated.
  */
 void
 m_cat(struct mbuf *m, struct mbuf *n)
 {
 	while (m->m_next)
 		m = m->m_next;
 	while (n) {
 		if (!M_WRITABLE(m) ||
 		    M_TRAILINGSPACE(m) < n->m_len) {
 			/* just join the two chains */
 			m->m_next = n;
 			return;
 		}
 		/* splat the data from one into the other */
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		    (u_int)n->m_len);
 		m->m_len += n->m_len;
 		n = m_free(n);
 	}
 }
 
 /*
  * Join two packets: the packet length of n is added to m's header,
  * n is passed through m_demote(), and the data is then concatenated
  * with m_cat().  Both chains must start with a packet header.
  */
 void
 m_catpkt(struct mbuf *m, struct mbuf *n)
 {
 	int appended;

 	M_ASSERTPKTHDR(m);
 	M_ASSERTPKTHDR(n);

 	/* Read n's length before m_demote() touches n. */
 	appended = n->m_pkthdr.len;
 	m->m_pkthdr.len += appended;

 	m_demote(n, 1, 0);
 	m_cat(m, n);
 }
 
 /*
  * Trim data from an mbuf chain.  A non-negative req_len removes that
  * many bytes from the front; a negative req_len removes -req_len bytes
  * from the end.  The packet header length, when present on the first
  * mbuf, is kept in step.
  */
 void
 m_adj(struct mbuf *mp, int req_len)
 {
 	struct mbuf *cur, *last;
 	int todo, total;

 	if (mp == NULL)
 		return;

 	if (req_len >= 0) {
 		/*
 		 * Trim from head: empty whole mbufs, then advance the
 		 * data pointer of the one that holds the cut point.
 		 */
 		todo = req_len;
 		cur = mp;
 		while (cur != NULL && todo > 0) {
 			if (cur->m_len > todo) {
 				cur->m_len -= todo;
 				cur->m_data += todo;
 				todo = 0;
 			} else {
 				todo -= cur->m_len;
 				cur->m_len = 0;
 				cur = cur->m_next;
 			}
 		}
 		/* Only what was actually removed comes off the header. */
 		if (mp->m_flags & M_PKTHDR)
 			mp->m_pkthdr.len -= (req_len - todo);
 		return;
 	}

 	/*
 	 * Trim from tail.  Measure the chain and find its last mbuf;
 	 * when the cut fits inside that mbuf just shorten it.
 	 */
 	todo = -req_len;
 	total = 0;
 	for (last = mp; ; last = last->m_next) {
 		total += last->m_len;
 		if (last->m_next == (struct mbuf *)0)
 			break;
 	}
 	if (last->m_len >= todo) {
 		last->m_len -= todo;
 		if (mp->m_flags & M_PKTHDR)
 			mp->m_pkthdr.len -= todo;
 		return;
 	}

 	/*
 	 * The cut spans several mbufs.  "total" becomes the length the
 	 * chain should have; walk from the front to the mbuf holding the
 	 * last surviving byte, shorten it and free everything after it.
 	 */
 	total -= todo;
 	if (total < 0)
 		total = 0;
 	if (mp->m_flags & M_PKTHDR)
 		mp->m_pkthdr.len = total;
 	for (cur = mp; cur != NULL; cur = cur->m_next) {
 		if (cur->m_len < total) {
 			total -= cur->m_len;
 			continue;
 		}
 		cur->m_len = total;
 		if (cur->m_next != NULL) {
 			m_freem(cur->m_next);
 			cur->m_next = NULL;
 		}
 		break;
 	}
 }
 
 /*
  * Rearrange an mbuf chain so that len bytes are contiguous
  * and in the data area of an mbuf (so that mtod will work
  * for a structure of size len).  Returns the resulting
  * mbuf chain on success, frees it and returns null on failure.
  * If there is room, it will add up to max_protohdr-len extra bytes to the
  * contiguous region in an attempt to avoid being called next time.
  */
 struct mbuf *
 m_pullup(struct mbuf *n, int len)
 {
 	struct mbuf *m;		/* mbuf that ends up holding the contiguous bytes */
 	int count;
 	int space;

 	/*
 	 * If first mbuf has no cluster, and has room for len bytes
 	 * without shifting current data, pullup into it,
 	 * otherwise allocate a new mbuf to prepend to the chain.
 	 */
 	if ((n->m_flags & M_EXT) == 0 &&
 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
 		/* Already contiguous: nothing to do. */
 		if (n->m_len >= len)
 			return (n);
 		m = n;
 		n = n->m_next;
 		/* From here on len counts only the bytes still missing. */
 		len -= m->m_len;
 	} else {
 		/* A fresh mbuf cannot hold more than MHLEN bytes. */
 		if (len > MHLEN)
 			goto bad;
 		m = m_get(M_NOWAIT, n->m_type);
 		if (m == NULL)
 			goto bad;
 		if (n->m_flags & M_PKTHDR)
 			m_move_pkthdr(m, n);
 	}
 	/* Free bytes between the end of m's data and the end of its buffer. */
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		/*
 		 * Take at least len bytes (more, up to max_protohdr, when
 		 * available), bounded by the room in m and the data in n.
 		 */
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		  (u_int)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		/* Advance past the moved bytes, or drop n once it is empty. */
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	/* The chain ran out before len bytes could be gathered. */
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
 bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Like m_pullup(), except a new mbuf is always allocated, and we allow
  * the amount of empty space before the data in the new mbuf to be specified
  * (in the event that the caller expects to prepend later).
  *
  * Returns the new chain head on success.  On failure the chain n is
  * freed, MSFail is incremented and NULL is returned.
  */
 int MSFail;	/* counts m_copyup() failures */

 struct mbuf *
 m_copyup(struct mbuf *n, int len, int dstoff)
 {
 	struct mbuf *m;
 	int count, space;

 	/* len bytes plus the dstoff gap must fit in one header mbuf. */
 	if (len > (MHLEN - dstoff))
 		goto bad;
 	m = m_get(M_NOWAIT, n->m_type);
 	if (m == NULL)
 		goto bad;
 	if (n->m_flags & M_PKTHDR)
 		m_move_pkthdr(m, n);
 	/* Leave dstoff bytes of room in front of the data. */
 	m->m_data += dstoff;
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		/*
 		 * Take at least len bytes (more, up to max_protohdr, when
 		 * available), bounded by the room in m and the data in n.
 		 */
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
 		    (unsigned)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		/* Advance past the moved bytes, or drop n once it is empty. */
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	/* The chain ran out before len bytes could be gathered. */
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
  bad:
 	m_freem(n);
 	MSFail++;
 	return (NULL);
 }
 
 /*
  * Partition an mbuf chain in two pieces, returning the tail --
  * all but the first len0 bytes.  In case of failure, it returns NULL and
  * attempts to restore the chain to its original state.
  *
  * Note that the resulting mbufs might be read-only, because the new
  * mbuf can end up sharing an mbuf cluster with the original mbuf if
  * the "breaking point" happens to lie within a cluster mbuf. Use the
  * M_WRITABLE() macro to check for this case.
  */
 struct mbuf *
 m_split(struct mbuf *m0, int len0, int wait)
 {
 	struct mbuf *m, *n;
 	u_int len = len0, remain;

 	MBUF_CHECKSLEEP(wait);
 	/* Find the mbuf m containing the split point; len becomes its offset. */
 	for (m = m0; m && len > m->m_len; m = m->m_next)
 		len -= m->m_len;
 	if (m == NULL)
 		return (NULL);
 	/* Bytes of m that belong to the tail. */
 	remain = m->m_len - len;
 	if (m0->m_flags & M_PKTHDR && remain == 0) {
 		/*
 		 * Split falls on an mbuf boundary: the tail only needs an
 		 * empty header mbuf in front of the remaining mbufs.
 		 */
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_next = m->m_next;
 		m->m_next = NULL;
 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		return (n);
 	} else if (m0->m_flags & M_PKTHDR) {
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		if (m->m_flags & M_EXT)
 			goto extpacket;
 		if (remain > MHLEN) {
 			/* m can't be the lead packet */
 			/*
 			 * The tail data does not fit in a header mbuf: keep
 			 * n as an empty header and split m recursively.
 			 */
-			MH_ALIGN(n, 0);
+			M_ALIGN(n, 0);
 			n->m_next = m_split(m, len, wait);
 			if (n->m_next == NULL) {
 				(void) m_free(n);
 				return (NULL);
 			} else {
 				n->m_len = 0;
 				return (n);
 			}
 		} else
-			MH_ALIGN(n, remain);
+			M_ALIGN(n, remain);
 	} else if (remain == 0) {
 		/* No header and a clean boundary: just cut the link. */
 		n = m->m_next;
 		m->m_next = NULL;
 		return (n);
 	} else {
 		n = m_get(wait, m->m_type);
 		if (n == NULL)
 			return (NULL);
 		M_ALIGN(n, remain);
 	}
 extpacket:
 	/* Move (or, for external storage, share) the tail bytes of m into n. */
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data + len;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
 	}
 	n->m_len = remain;
 	m->m_len = len;
 	n->m_next = m->m_next;
 	m->m_next = NULL;
 	return (n);
 }
 /*
  * Routine to copy from device local memory into mbufs.
  * Note that `off' argument is offset into first mbuf of target chain from
  * which to begin copying the data to.
  *
  * buf/totlen describe the source bytes, ifp is recorded as the receive
  * interface in the packet header, and copy, when non-NULL, is used in
  * place of bcopy() to move the data.  Returns the new chain, or NULL if
  * off is out of range or an mbuf cannot be allocated.
  */
 struct mbuf *
 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
     void (*copy)(char *from, caddr_t to, u_int len))
 {
 	struct mbuf *m;
 	struct mbuf *top = NULL, **mp = &top;
 	int len;	/* capacity, then fill length, of the current mbuf */

 	if (off < 0 || off > MHLEN)
 		return (NULL);

 	while (totlen > 0) {
 		if (top == NULL) {	/* First one, must be PKTHDR */
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				len = MCLBYTES;
 			} else {
 				m = m_gethdr(M_NOWAIT, MT_DATA);
 				len = MHLEN;

 				/* Place initial small packet/header at end of mbuf */
 				if (m && totlen + off + max_linkhdr <= MLEN) {
 					m->m_data += max_linkhdr;
 					len -= max_linkhdr;
 				}
 			}
 			/* Nothing has been linked yet, so there is nothing to free. */
 			if (m == NULL)
 				return NULL;
 			m->m_pkthdr.rcvif = ifp;
 			m->m_pkthdr.len = totlen;
 		} else {
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
 				len = MCLBYTES;
 			} else {
 				m = m_get(M_NOWAIT, MT_DATA);
 				len = MLEN;
 			}
 			if (m == NULL) {
 				m_freem(top);
 				return NULL;
 			}
 		}
 		/* The caller's offset applies to the first mbuf only. */
 		if (off) {
 			m->m_data += off;
 			len -= off;
 			off = 0;
 		}
 		m->m_len = len = min(totlen, len);
 		if (copy)
 			copy(buf, mtod(m, caddr_t), (u_int)len);
 		else
 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
 		buf += len;
 		/* Append the filled mbuf to the chain. */
 		*mp = m;
 		mp = &m->m_next;
 		totlen -= len;
 	}
 	return (top);
 }
 
 /*
  * Copy data from a buffer back into the indicated mbuf chain,
  * starting "off" bytes from the beginning, extending the mbuf
  * chain if necessary.
  *
  * Allocation uses M_NOWAIT; if an mbuf cannot be obtained the copy
  * stops early and no error is reported to the caller.
  */
 void
 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
 {
 	int mlen;
 	struct mbuf *m = m0, *n;
 	int totlen = 0;		/* chain length covered so far */

 	if (m0 == NULL)
 		return;
 	/* Walk to the mbuf containing "off", growing the chain if too short. */
 	while (off > (mlen = m->m_len)) {
 		off -= mlen;
 		totlen += mlen;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				goto out;
 			/* Zero the new mbuf so the skipped-over gap reads as zeros. */
 			bzero(mtod(n, caddr_t), MLEN);
 			n->m_len = min(MLEN, len + off);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	while (len > 0) {
 		/* On the last mbuf, extend m_len into unused trailing space. */
 		if (m->m_next == NULL && (len > m->m_len - off)) {
 			m->m_len += min(len - (m->m_len - off),
 			    M_TRAILINGSPACE(m));
 		}
 		mlen = min (m->m_len - off, len);
 		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
 		cp += mlen;
 		len -= mlen;
 		mlen += off;
 		off = 0;
 		totlen += mlen;
 		if (len == 0)
 			break;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				break;
 			n->m_len = min(MLEN, len);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	/* The header length is only ever raised, never lowered. */
 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
 		m->m_pkthdr.len = totlen;
 }
 
 /*
  * Add "len" bytes from "cp" to the end of the chain m0.  Free space at
  * the tail of the last mbuf is used first; plain mbufs are then
  * allocated (M_NOWAIT) for whatever is left.  The packet header length,
  * if present, is advanced by the number of bytes actually appended.
  *
  * Returns 1 when all bytes were appended, 0 when allocation failed
  * part way through.
  */
 int
 m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *tail, *fresh;
 	int left, room;

 	/* Find the last mbuf of the chain. */
 	tail = m0;
 	while (tail->m_next != NULL)
 		tail = tail->m_next;

 	left = len;
 	room = M_TRAILINGSPACE(tail);
 	if (room > 0) {
 		/*
 		 * Copy into available space.
 		 */
 		if (left < room)
 			room = left;
 		bcopy(cp, mtod(tail, caddr_t) + tail->m_len, room);
 		tail->m_len += room;
 		cp += room;
 		left -= room;
 	}

 	for (; left > 0; tail = fresh) {
 		/*
 		 * Allocate a new mbuf; could check space
 		 * and allocate a cluster instead.
 		 */
 		fresh = m_get(M_NOWAIT, tail->m_type);
 		if (fresh == NULL)
 			break;
 		fresh->m_len = min(MLEN, left);
 		bcopy(cp, mtod(fresh, caddr_t), fresh->m_len);
 		cp += fresh->m_len;
 		left -= fresh->m_len;
 		tail->m_next = fresh;
 	}

 	if (m0->m_flags & M_PKTHDR)
 		m0->m_pkthdr.len += len - left;
 	return (left == 0);
 }
 
 /*
  * Apply function f to the data in an mbuf chain starting "off" bytes from
  * the beginning, continuing for "len" bytes.
  *
  * f is called once per contiguous region as f(arg, data, count).  A
  * non-zero return from f stops the walk and is handed back to the
  * caller; 0 is returned once the whole range has been visited.
  */
 int
 m_apply(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 	u_int count;
 	int rval;

 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
 	/* Skip mbufs that lie entirely before the starting offset. */
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	while (len > 0) {
 		/* Running out of chain here means len was too large. */
 		KASSERT(m != NULL, ("m_apply, length > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
 		if (rval)
 			return (rval);
 		len -= count;
 		/* Only the first region starts at a non-zero offset. */
 		off = 0;
 		m = m->m_next;
 	}
 	return (0);
 }
 
 /*
  * Translate a byte position within an mbuf chain into an mbuf pointer
  * plus an offset inside that mbuf (stored through "off").  A position
  * exactly one past the last byte yields the last mbuf with an offset
  * equal to its length.  Returns NULL for a negative position or one
  * that lies further beyond the end of the chain.
  */
 struct mbuf *
 m_getptr(struct mbuf *m, int loc, int *off)
 {

 	if (loc < 0)
 		return (NULL);

 	for (;; m = m->m_next) {
 		/* Normal end of search. */
 		if (loc < m->m_len) {
 			*off = loc;
 			return (m);
 		}
 		loc -= m->m_len;
 		if (m->m_next == NULL)
 			break;
 	}

 	/* Ran off the chain: only the end-of-data position is valid. */
 	if (loc != 0)
 		return (NULL);
 	/* Point at the end of valid data. */
 	*off = m->m_len;
 	return (m);
 }
 
 /*
  * Debugging aid: print one line per mbuf of the chain m with its
  * address, length, next pointer and decoded flags, followed by up to
  * maxlen bytes of its data (maxlen == -1 means no limit).  When the
  * first mbuf has a packet header, printing stops once m_pkthdr.len
  * bytes have been accounted for, and a shortfall is reported.
  */
 void
 m_print(const struct mbuf *m, int maxlen)
 {
 	int len;	/* header bytes not yet seen, or -1 without a header */
 	int pdata;	/* number of data bytes to dump for this mbuf */
 	const struct mbuf *m2;

 	if (m == NULL) {
 		printf("mbuf: %p\n", m);
 		return;
 	}

 	if (m->m_flags & M_PKTHDR)
 		len = m->m_pkthdr.len;
 	else
 		len = -1;
 	m2 = m;
 	while (m2 != NULL && (len == -1 || len)) {
 		pdata = m2->m_len;
 		if (maxlen != -1 && pdata > maxlen)
 			pdata = maxlen;
 		/* The trailing %s ends the line here when no data follows. */
 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
 		if (pdata)
 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
 		if (len != -1)
 			len -= m2->m_len;
 		m2 = m2->m_next;
 	}
 	/* The chain ended before the header's length was reached. */
 	if (len > 0)
 		printf("%d bytes unaccounted for.\n", len);
 	return;
 }
 
 u_int
 m_fixhdr(struct mbuf *m0)
 {
 	u_int len;
 
 	len = m_length(m0, NULL);
 	m0->m_pkthdr.len = len;
 	return (len);
 }
 
 /*
  * Return the sum of m_len over every mbuf in the chain m0.  When
  * "last" is not NULL it receives a pointer to the final mbuf of the
  * chain (NULL for an empty chain).
  */
 u_int
 m_length(struct mbuf *m0, struct mbuf **last)
 {
 	struct mbuf *tail = m0;
 	u_int total = 0;

 	if (tail != NULL) {
 		total += tail->m_len;
 		while (tail->m_next != NULL) {
 			tail = tail->m_next;
 			total += tail->m_len;
 		}
 	}
 	if (last != NULL)
 		*last = tail;
 	return (total);
 }
 
 /*
  * Defragment a mbuf chain, returning the shortest possible
  * chain of mbufs and clusters.  If allocation fails and
  * this cannot be completed, NULL will be returned, but
  * the passed in chain will be unchanged.  Upon success,
  * the original chain will be freed, and the new chain
  * will be returned.
  *
  * If a non-packet header is passed in, the original
  * mbuf (chain?) will be returned unharmed.
  */
 struct mbuf *
 m_defrag(struct mbuf *m0, int how)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0, length;	/* bytes copied so far / size of this piece */

 	MBUF_CHECKSLEEP(how);
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);

 	m_fixhdr(m0); /* Needed sanity check */

 #ifdef MBUF_STRESS_TEST
 	/* Optionally inject a failure for roughly one call in 256. */
 	if (m_defragrandomfailures) {
 		int temp = arc4random() & 0xff;
 		if (temp == 0xba)
 			goto nospace;
 	}
 #endif

 	/* The head of the new chain carries the packet header. */
 	if (m0->m_pkthdr.len > MHLEN)
 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m_final = m_gethdr(how, MT_DATA);

 	if (m_final == NULL)
 		goto nospace;

 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;

 	m_new = m_final;

 	/* Copy the packet in pieces of at most MCLBYTES each. */
 	while (progress < m0->m_pkthdr.len) {
 		length = m0->m_pkthdr.len - progress;
 		if (length > MCLBYTES)
 			length = MCLBYTES;

 		/* The first pass reuses m_final; later passes allocate. */
 		if (m_new == NULL) {
 			if (length > MLEN)
 				m_new = m_getcl(how, MT_DATA, 0);
 			else
 				m_new = m_get(how, MT_DATA);
 			if (m_new == NULL)
 				goto nospace;
 		}

 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
 		progress += length;
 		m_new->m_len = length;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		m_new = NULL;
 	}
 #ifdef MBUF_STRESS_TEST
 	if (m0->m_next == NULL)
 		m_defraguseless++;
 #endif
 	m_freem(m0);
 	m0 = m_final;
 #ifdef MBUF_STRESS_TEST
 	m_defragpackets++;
 	m_defragbytes += m0->m_pkthdr.len;
 #endif
 	return (m0);
 nospace:
 #ifdef MBUF_STRESS_TEST
 	m_defragfailure++;
 #endif
 	/* Only the partially built copy is released; m0 stays intact. */
 	if (m_final)
 		m_freem(m_final);
 	return (NULL);
 }
 
 /*
  * Defragment an mbuf chain, returning at most maxfrags separate
  * mbufs+clusters.  If this is not possible NULL is returned and
  * the original mbuf chain is left in its present (potentially
  * modified) state.  We use two techniques: collapsing consecutive
  * mbufs and replacing consecutive mbufs by a cluster.
  *
  * NB: this should really be named m_defrag but that name is taken
  */
 struct mbuf *
 m_collapse(struct mbuf *m0, int how, int maxfrags)
 {
 	struct mbuf *m, *n, *n2, **prev;
 	u_int curfrags;

 	/*
 	 * Calculate the current number of frags.
 	 */
 	curfrags = 0;
 	for (m = m0; m != NULL; m = m->m_next)
 		curfrags++;
 	/*
 	 * First, try to collapse mbufs.  Note that we always collapse
 	 * towards the front so we don't need to deal with moving the
 	 * pkthdr.  This may be suboptimal if the first mbuf has much
 	 * less data than the following.
 	 */
 	m = m0;
 again:
 	for (;;) {
 		n = m->m_next;
 		if (n == NULL)
 			break;
 		/* Fold n into m when m is writable and n's data fits. */
 		if (M_WRITABLE(m) &&
 		    n->m_len < M_TRAILINGSPACE(m)) {
 			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
 				n->m_len);
 			m->m_len += n->m_len;
 			m->m_next = n->m_next;
 			m_free(n);
 			if (--curfrags <= maxfrags)
 				return m0;
 		} else
 			m = n;
 	}
 	KASSERT(maxfrags > 1,
 		("maxfrags %u, but normal collapse failed", maxfrags));
 	/*
 	 * Collapse consecutive mbufs to a cluster.
 	 */
 	prev = &m0->m_next;		/* NB: not the first mbuf */
 	while ((n = *prev) != NULL) {
 		/* Look for two neighbours that together fit in one cluster. */
 		if ((n2 = n->m_next) != NULL &&
 		    n->m_len + n2->m_len < MCLBYTES) {
 			m = m_getcl(how, MT_DATA, 0);
 			if (m == NULL)
 				goto bad;
 			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
 			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
 				n2->m_len);
 			m->m_len = n->m_len + n2->m_len;
 			m->m_next = n2->m_next;
 			*prev = m;
 			m_free(n);
 			m_free(n2);
 			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
 				return m0;
 			/*
 			 * Still not there, try the normal collapse
 			 * again before we allocate another cluster.
 			 */
 			goto again;
 		}
 		prev = &n->m_next;
 	}
 	/*
 	 * No place where we can collapse to a cluster; punt.
 	 * This can occur if, for example, you request 2 frags
 	 * but the packet requires that both be clusters (we
 	 * never reallocate the first mbuf to avoid moving the
 	 * packet header).
 	 */
 bad:
 	return NULL;
 }
 
 #ifdef MBUF_STRESS_TEST
 
 /*
  * Fragment an mbuf chain.  There's no reason you'd ever want to do
  * this in normal usage, but it's great for stress testing various
  * mbuf consumers.
  *
  * If fragmentation is not possible, the original chain will be
  * returned.
  *
  * Possible length values:
  * 0	 no fragmentation will occur
  * > 0	each fragment will be of the specified length
  * -1	each fragment will be the same random value in length
  * -2	each fragment's length will be entirely random
  * (Random values range from 1 to 256)
  */
 struct mbuf *
 m_fragment(struct mbuf *m0, int how, int length)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0;	/* bytes of m0 copied so far */

 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);

 	if ((length == 0) || (length < -2))
 		return (m0);

 	m_fixhdr(m0); /* Needed sanity check */

 	m_final = m_getcl(how, MT_DATA, M_PKTHDR);

 	if (m_final == NULL)
 		goto nospace;

 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;

 	m_new = m_final;

 	/* For -1, pick one random size and reuse it for every fragment. */
 	if (length == -1)
 		length = 1 + (arc4random() & 255);

 	while (progress < m0->m_pkthdr.len) {
 		int fraglen;

 		/* length is still -2 here when every fragment is random. */
 		if (length > 0)
 			fraglen = length;
 		else
 			fraglen = 1 + (arc4random() & 255);
 		if (fraglen > m0->m_pkthdr.len - progress)
 			fraglen = m0->m_pkthdr.len - progress;

 		if (fraglen > MCLBYTES)
 			fraglen = MCLBYTES;

 		/* The first pass reuses m_final; later passes allocate. */
 		if (m_new == NULL) {
 			m_new = m_getcl(how, MT_DATA, 0);
 			if (m_new == NULL)
 				goto nospace;
 		}

 		m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
 		progress += fraglen;
 		m_new->m_len = fraglen;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		m_new = NULL;
 	}
 	m_freem(m0);
 	m0 = m_final;
 	return (m0);
 nospace:
 	if (m_final)
 		m_freem(m_final);
 	/* Return the original chain on failure */
 	return (m0);
 }
 
 #endif
 
 /*
  * Copy the contents of uio into a properly sized mbuf chain.
  *
  * len limits the amount taken from the uio when positive; otherwise
  * everything in the uio is used.  align bytes are left unused at the
  * front of the first mbuf.  flags is passed to m_getm2(), and when it
  * includes M_PKTHDR the packet header length is accumulated as data is
  * copied.  Returns NULL if align is too large, allocation fails or
  * uiomove() reports an error.
  */
 struct mbuf *
 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
 	struct mbuf *m, *mb;
 	int error, length;
 	ssize_t total;		/* number of bytes to transfer */
 	int progress = 0;	/* bytes transferred so far */

 	/*
 	 * len can be zero or an arbitrary large value bound by
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;

 	/*
 	 * The smallest unit returned by m_getm2() is a single mbuf
 	 * with pkthdr.  We can't align past it.
 	 */
 	if (align >= MHLEN)
 		return (NULL);

 	/*
 	 * Give us the full allocation or nothing.
 	 * If len is zero return the smallest empty mbuf.
 	 */
 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
 	if (m == NULL)
 		return (NULL);
 	m->m_data += align;

 	/* Fill all mbufs with uio data and update header information. */
 	for (mb = m; mb != NULL; mb = mb->m_next) {
 		length = min(M_TRAILINGSPACE(mb), total - progress);

 		error = uiomove(mtod(mb, void *), length, uio);
 		if (error) {
 			m_freem(m);
 			return (NULL);
 		}

 		mb->m_len = length;
 		progress += length;
 		if (flags & M_PKTHDR)
 			m->m_pkthdr.len += length;
 	}
 	KASSERT(progress == total, ("%s: progress != total", __func__));

 	return (m);
 }
 
 /*
  * Copy an mbuf chain into a uio limited by len if set.
  *
  * At most min(uio->uio_resid, len) bytes are moved when len is
  * positive, otherwise at most uio->uio_resid bytes.  The chain itself
  * is not modified by this function.  Returns 0 on success or the error
  * reported by uiomove().
  */
 int
 m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
 {
 	int error, length, total;
 	int progress = 0;	/* bytes moved so far */

 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;

 	/* Fill the uio with data from the mbufs. */
 	for (; m != NULL; m = m->m_next) {
 		/* Becomes 0 once the limit is reached; the walk still finishes. */
 		length = min(m->m_len, total - progress);

 		error = uiomove(mtod(m, void *), length, uio);
 		if (error)
 			return (error);

 		progress += length;
 	}

 	return (0);
-}
-
-/*
- * Set the m_data pointer of a newly-allocated mbuf
- * to place an object of the specified size at the
- * end of the mbuf, longword aligned.
- */
-void
-m_align(struct mbuf *m, int len)
-{
-#ifdef INVARIANTS
-	const char *msg = "%s: not a virgin mbuf";
-#endif
-	int adjust;
-
-	if (m->m_flags & M_EXT) {
-		KASSERT(m->m_data == m->m_ext.ext_buf, (msg, __func__));
-		adjust = m->m_ext.ext_size - len;
-	} else if (m->m_flags & M_PKTHDR) {
-		KASSERT(m->m_data == m->m_pktdat, (msg, __func__));
-		adjust = MHLEN - len;
-	} else {
-		KASSERT(m->m_data == m->m_dat, (msg, __func__));
-		adjust = MLEN - len;
-	}
-
-	m->m_data += adjust &~ (sizeof(long)-1);
 }
 
 /*
  * Create a writable copy of the mbuf chain.  While doing this
  * we compact the chain with a goal of producing a chain with
  * at most two mbufs.  The second mbuf in this chain is likely
  * to be a cluster.  The primary purpose of this work is to create
  * a writable packet for encryption, compression, etc.  The
  * secondary goal is to linearize the data so the data can be
  * passed to crypto hardware in the most efficient manner possible.
  *
  * Returns the (possibly new) head of the chain.  If a cluster cannot
  * be allocated the whole chain is freed and NULL is returned.
  */
 struct mbuf *
 m_unshare(struct mbuf *m0, int how)
 {
 	struct mbuf *m, *mprev;
 	struct mbuf *n, *mfirst, *mlast;
 	int len, off;

 	/*
 	 * mprev is the last mbuf kept in the result; each pass looks at
 	 * the mbuf that follows it.
 	 */
 	mprev = NULL;
 	for (m = m0; m != NULL; m = mprev->m_next) {
 		/*
 		 * Regular mbufs are ignored unless there's a cluster
 		 * in front of it that we can use to coalesce.  We do
 		 * the latter mainly so later clusters can be coalesced
 		 * also w/o having to handle them specially (i.e. convert
 		 * mbuf+cluster -> cluster).  This optimization is heavily
 		 * influenced by the assumption that we're running over
 		 * Ethernet where MCLBYTES is large enough that the max
 		 * packet size will permit lots of coalescing into a
 		 * single cluster.  This in turn permits efficient
 		 * crypto operations, especially when using hardware.
 		 */
 		if ((m->m_flags & M_EXT) == 0) {
 			if (mprev && (mprev->m_flags & M_EXT) &&
 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
 				/* XXX: this ignores mbuf types */
 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 				    mtod(m, caddr_t), m->m_len);
 				mprev->m_len += m->m_len;
 				mprev->m_next = m->m_next;	/* unlink from chain */
 				m_free(m);			/* reclaim mbuf */
 #if 0
 				newipsecstat.ips_mbcoalesced++;
 #endif
 			} else {
 				mprev = m;
 			}
 			continue;
 		}
 		/*
 		 * Writable mbufs are left alone (for now).
 		 */
 		if (M_WRITABLE(m)) {
 			mprev = m;
 			continue;
 		}

 		/*
 		 * Not writable, replace with a copy or coalesce with
 		 * the previous mbuf if possible (since we have to copy
 		 * it anyway, we try to reduce the number of mbufs and
 		 * clusters so that future work is easier).
 		 */
 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
 		/* NB: we only coalesce into a cluster or larger */
 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
 			/* XXX: this ignores mbuf types */
 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 			    mtod(m, caddr_t), m->m_len);
 			mprev->m_len += m->m_len;
 			mprev->m_next = m->m_next;	/* unlink from chain */
 			m_free(m);			/* reclaim mbuf */
 #if 0
 			newipsecstat.ips_clcoalesced++;
 #endif
 			continue;
 		}

 		/*
 		 * Allocate new space to hold the copy and copy the data.
 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
 		 * splitting them into clusters.  We could just malloc a
 		 * buffer and make it external but too many device drivers
 		 * don't know how to break up the non-contiguous memory when
 		 * doing DMA.
 		 */
 		n = m_getcl(how, m->m_type, m->m_flags);
 		if (n == NULL) {
 			m_freem(m0);
 			return (NULL);
 		}
 		len = m->m_len;		/* bytes of m still to copy */
 		off = 0;		/* read offset into m's data */
 		mfirst = n;		/* first cluster of the replacement run */
 		mlast = NULL;		/* last cluster filled so far */
 		for (;;) {
 			int cc = min(len, MCLBYTES);
 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
 			n->m_len = cc;
 			if (mlast != NULL)
 				mlast->m_next = n;
 			mlast = n;
 #if 0
 			newipsecstat.ips_clcopied++;
 #endif

 			len -= cc;
 			if (len <= 0)
 				break;
 			off += cc;

 			n = m_getcl(how, m->m_type, m->m_flags);
 			if (n == NULL) {
 				/* The new run is not linked into m0 yet; free both. */
 				m_freem(mfirst);
 				m_freem(m0);
 				return (NULL);
 			}
 		}
 		/* Splice the replacement run in where m used to be. */
 		n->m_next = m->m_next;
 		if (mprev == NULL)
 			m0 = mfirst;		/* new head of chain */
 		else
 			mprev->m_next = mfirst;	/* replace old mbuf */
 		m_free(m);			/* release old mbuf */
 		mprev = mfirst;
 	}
 	return (m0);
 }
 
 #ifdef MBUF_PROFILING
 
 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
 /*
  * Histogram counters filled in by m_profile(): one array per metric,
  * each with MP_BUCKETS slots.
  */
 struct mbufprofile {
 	uintmax_t wasted[MP_BUCKETS];
 	uintmax_t used[MP_BUCKETS];
 	uintmax_t segments[MP_BUCKETS];
 } mbprof;
 
 /*
  * Width budget for one printed counter.  A 64-bit value needs at most
  * 20 decimal digits (18446744073709551615); 21 leaves one to spare.
  * NOTE(review): assumes uintmax_t is 64 bits wide -- confirm.
  */
 #define MP_MAXDIGITS 21
 #define MP_NUMLINES 6
 #define MP_NUMSPERLINE 16
 #define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
 /* work out max space needed and add a bit of spare space too */
 #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
 #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)
 
 /* text rendering of mbprof, rebuilt by mbprof_textify() */
 char mbprofbuf[MP_BUFSIZE];
 
 void
 m_profile(struct mbuf *m)
 {
 	int segments = 0;
 	int used = 0;
 	int wasted = 0;
 
 	while (m) {
 		segments++;
 		used += m->m_len;
 		if (m->m_flags & M_EXT) {
 			wasted += MHLEN - sizeof(m->m_ext) +
 			    m->m_ext.ext_size - m->m_len;
 		} else {
 			if (m->m_flags & M_PKTHDR)
 				wasted += MHLEN - m->m_len;
 			else
 				wasted += MLEN - m->m_len;
 		}
 		m = m->m_next;
 	}
 	/* be paranoid.. it helps */
 	if (segments > MP_BUCKETS - 1)
 		segments = MP_BUCKETS - 1;
 	if (used > 100000)
 		used = 100000;
 	if (wasted > 100000)
 		wasted = 100000;
 	/* store in the appropriate bucket */
 	/* don't bother locking. if it's slightly off, so what? */
 	mbprof.segments[segments]++;
 	mbprof.used[fls(used)]++;
 	mbprof.wasted[fls(wasted)]++;
 }
 
 static void
 mbprof_textify(void)
 {
 	int offset;
 	char *c;
 	uint64_t *p;
 
 	p = &mbprof.wasted[0];
 	c = mbprofbuf;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "wasted:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.wasted[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.used[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "used:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.used[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.segments[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "segments:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.segments[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 }
 
 /*
  * Sysctl handler: regenerate the text in mbprofbuf and copy it out,
  * terminating NUL included.  Returns whatever SYSCTL_OUT() returns.
  */
 static int
 mbprof_handler(SYSCTL_HANDLER_ARGS)
 {
 
 	mbprof_textify();
 	return (SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1));
 }
 
 /*
  * Sysctl handler: the value reads back as 0; writing a non-zero
  * integer zeroes every counter in mbprof.
  */
 static int
 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int clear = 0;
 
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error == 0 && req->newptr && clear)
 		bzero(&mbprof, sizeof(mbprof));
 
 	return (error);
 }
 
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
 	    NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
 	    NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
 #endif
 
Index: head/sys/net80211/ieee80211_freebsd.c
===================================================================
--- head/sys/net80211/ieee80211_freebsd.c	(revision 276691)
+++ head/sys/net80211/ieee80211_freebsd.c	(revision 276692)
@@ -1,921 +1,921 @@
 /*-
  * Copyright (c) 2003-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 support (FreeBSD-specific code)
  */
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h> 
 #include <sys/linker.h>
 #include <sys/mbuf.h>   
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <sys/socket.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_clone.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_input.h>
 
 SYSCTL_NODE(_net, OID_AUTO, wlan, CTLFLAG_RD, 0, "IEEE 80211 parameters");
 
 #ifdef IEEE80211_DEBUG
 int	ieee80211_debug = 0;
 SYSCTL_INT(_net_wlan, OID_AUTO, debug, CTLFLAG_RW, &ieee80211_debug,
 	    0, "debugging printfs");
 #endif
 
 static MALLOC_DEFINE(M_80211_COM, "80211com", "802.11 com state");
 
 #if __FreeBSD_version >= 1000020
 static const char wlanname[] = "wlan";
 static struct if_clone *wlan_cloner;
 #endif
 
 /*
  * com structure allocation and release hooks.  Both routines are
  * handed to if_register_com_alloc() (see wlan_modevent) so that the
  * ifnet code invokes them automatically when the ifnet of the parent
  * device is created.
  *
  * wlan_alloc returns a zero-filled ieee80211com whose ic_ifp refers
  * to the supplied ifnet; the type argument is not used.
  */
 static void *
 wlan_alloc(u_char type, struct ifnet *ifp)
 {
 	struct ieee80211com *com;
 
 	com = malloc(sizeof(*com), M_80211_COM, M_WAITOK | M_ZERO);
 	com->ic_ifp = ifp;
 	return (com);
 }
 
 /*
  * Release a com structure obtained from wlan_alloc (same malloc
  * type, M_80211_COM).  The type argument is not used.
  */
 static void
 wlan_free(void *ic, u_char type)
 {
 	free(ic, M_80211_COM);
 }
 
 /*
  * Clone-create handler: build a vap on top of the parent device named
  * in the user-supplied ieee80211_clone_params.
  *
  * Returns the copyin() error, ENXIO when the parent does not exist or
  * is not an IFT_IEEE80211 interface, EINVAL for an out-of-range
  * opmode, EOPNOTSUPP when the parent lacks the requested capability,
  * EIO when the driver's ic_vap_create method returns NULL, and 0 on
  * success.
  */
 static int
 wlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct ieee80211_clone_params cp;
 	struct ieee80211vap *vap;
 	struct ieee80211com *ic;
 	struct ifnet *ifp;
 	int error;
 
 	error = copyin(params, &cp, sizeof(cp));
 	if (error)
 		return error;
 	ifp = ifunit(cp.icp_parent);
 	if (ifp == NULL)
 		return ENXIO;
 	/* XXX move printfs to DIAGNOSTIC before release */
 	if (ifp->if_type != IFT_IEEE80211) {
 		if_printf(ifp, "%s: reject, not an 802.11 device\n", __func__);
 		return ENXIO;
 	}
 	/* validate before icp_opmode is used as an array index below */
 	if (cp.icp_opmode >= IEEE80211_OPMODE_MAX) {
 		if_printf(ifp, "%s: invalid opmode %d\n",
 		    __func__, cp.icp_opmode);
 		return EINVAL;
 	}
 	ic = ifp->if_l2com;
 	if ((ic->ic_caps & ieee80211_opcap[cp.icp_opmode]) == 0) {
 		if_printf(ifp, "%s mode not supported\n",
 		    ieee80211_opmode_name[cp.icp_opmode]);
 		return EOPNOTSUPP;
 	}
 	/* without IEEE80211_SUPPORT_TDMA any TDMA request is refused */
 	if ((cp.icp_flags & IEEE80211_CLONE_TDMA) &&
 #ifdef IEEE80211_SUPPORT_TDMA
 	    (ic->ic_caps & IEEE80211_C_TDMA) == 0
 #else
 	    (1)
 #endif
 	) {
 		if_printf(ifp, "TDMA not supported\n");
 		return EOPNOTSUPP;
 	}
 	/*
 	 * The MAC address is the one supplied in the request when
 	 * IEEE80211_CLONE_MACADDR is set, else the parent's lladdr.
 	 */
 #if __FreeBSD_version >= 1000020
 	vap = ic->ic_vap_create(ic, wlanname, unit,
 			cp.icp_opmode, cp.icp_flags, cp.icp_bssid,
 			cp.icp_flags & IEEE80211_CLONE_MACADDR ?
 			    cp.icp_macaddr : (const uint8_t *)IF_LLADDR(ifp));
 #else
 	vap = ic->ic_vap_create(ic, ifc->ifc_name, unit,
 			cp.icp_opmode, cp.icp_flags, cp.icp_bssid,
 			cp.icp_flags & IEEE80211_CLONE_MACADDR ?
 			    cp.icp_macaddr : (const uint8_t *)IF_LLADDR(ifp));
 
 #endif
 
 	return (vap == NULL ? EIO : 0);
 }
 
 /*
  * Clone-destroy handler: the vap is taken from ifp->if_softc and
  * handed to its com's ic_vap_delete method.
  */
 static void
 wlan_clone_destroy(struct ifnet *ifp)
 {
 	struct ieee80211vap *vap = ifp->if_softc;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	ic->ic_vap_delete(vap);
 }
 
 #if __FreeBSD_version < 1000020
 IFC_SIMPLE_DECLARE(wlan, 0);
 #endif
 
 /*
  * Destroy a vap by asking the wlan cloner to destroy its ifnet, with
  * the current vnet set from the ifnet for the duration of the call.
  */
 void
 ieee80211_vap_destroy(struct ieee80211vap *vap)
 {
 	CURVNET_SET(vap->iv_ifp->if_vnet);
 #if __FreeBSD_version >= 1000020
 	if_clone_destroyif(wlan_cloner, vap->iv_ifp);
 #else
 	if_clone_destroyif(&wlan_cloner, vap->iv_ifp);
 #endif
 	CURVNET_RESTORE();
 }
 
 /*
  * Sysctl handler for an int stored in ticks (arg1) but presented in
  * milliseconds.  A written value is converted back to ticks and
  * stored, never as less than one tick.
  */
 int
 ieee80211_sysctl_msecs_ticks(SYSCTL_HANDLER_ARGS)
 {
 	int *tickp = (int *)arg1;
 	int msecs = ticks_to_msecs(*tickp);
 	int error, t;
 
 	error = sysctl_handle_int(oidp, &msecs, 0, req);
 	if (error == 0 && req->newptr) {
 		t = msecs_to_ticks(msecs);
 		if (t < 1)
 			t = 1;
 		*tickp = t;
 	}
 	return error;
 }
 
 /*
  * Sysctl handler for an int (arg1) stored in units of
  * IEEE80211_INACT_WAIT: the value is multiplied by that constant on
  * the way out and divided by it when a new value is written.
  */
 static int
 ieee80211_sysctl_inact(SYSCTL_HANDLER_ARGS)
 {
 	int *valp = (int *)arg1;
 	int inact = *valp * IEEE80211_INACT_WAIT;
 	int error;
 
 	error = sysctl_handle_int(oidp, &inact, 0, req);
 	if (error == 0 && req->newptr)
 		*valp = inact / IEEE80211_INACT_WAIT;
 	return error;
 }
 
 /*
  * Read-only sysctl handler: copy out the if_xname of the ifnet that
  * belongs to the com passed as arg1.
  * NB: exactly strlen(name) bytes are copied, i.e. without the
  * terminating NUL.
  */
 static int
 ieee80211_sysctl_parent(SYSCTL_HANDLER_ARGS)
 {
 	struct ieee80211com *ic = arg1;
 	const char *name = ic->ic_ifp->if_xname;
 
 	return SYSCTL_OUT(req, name, strlen(name));
 }
 
 static int
 ieee80211_sysctl_radar(SYSCTL_HANDLER_ARGS)
 {
 	struct ieee80211com *ic = arg1;
 	int t = 0, error;
 
 	error = sysctl_handle_int(oidp, &t, 0, req);
 	if (error || !req->newptr)
 		return error;
 	IEEE80211_LOCK(ic);
 	ieee80211_dfs_notify_radar(ic, ic->ic_curchan);
 	IEEE80211_UNLOCK(ic);
 	return 0;
 }
 
 /*
  * Per-com sysctl attach hook; intentionally empty here.
  */
 void
 ieee80211_sysctl_attach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Per-com sysctl detach hook; intentionally empty here.
  */
 void
 ieee80211_sysctl_detach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Create the per-vap sysctl subtree: a node under net.wlan named after
  * the ifnet's if_dunit, populated with the vap's tunables.  The
  * context and node are recorded in vap->iv_sysctl / vap->iv_oid.
  * If the context cannot be allocated a message is printed and the
  * vap is left without a sysctl tree.
  */
 void
 ieee80211_sysctl_vattach(struct ieee80211vap *vap)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	char num[14];			/* sufficient for 32 bits */
 
 	ctx = (struct sysctl_ctx_list *) malloc(sizeof(struct sysctl_ctx_list),
 		M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (ctx == NULL) {
 		if_printf(ifp, "%s: cannot allocate sysctl context!\n",
 			__func__);
 		return;
 	}
 	sysctl_ctx_init(ctx);
 	snprintf(num, sizeof(num), "%u", ifp->if_dunit);
 	/* NOTE(review): oid is used below without a NULL check -- confirm */
 	oid = SYSCTL_ADD_NODE(ctx, &SYSCTL_NODE_CHILDREN(_net, wlan),
 		OID_AUTO, num, CTLFLAG_RD, NULL, "");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"%parent", CTLTYPE_STRING | CTLFLAG_RD, vap->iv_ic, 0,
 		ieee80211_sysctl_parent, "A", "parent device");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"driver_caps", CTLFLAG_RW, &vap->iv_caps, 0,
 		"driver capabilities");
 #ifdef IEEE80211_DEBUG
 	/* each vap starts from the global debug setting */
 	vap->iv_debug = ieee80211_debug;
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"debug", CTLFLAG_RW, &vap->iv_debug, 0,
 		"control debugging printfs");
 #endif
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"bmiss_max", CTLFLAG_RW, &vap->iv_bmiss_max, 0,
 		"consecutive beacon misses before scanning");
 	/* XXX inherit from tunables */
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"inact_run", CTLTYPE_INT | CTLFLAG_RW, &vap->iv_inact_run, 0,
 		ieee80211_sysctl_inact, "I",
 		"station inactivity timeout (sec)");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"inact_probe", CTLTYPE_INT | CTLFLAG_RW, &vap->iv_inact_probe, 0,
 		ieee80211_sysctl_inact, "I",
 		"station inactivity probe timeout (sec)");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"inact_auth", CTLTYPE_INT | CTLFLAG_RW, &vap->iv_inact_auth, 0,
 		ieee80211_sysctl_inact, "I",
 		"station authentication timeout (sec)");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"inact_init", CTLTYPE_INT | CTLFLAG_RW, &vap->iv_inact_init, 0,
 		ieee80211_sysctl_inact, "I",
 		"station initial state timeout (sec)");
 	/* A-MPDU thresholds are only exported when the HT cap is set */
 	if (vap->iv_htcaps & IEEE80211_HTC_HT) {
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_bk", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_BK], 0,
 			"BK traffic tx aggr threshold (pps)");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_be", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_BE], 0,
 			"BE traffic tx aggr threshold (pps)");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_vo", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_VO], 0,
 			"VO traffic tx aggr threshold (pps)");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_vi", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_VI], 0,
 			"VI traffic tx aggr threshold (pps)");
 	}
 	/* the radar knob is only exported when the DFS cap is set */
 	if (vap->iv_caps & IEEE80211_C_DFS) {
 		SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"radar", CTLTYPE_INT | CTLFLAG_RW, vap->iv_ic, 0,
 			ieee80211_sysctl_radar, "I", "simulate radar event");
 	}
 	vap->iv_sysctl = ctx;
 	vap->iv_oid = oid;
 }
 
 /*
  * Tear down the per-vap sysctl context, if one was set up: free the
  * context's entries, release the context itself and clear the
  * pointer so a repeated call does nothing.
  */
 void
 ieee80211_sysctl_vdetach(struct ieee80211vap *vap)
 {
 
 	if (vap->iv_sysctl == NULL)
 		return;
 
 	sysctl_ctx_free(vap->iv_sysctl);
 	free(vap->iv_sysctl, M_DEVBUF);
 	vap->iv_sysctl = NULL;
 }
 
 /*
  * Drop one reference on the node and report whether the count
  * reached zero.  Done in two atomic steps: decrement, then
  * compare-and-set 0 -> 1.  The return value is that of
  * atomic_cmpset_int(), so a non-zero result also means the count
  * has been set back to 1.
  */
 int
 ieee80211_node_dectestref(struct ieee80211_node *ni)
 {
 	/* XXX need equivalent of atomic_dec_and_test */
 	atomic_subtract_int(&ni->ni_refcnt, 1);
 	return atomic_cmpset_int(&ni->ni_refcnt, 0, 1);
 }
 
 /*
  * Empty the queue.  Each frame carries a node reference stashed in
  * m_pkthdr.rcvif; that reference is released and the field cleared
  * before the frame itself is freed.
  */
 void
 ieee80211_drain_ifq(struct ifqueue *ifq)
 {
 	struct ieee80211_node *node;
 	struct mbuf *frame;
 
 	IF_DEQUEUE(ifq, frame);
 	while (frame != NULL) {
 		node = (struct ieee80211_node *)frame->m_pkthdr.rcvif;
 		KASSERT(node != NULL, ("frame w/o node"));
 		ieee80211_free_node(node);
 		frame->m_pkthdr.rcvif = NULL;
 
 		m_freem(frame);
 		IF_DEQUEUE(ifq, frame);
 	}
 }
 
 /*
  * Remove from the queue every frame whose node (stashed in
  * m_pkthdr.rcvif) belongs to the given vap.  Removed frames are freed
  * and their node reference released; other frames stay queued in
  * order.  The whole operation runs with the queue lock held.
  */
 void
 ieee80211_flush_ifq(struct ifqueue *ifq, struct ieee80211vap *vap)
 {
 	struct ieee80211_node *ni;
 	struct mbuf *m, **mprev;
 
 	IF_LOCK(ifq);
 	/* mprev always points at the link that refers to m */
 	mprev = &ifq->ifq_head;
 	while ((m = *mprev) != NULL) {
 		ni = (struct ieee80211_node *)m->m_pkthdr.rcvif;
 		if (ni != NULL && ni->ni_vap == vap) {
 			*mprev = m->m_nextpkt;		/* remove from list */
 			ifq->ifq_len--;
 
 			m_freem(m);
 			ieee80211_free_node(ni);	/* reclaim ref */
 		} else
 			mprev = &m->m_nextpkt;
 	}
 	/* recalculate tail ptr */
 	m = ifq->ifq_head;
 	for (; m != NULL && m->m_nextpkt != NULL; m = m->m_nextpkt)
 		;
 	/* m is now the last packet, or NULL if the queue is empty */
 	ifq->ifq_tail = m;
 	IF_UNLOCK(ifq);
 }
 
 /*
  * Alignment helper for mbufs that carry a cluster (see its use after
  * m_getcl() in ieee80211_getmgtframe): advance m_data by
  * MCLBYTES - len, rounded down to a multiple of sizeof(long).
  * NOTE(review): presumably m_data points at the start of the cluster
  * when this is applied, so that len bytes end up at its tail --
  * confirm against callers.
  */
 #define	MC_ALIGN(m, len)						\
 do {									\
 	(m)->m_data += (MCLBYTES - (len)) &~ (sizeof(long) - 1);	\
 } while (/* CONSTCOND */ 0)
 
 /*
  * Allocate and setup a management frame of the specified
  * size.  We return the mbuf and a pointer to the start
  * of the contiguous data area that's been reserved based
  * on the packet length.  The data area is forced to 32-bit
  * alignment and the buffer length to a multiple of 4 bytes.
  * This is done mainly so beacon frames (that require this)
  * can use this interface too.
  *
  * headroom bytes are skipped in front of the data area; *frm is set
  * to the resulting m_data.  Returns NULL, leaving *frm untouched,
  * when no mbuf could be allocated.
  */
 struct mbuf *
 ieee80211_getmgtframe(uint8_t **frm, int headroom, int pktlen)
 {
 	struct mbuf *m;
 	u_int len;
 
 	/*
 	 * NB: we know the mbuf routines will align the data area
 	 *     so we don't need to do anything special.
 	 */
 	len = roundup2(headroom + pktlen, 4);
 	KASSERT(len <= MCLBYTES, ("802.11 mgt frame too large: %u", len));
 	/* small frames use a plain header mbuf, the rest a cluster */
 	if (len < MINCLSIZE) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		/*
 		 * Align the data in case additional headers are added.
 		 * This should only happen when a WEP header is added
 		 * which only happens for shared key authentication mgt
 		 * frames which all fit in MHLEN.
 		 */
 		if (m != NULL)
-			MH_ALIGN(m, len);
+			M_ALIGN(m, len);
 	} else {
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m != NULL)
 			MC_ALIGN(m, len);
 	}
 	if (m != NULL) {
 		m->m_data += headroom;
 		*frm = m->m_data;
 	}
 	return m;
 }
 
 #ifndef __NO_STRICT_ALIGNMENT
 /*
  * Re-align the payload in the mbuf.  This is mainly used (right now)
  * to handle IP header alignment requirements on certain architectures.
  *
  * The packet is copied into a freshly allocated mbuf (plain header
  * mbuf or the smallest cluster size that holds pktlen + align bytes)
  * and the original chain m is always freed.  Returns the new mbuf,
  * or NULL when allocation failed; in that case the discard is logged
  * and is_rx_badalign is incremented.
  */
 struct mbuf *
 ieee80211_realign(struct ieee80211vap *vap, struct mbuf *m, size_t align)
 {
 	int pktlen, space;
 	struct mbuf *n;
 
 	pktlen = m->m_pkthdr.len;
 	space = pktlen + align;
 	if (space < MINCLSIZE)
 		n = m_gethdr(M_NOWAIT, MT_DATA);
 	else {
 		n = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
 		    space <= MCLBYTES ?     MCLBYTES :
 #if MJUMPAGESIZE != MCLBYTES
 		    space <= MJUMPAGESIZE ? MJUMPAGESIZE :
 #endif
 		    space <= MJUM9BYTES ?   MJUM9BYTES : MJUM16BYTES);
 	}
 	if (__predict_true(n != NULL)) {
 		m_move_pkthdr(n, m);
 		/* shift m_data so that (m_data + align) is ALIGN()ed */
 		n->m_data = (caddr_t)(ALIGN(n->m_data + align) - align);
 		m_copydata(m, 0, pktlen, mtod(n, caddr_t));
 		n->m_len = pktlen;
 	} else {
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    mtod(m, const struct ieee80211_frame *), NULL,
 		    "%s", "no mbuf to realign");
 		vap->iv_stats.is_rx_badalign++;
 	}
 	/* the original is released on both paths */
 	m_freem(m);
 	return n;
 }
 #endif /* !__NO_STRICT_ALIGNMENT */
 
 /*
  * Attach a completion callback to a frame.  The function and its
  * argument are stored in a NET80211_TAG_CALLBACK mbuf tag and the
  * mbuf is flagged M_TXCB.  Returns 1 on success and 0 when the tag
  * cannot be allocated (the mbuf is then left untouched).
  */
 int
 ieee80211_add_callback(struct mbuf *m,
 	void (*func)(struct ieee80211_node *, void *, int), void *arg)
 {
 	struct ieee80211_cb *cb;
 	struct m_tag *tag;
 
 	tag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_CALLBACK,
 			sizeof(struct ieee80211_cb), M_NOWAIT);
 	if (tag != NULL) {
 		/* the callback record lives right behind the tag header */
 		cb = (struct ieee80211_cb *)(tag + 1);
 		cb->func = func;
 		cb->arg = arg;
 		m_tag_prepend(m, tag);
 		m->m_flags |= M_TXCB;
 		return 1;
 	}
 	return 0;
 }
 
 /*
  * Invoke the callback recorded on the frame by
  * ieee80211_add_callback, passing it the node, the stored argument
  * and the given status.  Nothing happens if the frame has no
  * NET80211_TAG_CALLBACK tag.
  */
 void
 ieee80211_process_callback(struct ieee80211_node *ni,
 	struct mbuf *m, int status)
 {
 	struct ieee80211_cb *cb;
 	struct m_tag *tag;
 
 	tag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_CALLBACK, NULL);
 	if (tag == NULL)
 		return;
 
 	cb = (struct ieee80211_cb *)(tag + 1);
 	cb->func(ni, cb->arg, status);
 }
 
 /*
  * Transmit a frame to the parent interface.
  *
  * The frame is handed to the if_transmit method of the com's ifnet
  * and that method's return value is passed back to the caller.
  *
  * TODO: if the transmission fails, make sure the parent node is freed
  *   (the callers will first need modifying.)
  */
 int
 ieee80211_parent_xmitpkt(struct ieee80211com *ic,
 	struct mbuf *m)
 {
 	struct ifnet *parent = ic->ic_ifp;
 	/*
 	 * Assert the IC TX lock is held - this enforces the
 	 * processing -> queuing order is maintained
 	 */
 	IEEE80211_TX_LOCK_ASSERT(ic);
 
 	return (parent->if_transmit(parent, m));
 }
 
 /*
  * Transmit a frame to the VAP interface.
  *
  * The frame is handed to the if_transmit method of the vap's ifnet
  * and that method's return value is passed back to the caller.
  */
 int
 ieee80211_vap_xmitpkt(struct ieee80211vap *vap, struct mbuf *m)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	/*
 	 * When transmitting via the VAP, we shouldn't hold
 	 * any IC TX lock as the VAP TX path will acquire it.
 	 */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	return (ifp->if_transmit(ifp, m));
 
 }
 
 #include <sys/libkern.h>
 
 /*
  * Fill the n bytes at p with output from arc4random(), consuming one
  * 32-bit value per chunk of up to four bytes.  n may be zero, in
  * which case nothing is written.
  */
 void
 get_random_bytes(void *p, size_t n)
 {
 	uint8_t *dp = p;
 
 	while (n > 0) {
 		uint32_t v = arc4random();
 		size_t nb = n > sizeof(uint32_t) ? sizeof(uint32_t) : n;
 
 		bcopy(&v, dp, nb);
 		/*
 		 * Advance by the bytes actually written so that dp never
 		 * moves beyond the end of the caller's buffer on a short
 		 * final chunk.
 		 */
 		dp += nb;
 		n -= nb;
 	}
 }
 
 /*
  * Helper function for events that pass just a single mac address.
  *
  * A zeroed ieee80211_join_event carrying the address is handed to
  * rt_ieee80211msg() with the given op, with the current vnet set from
  * the ifnet for the duration of the call.
  */
 static void
 notify_macaddr(struct ifnet *ifp, int op, const uint8_t mac[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_join_event iev;
 
 	CURVNET_SET(ifp->if_vnet);
 	memset(&iev, 0, sizeof(iev));
 	IEEE80211_ADDR_COPY(iev.iev_addr, mac);
 	rt_ieee80211msg(ifp, op, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Announce that a node joined.  For the vap's bss node an ASSOC
  * (newassoc != 0) or REASSOC event is sent with the bssid and the
  * link state is marked up; for any other node a JOIN or REJOIN event
  * is sent with the node's MAC address.
  */
 void
 ieee80211_notify_node_join(struct ieee80211_node *ni, int newassoc)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode join",
 	    (ni == vap->iv_bss) ? "bss " : "");
 
 	if (ni == vap->iv_bss) {
 		notify_macaddr(ifp, newassoc ?
 		    RTM_IEEE80211_ASSOC : RTM_IEEE80211_REASSOC, ni->ni_bssid);
 		if_link_state_change(ifp, LINK_STATE_UP);
 	} else {
 		notify_macaddr(ifp, newassoc ?
 		    RTM_IEEE80211_JOIN : RTM_IEEE80211_REJOIN, ni->ni_macaddr);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Announce that a node left.  For the vap's bss node a DISASSOC
  * event without payload is sent and the link state is marked down;
  * for any other node a LEAVE event is sent with the node's MAC
  * address.
  */
 void
 ieee80211_notify_node_leave(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode leave",
 	    (ni == vap->iv_bss) ? "bss " : "");
 
 	if (ni == vap->iv_bss) {
 		rt_ieee80211msg(ifp, RTM_IEEE80211_DISASSOC, NULL, 0);
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 	} else {
 		/* fire off wireless event station leaving */
 		notify_macaddr(ifp, RTM_IEEE80211_LEAVE, ni->ni_macaddr);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Announce scan completion: log a debug message and send a
  * RTM_IEEE80211_SCAN event without payload for the vap's ifnet.
  */
 void
 ieee80211_notify_scan_done(struct ieee80211vap *vap)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n", "notify scan done");
 
 	/* dispatch wireless event indicating scan completed */
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0);
 	CURVNET_RESTORE();
 }
 
 /*
  * Report a detected replay: log the received sequence counter (rsc)
  * against the key's stored counter for the tid and, when the vap has
  * an ifnet, send a RTM_IEEE80211_REPLAY event describing the frame
  * and key.  The key index reported is wk_rxkeyix unless that is
  * IEEE80211_KEYIX_NONE, in which case wk_keyix is used.
  */
 void
 ieee80211_notify_replay_failure(struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh, const struct ieee80211_key *k,
 	u_int64_t rsc, int tid)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	/* NB: %ju takes an unsigned argument, hence uintmax_t */
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2,
 	    "%s replay detected tid %d <rsc %ju, csc %ju, keyix %u rxkeyix %u>",
 	    k->wk_cipher->ic_name, tid, (uintmax_t) rsc,
 	    (uintmax_t) k->wk_keyrsc[tid],
 	    k->wk_keyix, k->wk_rxkeyix);
 
 	if (ifp != NULL) {		/* NB: for cipher test modules */
 		struct ieee80211_replay_event iev;
 
 		/*
 		 * Zero the event first, as the other notify routines
 		 * do, so no uninitialized stack bytes (e.g. padding)
 		 * are passed along with it.
 		 */
 		memset(&iev, 0, sizeof(iev));
 		IEEE80211_ADDR_COPY(iev.iev_dst, wh->i_addr1);
 		IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2);
 		iev.iev_cipher = k->wk_cipher->ic_cipher;
 		if (k->wk_rxkeyix != IEEE80211_KEYIX_NONE)
 			iev.iev_keyix = k->wk_rxkeyix;
 		else
 			iev.iev_keyix = k->wk_keyix;
 		iev.iev_keyrsc = k->wk_keyrsc[tid];
 		iev.iev_rsc = rsc;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Report a Michael MIC verification failure: log it, count it in
  * is_rx_tkipmic and, when the vap has an ifnet, send a
  * RTM_IEEE80211_MICHAEL event carrying the frame's addresses, the
  * TKIP cipher id and the key index.
  */
 void
 ieee80211_notify_michael_failure(struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh, u_int keyix)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2,
 	    "michael MIC verification failed <keyix %u>", keyix);
 	vap->iv_stats.is_rx_tkipmic++;
 
 	if (ifp != NULL) {		/* NB: for cipher test modules */
 		struct ieee80211_michael_event iev;
 
 		/*
 		 * Zero the event first, as the other notify routines
 		 * do, so no uninitialized stack bytes are passed along
 		 * with it.
 		 */
 		memset(&iev, 0, sizeof(iev));
 		IEEE80211_ADDR_COPY(iev.iev_dst, wh->i_addr1);
 		IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2);
 		iev.iev_cipher = IEEE80211_CIPHER_TKIP;
 		iev.iev_keyix = keyix;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Send a RTM_IEEE80211_WDS event carrying the node's MAC address on
  * the ifnet of the node's vap.
  */
 void
 ieee80211_notify_wds_discover(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	notify_macaddr(ifp, RTM_IEEE80211_WDS, ni->ni_macaddr);
 }
 
 /*
  * Send a RTM_IEEE80211_CSA event on the com's ifnet.  The event
  * carries the channel's flags, frequency and IEEE number together
  * with the caller-supplied mode and count.
  */
 void
 ieee80211_notify_csa(struct ieee80211com *ic,
 	const struct ieee80211_channel *c, int mode, int count)
 {
 	struct ifnet *ifp = ic->ic_ifp;
 	struct ieee80211_csa_event iev;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_flags = c->ic_flags;
 	iev.iev_freq = c->ic_freq;
 	iev.iev_ieee = c->ic_ieee;
 	iev.iev_mode = mode;
 	iev.iev_count = count;
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_CSA, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Send a RTM_IEEE80211_RADAR event on the com's ifnet carrying the
  * channel's flags, frequency and IEEE number.
  */
 void
 ieee80211_notify_radar(struct ieee80211com *ic,
 	const struct ieee80211_channel *c)
 {
 	struct ifnet *ifp = ic->ic_ifp;
 	struct ieee80211_radar_event iev;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_flags = c->ic_flags;
 	iev.iev_freq = c->ic_freq;
 	iev.iev_ieee = c->ic_ieee;
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_RADAR, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Send a RTM_IEEE80211_CAC event on the com's ifnet carrying the
  * channel's flags, frequency and IEEE number plus the event type.
  */
 void
 ieee80211_notify_cac(struct ieee80211com *ic,
 	const struct ieee80211_channel *c, enum ieee80211_notify_cac_event type)
 {
 	struct ifnet *ifp = ic->ic_ifp;
 	struct ieee80211_cac_event iev;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_flags = c->ic_flags;
 	iev.iev_freq = c->ic_freq;
 	iev.iev_ieee = c->ic_ieee;
 	iev.iev_type = type;
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_CAC, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Log the deauthentication of a node and send a
  * RTM_IEEE80211_DEAUTH event carrying its MAC address.
  */
 void
 ieee80211_notify_node_deauth(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%s", "node deauth");
 
 	notify_macaddr(ifp, RTM_IEEE80211_DEAUTH, ni->ni_macaddr);
 }
 
 /*
  * Log the authentication of a node and send a RTM_IEEE80211_AUTH
  * event carrying its MAC address.
  */
 void
 ieee80211_notify_node_auth(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%s", "node auth");
 
 	notify_macaddr(ifp, RTM_IEEE80211_AUTH, ni->ni_macaddr);
 }
 
 /*
  * Send a RTM_IEEE80211_COUNTRY event on the vap's ifnet carrying the
  * given bssid and the two-byte country code cc.
  */
 void
 ieee80211_notify_country(struct ieee80211vap *vap,
 	const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t cc[2])
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_country_event iev;
 
 	memset(&iev, 0, sizeof(iev));
 	IEEE80211_ADDR_COPY(iev.iev_addr, bssid);
 	iev.iev_cc[0] = cc[0];
 	iev.iev_cc[1] = cc[1];
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_COUNTRY, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Send a RTM_IEEE80211_RADIO event on the com's ifnet carrying the
  * given state value.
  */
 void
 ieee80211_notify_radio(struct ieee80211com *ic, int state)
 {
 	struct ifnet *ifp = ic->ic_ifp;
 	struct ieee80211_radio_event iev;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_state = state;
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_RADIO, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Request that a module be loaded.  Unless "notyet" is defined this
  * only prints a message asking for the module to be loaded by hand;
  * the kern_kldload() call is compiled out.
  */
 void
 ieee80211_load_module(const char *modname)
 {
 
 #ifdef notyet
 	(void)kern_kldload(curthread, modname, NULL);
 #else
 	printf("%s: load the %s module by hand for now.\n", __func__, modname);
 #endif
 }
 
 static eventhandler_tag wlan_bpfevent;
 static eventhandler_tag wlan_ifllevent;
 
 /*
  * bpf_track event handler (registered in wlan_modevent).  Only
  * DLT_IEEE802_11_RADIO events on interfaces whose if_init is
  * ieee80211_init are acted upon; everything else is ignored.
  */
 static void
 bpf_track(void *arg, struct ifnet *ifp, int dlt, int attach)
 {
 	/* NB: identify vap's by if_init */
 	if (dlt == DLT_IEEE802_11_RADIO &&
 	    ifp->if_init == ieee80211_init) {
 		struct ieee80211vap *vap = ifp->if_softc;
 		/*
 		 * Track bpf radiotap listener state.  We mark the vap
 		 * to indicate if any listener is present and the com
 		 * to indicate if any listener exists on any associated
 		 * vap.  This flag is used by drivers to prepare radiotap
 		 * state only when needed.
 		 */
 		if (attach) {
 			ieee80211_syncflag_ext(vap, IEEE80211_FEXT_BPF);
 			if (vap->iv_opmode == IEEE80211_M_MONITOR)
 				atomic_add_int(&vap->iv_ic->ic_montaps, 1);
 		} else if (!bpf_peers_present(vap->iv_rawbpf)) {
 			/* detach: only act once no peer is left on the tap */
 			ieee80211_syncflag_ext(vap, -IEEE80211_FEXT_BPF);
 			if (vap->iv_opmode == IEEE80211_M_MONITOR)
 				atomic_subtract_int(&vap->iv_ic->ic_montaps, 1);
 		}
 	}
 }
 
 /*
  * iflladdr_event handler (registered in wlan_modevent).  When the
  * link-level address of an IFT_IEEE80211 parent changes, the new
  * address is propagated to each of its vaps that does not have
  * IEEE80211_FEXT_UNIQMAC set.  Other interface types, and parents
  * without a com, are ignored.
  */
 static void
 wlan_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	struct ieee80211com *ic = ifp->if_l2com;
 	struct ieee80211vap *vap, *next;
 
 	if (ifp->if_type != IFT_IEEE80211 || ic == NULL)
 		return;
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH_SAFE(vap, &ic->ic_vaps, iv_next, next) {
 		/*
 		 * If the MAC address has changed on the parent and it was
 		 * copied to the vap on creation then re-sync.
 		 */
 		if (vap->iv_ic == ic &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_UNIQMAC) == 0) {
 			IEEE80211_ADDR_COPY(vap->iv_myaddr, IF_LLADDR(ifp));
 			/*
 			 * The com lock is dropped around if_setlladdr().
 			 * NOTE(review): the vap list is unprotected in
 			 * that window while "next" is already cached --
 			 * confirm this is safe.
 			 */
 			IEEE80211_UNLOCK(ic);
 			if_setlladdr(vap->iv_ifp, IF_LLADDR(ifp),
 			    IEEE80211_ADDR_LEN);
 			IEEE80211_LOCK(ic);
 		}
 	}
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Module glue.
  *
  * NB: the module name is "wlan" for compatibility with NetBSD.
  *
  * MOD_LOAD registers the bpf_track and iflladdr_event handlers, the
  * interface cloner and the com allocator; it returns ENOMEM if an
  * event handler cannot be registered.  MOD_UNLOAD undoes all of that.
  * Any other event type yields EINVAL.
  */
 static int
 wlan_modevent(module_t mod, int type, void *unused)
 {
 	switch (type) {
 	case MOD_LOAD:
 		if (bootverbose)
 			printf("wlan: <802.11 Link Layer>\n");
 		wlan_bpfevent = EVENTHANDLER_REGISTER(bpf_track,
 		    bpf_track, 0, EVENTHANDLER_PRI_ANY);
 		if (wlan_bpfevent == NULL)
 			return ENOMEM;
 		wlan_ifllevent = EVENTHANDLER_REGISTER(iflladdr_event,
 		    wlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		if (wlan_ifllevent == NULL) {
 			/* back out the first registration before failing */
 			EVENTHANDLER_DEREGISTER(bpf_track, wlan_bpfevent);
 			return ENOMEM;
 		}
 #if __FreeBSD_version >= 1000020
 		wlan_cloner = if_clone_simple(wlanname, wlan_clone_create,
 		    wlan_clone_destroy, 0);
 #else
 		if_clone_attach(&wlan_cloner);
 #endif
 		if_register_com_alloc(IFT_IEEE80211, wlan_alloc, wlan_free);
 		return 0;
 	case MOD_UNLOAD:
 		if_deregister_com_alloc(IFT_IEEE80211);
 #if __FreeBSD_version >= 1000020
 		if_clone_detach(wlan_cloner);
 #else
 		if_clone_detach(&wlan_cloner);
 #endif
 		EVENTHANDLER_DEREGISTER(bpf_track, wlan_bpfevent);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, wlan_ifllevent);
 		return 0;
 	}
 	return EINVAL;
 }
 
 static moduledata_t wlan_mod = {
 #if __FreeBSD_version >= 1000020
 	wlanname,
 #else
 	"wlan",
 #endif
 	wlan_modevent,
 	0
 };
 DECLARE_MODULE(wlan, wlan_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
 MODULE_VERSION(wlan, 1);
 MODULE_DEPEND(wlan, ether, 1, 1, 1);
 #ifdef	IEEE80211_ALQ
 MODULE_DEPEND(wlan, alq, 1, 1, 1);
 #endif	/* IEEE80211_ALQ */
 
Index: head/sys/netinet/if_ether.c
===================================================================
--- head/sys/netinet/if_ether.c	(revision 276691)
+++ head/sys/netinet/if_ether.c	(revision 276692)
@@ -1,947 +1,947 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
  */
 
 /*
  * Ethernet address resolution protocol.
  * TODO:
  *	add "inuse/lock" bit (or ref. count) along with valid bit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/if_llc.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #ifdef INET
 #include <netinet/ip_carp.h>
 #endif
 
 #include <net/if_arc.h>
 #include <net/iso88025.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Pointer cast helpers.  SIN yields a const sockaddr_in pointer, SDL a
  * sockaddr_dl pointer.  Note that SDL does not parenthesize its argument.
  */
 #define SIN(s) ((const struct sockaddr_in *)(s))
 #define SDL(s) ((struct sockaddr_dl *)s)
 
 SYSCTL_DECL(_net_link_ether);
 static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
 
 /* timer values */
 static VNET_DEFINE(int, arpt_keep) = (20*60);	/* once resolved, good for 20
 						 * minutes */
 static VNET_DEFINE(int, arp_maxtries) = 5;	/* resolution attempts before
 						 * returning an error */
 static VNET_DEFINE(int, arp_proxyall) = 0;	/* proxy ARP for all suitable
 						 * requests when non-zero */
 static VNET_DEFINE(int, arpt_down) = 20;	/* keep incomplete entries for
 						 * 20 seconds */
 VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
 VNET_PCPUSTAT_SYSINIT(arpstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(arpstat);
 #endif /* VIMAGE */
 
 static VNET_DEFINE(int, arp_maxhold) = 1;	/* packets held per ARP entry */
 
 /* Accessors for the per-VNET variables defined above. */
 #define	V_arpt_keep		VNET(arpt_keep)
 #define	V_arpt_down		VNET(arpt_down)
 #define	V_arp_maxtries		VNET(arp_maxtries)
 #define	V_arp_proxyall		VNET(arp_proxyall)
 #define	V_arp_maxhold		VNET(arp_maxhold)
 
 /* Export the tunables and statistics under net.link.ether.{inet,arp}. */
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_keep), 0,
 	"ARP entry lifetime in seconds");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxtries), 0,
 	"ARP resolution attempts before returning error");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_proxyall), 0,
 	"Enable proxy ARP for all suitable requests");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_down), 0,
 	"Incomplete ARP entry lifetime in seconds");
 SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
     arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxhold), 0,
 	"Number of packets to hold per ARP entry");
 
 /* Prototypes for the file-local routines defined below. */
 static void	arp_init(void);
 static void	arpintr(struct mbuf *);
 static void	arptimer(void *);
 #ifdef INET
 static void	in_arpinput(struct mbuf *);
 #endif
 
 /*
  * netisr handler record for NETISR_ARP; packets are delivered to
  * arpintr().  Registered from arp_init().
  */
 static const struct netisr_handler arp_nh = {
 	.nh_name = "arp",
 	.nh_handler = arpintr,
 	.nh_proto = NETISR_ARP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * NOTE(review): this guard tests AF_INET, whereas the rest of the file
  * guards IPv4-only code with the INET option -- confirm this is intended.
  */
 #ifdef AF_INET
 /*
  * called by in_scrubprefix() to remove entry from the table when
  * the interface goes away
  *
  * ifp is the interface whose link-layer table is searched and addr is
  * the IPv4 address (copied unchanged into sin_addr.s_addr) whose entry
  * is to be deleted.  The result of the lookup is ignored.
  */
 void
 arp_ifscrub(struct ifnet *ifp, uint32_t addr)
 {
 	struct sockaddr_in addr4;
 
 	/* Build a zeroed AF_INET sockaddr to use as the lookup key. */
 	bzero((void *)&addr4, sizeof(addr4));
 	addr4.sin_len    = sizeof(addr4);
 	addr4.sin_family = AF_INET;
 	addr4.sin_addr.s_addr = addr;
 	/*
 	 * NOTE(review): LLE_DELETE is requested while holding only the
 	 * read lock; confirm lla_lookup() does not need the write lock
 	 * for deletion.
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR),
 	    (struct sockaddr *)&addr4);
 	IF_AFDATA_RUNLOCK(ifp);
 }
 #endif
 
 /*
  * Timeout routine.  Age arp_tab entries periodically.
  *
  * arg is the struct llentry whose la_timer fired.  Every path through
  * this function releases the entry's write lock without first taking
  * it, so the entry is presumably write-locked on entry -- confirm
  * against the callout setup.  Static entries are left alone; any other
  * entry is reported through lle_event (unless already marked deleted)
  * and then freed.
  */
 static void
 arptimer(void *arg)
 {
 	struct llentry *lle = (struct llentry *)arg;
 	struct ifnet *ifp;
 
 	/* Static entries never expire. */
 	if (lle->la_flags & LLE_STATIC) {
 		LLE_WUNLOCK(lle);
 		return;
 	}
 
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
 	if ((lle->la_flags & LLE_DELETED) == 0) {
 		int evt;
 
 		/* A valid entry expired; an unresolved one timed out. */
 		if (lle->la_flags & LLE_VALID)
 			evt = LLENTRY_EXPIRED;
 		else
 			evt = LLENTRY_TIMEDOUT;
 		EVENTHANDLER_INVOKE(lle_event, lle, evt);
 	}
 
 	callout_stop(&lle->la_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
 	/* Drop the entry lock so the table lock can be taken first. */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/* Guard against race with other llentry_free(). */
 	if (lle->la_flags & LLE_LINKED) {
 		size_t pkts_dropped;
 
 		/*
 		 * Still linked: drop our reference and free the entry; the
 		 * value returned by llentry_free() is added to the
 		 * "dropped" statistic.
 		 */
 		LLE_REMREF(lle);
 		pkts_dropped = llentry_free(lle);
 		ARPSTAT_ADD(dropped, pkts_dropped);
 	} else
 		LLE_FREE_LOCKED(lle);
 
 	IF_AFDATA_UNLOCK(ifp);
 
 	ARPSTAT_INC(timeouts);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
  *	- arp header source ethernet address
  *
  * sip may be NULL, in which case a source address is chosen from the
  * AF_INET addresses configured on ifp.  enaddr may be NULL, in which
  * case the CARP address reported for the chosen address, or else the
  * interface link-level address, is used.  Nothing is returned; the
  * function silently gives up if no source address or mbuf is available.
  */
 void
 arprequest(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
 
 	if (sip == NULL) {
 		/*
 		 * The caller did not supply a source address, try to find
 		 * a compatible one among those assigned to this interface.
 		 */
 		struct ifaddr *ifa;
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			/* CARP addresses are only usable if carp says so. */
 			if (ifa->ifa_carp) {
 				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
 					continue;
 				sip = &IA_SIN(ifa)->sin_addr;
 			} else {
 				carpaddr = NULL;
 				sip = &IA_SIN(ifa)->sin_addr;
 			}
 
 			/*
 			 * Stop at the first address on the same subnet as
 			 * the target.  sip is assigned before this test, so
 			 * if no address matches, the last candidate
 			 * examined is the one that ends up being used.
 			 */
 			if (0 == ((sip->s_addr ^ tip->s_addr) &
 			    IA_MASKSIN(ifa)->sin_addr.s_addr))
 				break;  /* found it. */
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (sip == NULL) {
 			printf("%s: cannot find matching address\n", __func__);
 			return;
 		}
 	}
 	if (enaddr == NULL)
 		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
 
 	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
 		return;
 	/* Fixed header plus two protocol and two hardware addresses. */
 	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
 		2 * ifp->if_addrlen;
 	m->m_pkthdr.len = m->m_len;
-	MH_ALIGN(m, m->m_len);
+	M_ALIGN(m, m->m_len);
 	ah = mtod(m, struct arphdr *);
 	bzero((caddr_t)ah, m->m_len);
 #ifdef MAC
 	mac_netinet_arp_send(ifp, m);
 #endif
 	/*
 	 * ar_hrd and the target hardware address are left as zeroed above;
 	 * presumably the output routine fills in ar_hrd -- TODO confirm.
 	 */
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
 	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
 	ah->ar_op = htons(ARPOP_REQUEST);
 	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
 	bcopy(sip, ar_spa(ah), ah->ar_pln);
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	/* Hand the packet to the interface as an AF_ARP broadcast. */
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, NULL);
 	ARPSTAT_INC(txrequests);
 }
 
 /*
  * Resolve an IP address into an ethernet address.
  * On input:
  *    ifp is the interface we use
  *    is_gw != 0 if @dst represents gateway to some destination
  *    m is the mbuf. May be NULL if we don't have a packet.
  *    dst is the next hop,
  *    desten is where we want the address.
  *    pflags, if not NULL, returns lle entry flags.
  *
  * On success, desten and *pflags are filled in and the function returns 0;
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  *
  * For broadcast and multicast packets no table lookup is made and
  * *pflags is left at 0.  When the packet is held it is appended to the
  * entry's la_hold queue; on the EINVAL paths it is freed here.
  */
 int
 arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = 0;
 	u_int flags = 0;
 	struct mbuf *curr = NULL;
 	struct mbuf *next = NULL;
 	int error, renew;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if (m != NULL) {
 		if (m->m_flags & M_BCAST) {
 			/* broadcast */
 			(void)memcpy(desten,
 			    ifp->if_broadcastaddr, ifp->if_addrlen);
 			return (0);
 		}
 		if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {
 			/* multicast */
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
 			return (0);
 		}
 	}
 	/*
 	 * The first pass runs with flags == 0.  A second pass is made with
 	 * LLE_EXCLUSIVE set when the entry turns out to need modification.
 	 */
 retry:
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), flags, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 	/*
 	 * Not found on the first pass and the interface is not marked
 	 * NOARP/STATICARP: create the entry under the table write lock.
 	 */
 	if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
 	    && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
 		flags |= (LLE_CREATE | LLE_EXCLUSIVE);
 		IF_AFDATA_WLOCK(ifp);
 		la = lla_lookup(LLTABLE(ifp), flags, dst);
 		IF_AFDATA_WUNLOCK(ifp);
 	}
 	if (la == NULL) {
 		if (flags & LLE_CREATE)
 			log(LOG_DEBUG,
 			    "arpresolve: can't allocate llinfo for %s on %s\n",
 			    inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname);
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	/* Resolved and either static or not yet expired: use it. */
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
 		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
 		/*
 		 * If entry has an expiry time and it is approaching,
 		 * see if we need to send an ARP request within this
 		 * arpt_down interval.
 		 */
 		if (!(la->la_flags & LLE_STATIC) &&
 		    time_uptime + la->la_preempt > la->la_expire) {
 			arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 			la->la_preempt--;
 		}
 
 		if (pflags != NULL)
 			*pflags = la->la_flags;
 		error = 0;
 		goto done;
 	}
 
 	if (la->la_flags & LLE_STATIC) {   /* should not happen! */
 		log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
 		    inet_ntoa(SIN(dst)->sin_addr));
 		m_freem(m);
 		error = EINVAL;
 		goto done;
 	}
 
 	/*
 	 * la_expire is set to time_uptime whenever a request is sent below,
 	 * so renew is true when no request has been sent yet or when the
 	 * last one was not sent during the current second.
 	 */
 	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
 	/* The entry is about to be modified: redo the lookup exclusively. */
 	if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
 		flags |= LLE_EXCLUSIVE;
 		LLE_RUNLOCK(la);
 		goto retry;
 	}
 	/*
 	 * There is an arptab entry, but no ethernet address
 	 * response yet.  Add the mbuf to the list, dropping
 	 * the oldest packet if we have exceeded the system
 	 * setting.
 	 */
 	if (m != NULL) {
 		if (la->la_numheld >= V_arp_maxhold) {
 			if (la->la_hold != NULL) {
 				/* Drop the packet at the head of the queue. */
 				next = la->la_hold->m_nextpkt;
 				m_freem(la->la_hold);
 				la->la_hold = next;
 				la->la_numheld--;
 				ARPSTAT_INC(dropped);
 			}
 		}
 		/* Append m at the tail of the held-packet list. */
 		if (la->la_hold != NULL) {
 			curr = la->la_hold;
 			while (curr->m_nextpkt != NULL)
 				curr = curr->m_nextpkt;
 			curr->m_nextpkt = m;
 		} else
 			la->la_hold = m;
 		la->la_numheld++;
 		/*
 		 * No request will be sent, so the write lock is no longer
 		 * needed; flags is updated so "done" releases a read lock.
 		 */
 		if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
 			flags &= ~LLE_EXCLUSIVE;
 			LLE_DOWNGRADE(la);
 		}
 
 	}
 	/*
 	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
 	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
 	 * if we have already sent arp_maxtries ARP requests. Retransmit the
 	 * ARP request, but not faster than one request per second.
 	 */
 	if (la->la_asked < V_arp_maxtries)
 		error = EWOULDBLOCK;	/* First request. */
 	else
 		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
 
 	if (renew) {
 		int canceled;
 
 		/*
 		 * Take a reference for the (re)armed timer; if
 		 * callout_reset() reports a non-zero result the extra
 		 * reference is dropped again.
 		 */
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime;
 		canceled = callout_reset(&la->la_timer, hz * V_arpt_down,
 		    arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 		la->la_asked++;
 		/* Release the entry before transmitting the request. */
 		LLE_WUNLOCK(la);
 		arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 		return (error);
 	}
 done:
 	/* Release whichever kind of lock the last lookup acquired. */
 	if (flags & LLE_EXCLUSIVE)
 		LLE_WUNLOCK(la);
 	else
 		LLE_RUNLOCK(la);
 	return (error);
 }
 
 /*
  * Common length and type checks are done here,
  * then the protocol-specific routine is called.
  *
  * m is the received ARP packet.  It is passed on to in_arpinput() for
  * ETHERTYPE_IP (when INET is compiled in) and freed here otherwise.
  */
 static void
 arpintr(struct mbuf *m)
 {
 	struct arphdr *ar;
 
 	/* Make the fixed part of the ARP header contiguous. */
 	if (m->m_len < sizeof(struct arphdr) &&
 	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
 		log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n");
 		return;
 	}
 	ar = mtod(m, struct arphdr *);
 
 	/* Only these hardware address formats are accepted. */
 	if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
 	    ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
 	    ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
 	    ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 &&
 	    ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) {
 		/*
 		 * NOTE(review): the sender/target hardware addresses are
 		 * read here although only sizeof(struct arphdr) bytes have
 		 * been checked so far -- confirm this cannot read past
 		 * the data in the first mbuf.
 		 */
 		log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)"
 		    " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "",
 		    ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":",
 		    ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":");
 		m_freem(m);
 		return;
 	}
 
 	/* Now make the variable-length address fields contiguous too. */
 	if (m->m_len < arphdr_len(ar)) {
 		if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
 			log(LOG_NOTICE, "arp: runt packet\n");
 			/* m is NULL at this point, so nothing is freed. */
 			m_freem(m);
 			return;
 		}
 		ar = mtod(m, struct arphdr *);
 	}
 
 	ARPSTAT_INC(received);
 	switch (ntohs(ar->ar_pro)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		in_arpinput(m);
 		return;
 #endif
 	}
 	/* Unhandled protocol type. */
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ARP for Internet protocols on 10 Mb/s Ethernet.
  * Algorithm is that given in RFC 826.
  * In addition, a sanity check is performed on the sender
  * protocol address, to catch impersonators.
  * We no longer handle negotiations for use of trailer protocol:
  * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
  * along with IP replies if we wanted trailers sent to us,
  * and also sent them in response to IP replies.
  * This allowed either end to announce the desire to receive
  * trailer packets.
  * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
  * but formerly didn't normally send requests.
  */
 /* Logging and policy knobs for the input path; exported as sysctls below. */
 static int log_arp_wrong_iface = 1;
 static int log_arp_movements = 1;
 static int log_arp_permanent_modify = 1;
 static int allow_multicast = 0;
 /* State and limit handed to ppsratecheck() by ARP_LOG. */
 static struct timeval arp_lastlog;
 static int arp_curpps;
 static int arp_maxpps = 1;
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
 	&log_arp_wrong_iface, 0,
 	"log arp packets arriving on the wrong interface");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
 	&log_arp_movements, 0,
 	"log arp replies from MACs different than the one in the cache");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
 	&log_arp_permanent_modify, 0,
 	"log arp replies from MACs different than the one in the permanent arp entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
 	&allow_multicast, 0, "accept multicast addresses");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
 	CTLFLAG_RW, &arp_maxpps, 0,
 	"Maximum number of remotely triggered ARP messages that can be "
 	"logged per second");
 
 /*
  * Rate-limited log(): the message is prefixed with "arp: " and is only
  * emitted when ppsratecheck() allows it.
  */
 #define	ARP_LOG(pri, ...)	do {					\
 	if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
 		log((pri), "arp: " __VA_ARGS__);			\
 } while (0)
 
 /*
  * Process a received ARP packet for IPv4.
  *
  * The packet is validated, a local address to act as "myaddr" is
  * selected (target match, sender match, bridge-member match, first inet
  * address on the interface, or -- when bridged -- any inet address), the
  * link-layer table entry for the sender is created/updated, any packets
  * held on that entry are transmitted, and finally a reply is sent for
  * requests that target our address, a published entry, or (with
  * arp_proxyall) a host reachable through another interface.
  *
  * m is consumed on every path: it is either reused for the reply and
  * passed to if_output, freed at "drop", or already released by a failed
  * m_pullup().
  */
 static void
 in_arpinput(struct mbuf *m)
 {
 	struct arphdr *ah;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct llentry *la = NULL;
 	struct rtentry *rt;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	struct sockaddr sa;
 	struct in_addr isaddr, itaddr, myaddr;
 	u_int8_t *enaddr = NULL;
 	int op, flags;
 	int req_len;
 	int bridged = 0, is_bridge = 0;
 	int carped;
 	struct sockaddr_in sin;
 	/*
 	 * NOTE(review): only these three fields are initialised here; the
 	 * bzero(&sin) further down is skipped by the early "goto reply"
 	 * paths, so the remaining fields are indeterminate if sin is then
 	 * used for the lookups in the reply section -- confirm harmless.
 	 */
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = 0;
 
 	/* bridged: received on a bridge member; is_bridge: on the bridge. */
 	if (ifp->if_bridge)
 		bridged = 1;
 	if (ifp->if_type == IFT_BRIDGE)
 		is_bridge = 1;
 
 	req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
 	if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
 		ARP_LOG(LOG_NOTICE, "runt packet -- m_pullup failed\n");
 		return;
 	}
 
 	ah = mtod(m, struct arphdr *);
 	/*
 	 * ARP is only for IPv4 so we can reject packets with
 	 * a protocol length not equal to an IPv4 address.
 	 */
 	if (ah->ar_pln != sizeof(struct in_addr)) {
 		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
 		    sizeof(struct in_addr));
 		goto drop;
 	}
 
 	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
 		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
 		goto drop;
 	}
 
 	/* isaddr/itaddr: sender and target protocol addresses. */
 	op = ntohs(ah->ar_op);
 	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
 	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
 
 	if (op == ARPOP_REPLY)
 		ARPSTAT_INC(rxreplies);
 
 	/*
 	 * For a bridge, we want to check the address irrespective
 	 * of the receive interface. (This will change slightly
 	 * when we have clusters of interfaces).
 	 */
 	IN_IFADDR_RLOCK();
 	/* First choice: a local address equal to the target address. */
 	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
 		    (ia->ia_ifa.ifa_carp == NULL ||
 		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK();
 			goto match;
 		}
 	}
 	/* Second choice: a local address equal to the sender address. */
 	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK();
 			goto match;
 		}
 
 #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
   (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
   !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
   addr == ia->ia_addr.sin_addr.s_addr)
 	/*
 	 * Check the case when bridge shares its MAC address with
 	 * some of its children, so packets are claimed by bridge
 	 * itself (bridge_input() does it first), but they are really
 	 * meant to be destined to the bridge member.
 	 */
 	if (is_bridge) {
 		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
 				ifa_ref(&ia->ia_ifa);
 				/* Continue as if received on the member. */
 				ifp = ia->ia_ifp;
 				IN_IFADDR_RUNLOCK();
 				goto match;
 			}
 		}
 	}
 #undef BDG_MEMBER_MATCHES_ARP
 	IN_IFADDR_RUNLOCK();
 
 	/*
 	 * No match, use the first inet address on the receive interface
 	 * as a dummy address for the rest of the function.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (ifa->ifa_carp == NULL ||
 		    (*carp_iamatch_p)(ifa, &enaddr))) {
 			ia = ifatoia(ifa);
 			ifa_ref(ifa);
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/*
 	 * If bridging, fall back to using any inet address.
 	 */
 	IN_IFADDR_RLOCK();
 	if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
 		IN_IFADDR_RUNLOCK();
 		goto drop;
 	}
 	ifa_ref(&ia->ia_ifa);
 	IN_IFADDR_RUNLOCK();
 	/*
 	 * Every path reaching "match" holds a reference on ia; it is
 	 * released as soon as the fields needed below have been copied out.
 	 */
 match:
 	if (!enaddr)
 		enaddr = (u_int8_t *)IF_LLADDR(ifp);
 	carped = (ia->ia_ifa.ifa_carp != NULL);
 	myaddr = ia->ia_addr.sin_addr;
 	ifa_free(&ia->ia_ifa);
 	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
 		goto drop;	/* it's from me, ignore it. */
 	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
 		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
 		    "%s!\n", inet_ntoa(isaddr));
 		goto drop;
 	}
 	/*
 	 * Warn if another host is using the same IP address, but only if the
 	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
 	 * case we suppress the warning to avoid false positive complaints of
 	 * potential misconfiguration.
 	 */
 	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
 	    myaddr.s_addr != 0) {
 		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
 		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		   inet_ntoa(isaddr), ifp->if_xname);
 		/* Force the reply below to claim our own address. */
 		itaddr = myaddr;
 		ARPSTAT_INC(dupips);
 		goto reply;
 	}
 	/* With static ARP the table is never updated from the wire. */
 	if (ifp->if_flags & IFF_STATICARP)
 		goto reply;
 
 	/*
 	 * Look up the sender's entry, write-locked.  An entry is only
 	 * created when the packet is addressed to us.
 	 */
 	bzero(&sin, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr = isaddr;
 	flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
 	flags |= LLE_EXCLUSIVE;
 	IF_AFDATA_LOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
 	IF_AFDATA_UNLOCK(ifp);
 	if (la != NULL) {
 		/* the following is not an error when doing bridging */
 		if (!bridged && la->lle_tbl->llt_ifp != ifp) {
 			if (log_arp_wrong_iface)
 				ARP_LOG(LOG_WARNING, "%s is on %s "
 				    "but got reply from %*D on %s\n",
 				    inet_ntoa(isaddr),
 				    la->lle_tbl->llt_ifp->if_xname,
 				    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 				    ifp->if_xname);
 			LLE_WUNLOCK(la);
 			goto reply;
 		}
 		/* The sender's hardware address differs from the cached one. */
 		if ((la->la_flags & LLE_VALID) &&
 		    bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
 			/* Static entries are never overwritten. */
 			if (la->la_flags & LLE_STATIC) {
 				LLE_WUNLOCK(la);
 				if (log_arp_permanent_modify)
 					ARP_LOG(LOG_ERR,
 					    "%*D attempts to modify "
 					    "permanent entry for %s on %s\n",
 					    ifp->if_addrlen,
 					    (u_char *)ar_sha(ah), ":",
 					    inet_ntoa(isaddr), ifp->if_xname);
 				goto reply;
 			}
 			if (log_arp_movements) {
 				ARP_LOG(LOG_INFO, "%s moved from %*D "
 				    "to %*D on %s\n",
 				    inet_ntoa(isaddr),
 				    ifp->if_addrlen,
 				    (u_char *)&la->ll_addr, ":",
 				    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 				    ifp->if_xname);
 			}
 		}
 
 		/* Refuse addresses whose length does not fit the interface. */
 		if (ifp->if_addrlen != ah->ar_hln) {
 			LLE_WUNLOCK(la);
 			ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
 			    "i/f %d (ignored)\n", ifp->if_addrlen,
 			    (u_char *) ar_sha(ah), ":", ah->ar_hln,
 			    ifp->if_addrlen);
 			goto drop;
 		}
 		/* Record the sender's hardware address and mark it valid. */
 		(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
 		la->la_flags |= LLE_VALID;
 
 		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 		/* Non-static entries get their expiry timer (re)armed. */
 		if (!(la->la_flags & LLE_STATIC)) {
 			int canceled;
 
 			LLE_ADDREF(la);
 			la->la_expire = time_uptime + V_arpt_keep;
 			canceled = callout_reset(&la->la_timer,
 			    hz * V_arpt_keep, arptimer, la);
 			if (canceled)
 				LLE_REMREF(la);
 		}
 		la->la_asked = 0;
 		la->la_preempt = V_arp_maxtries;
 		/*
 		 * The packets are all freed within the call to the output
 		 * routine.
 		 *
 		 * NB: The lock MUST be released before the call to the
 		 * output routine.
 		 */
 		if (la->la_hold != NULL) {
 			struct mbuf *m_hold, *m_hold_next;
 
 			/*
 			 * Detach the held list and copy the destination
 			 * address while still locked; la is not touched
 			 * after the unlock.
 			 */
 			m_hold = la->la_hold;
 			la->la_hold = NULL;
 			la->la_numheld = 0;
 			memcpy(&sa, L3_ADDR(la), sizeof(sa));
 			LLE_WUNLOCK(la);
 			for (; m_hold != NULL; m_hold = m_hold_next) {
 				m_hold_next = m_hold->m_nextpkt;
 				m_hold->m_nextpkt = NULL;
 				/* Avoid confusing lower layers. */
 				m_clrprotoflags(m_hold);
 				(*ifp->if_output)(ifp, m_hold, &sa, NULL);
 			}
 		} else
 			LLE_WUNLOCK(la);
 	}
 	/* Only requests are answered; everything else is dropped here. */
 reply:
 	if (op != ARPOP_REQUEST)
 		goto drop;
 	ARPSTAT_INC(rxrequests);
 
 	if (itaddr.s_addr == myaddr.s_addr) {
 		/* Shortcut.. the receiving interface is the target. */
 		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	} else {
 		struct llentry *lle = NULL;
 
 		/* Not for us: look for a published entry for the target. */
 		sin.sin_addr = itaddr;
 		IF_AFDATA_RLOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
 			/* Answer with the published hardware address. */
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
 			LLE_RUNLOCK(lle);
 		} else {
 
 			if (lle != NULL)
 				LLE_RUNLOCK(lle);
 
 			/* Without proxyall there is nothing more to try. */
 			if (!V_arp_proxyall)
 				goto drop;
 
 			sin.sin_addr = itaddr;
 			/* XXX MRT use table 0 for arp reply  */
 			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
 			if (!rt)
 				goto drop;
 
 			/*
 			 * Don't send proxies for nodes on the same interface
 			 * as this one came out of, or we'll get into a fight
 			 * over who claims what Ether address.
 			 */
 			if (!rt->rt_ifp || rt->rt_ifp == ifp) {
 				RTFREE_LOCKED(rt);
 				goto drop;
 			}
 			RTFREE_LOCKED(rt);
 
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 
 			/*
 			 * Also check that the node which sent the ARP packet
 			 * is on the interface we expect it to be on. This
 			 * avoids ARP chaos if an interface is connected to the
 			 * wrong network.
 			 */
 			sin.sin_addr = isaddr;
 
 			/* XXX MRT use table 0 for arp checks */
 			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
 			if (!rt)
 				goto drop;
 			if (rt->rt_ifp != ifp) {
 				ARP_LOG(LOG_INFO, "proxy: ignoring request"
 				    " from %s via %s, expecting %s\n",
 				    inet_ntoa(isaddr), ifp->if_xname,
 				    rt->rt_ifp->if_xname);
 				RTFREE_LOCKED(rt);
 				goto drop;
 			}
 			RTFREE_LOCKED(rt);
 
 #ifdef DEBUG_PROXY
 			printf("arp: proxying for %s\n", inet_ntoa(itaddr));
 #endif
 		}
 	}
 
 	if (itaddr.s_addr == myaddr.s_addr &&
 	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
 		/* RFC 3927 link-local IPv4; always reply by broadcast. */
 #ifdef DEBUG_LINKLOCAL
 		printf("arp: sending reply for link-local addr %s\n",
 		    inet_ntoa(itaddr));
 #endif
 		m->m_flags |= M_BCAST;
 		m->m_flags &= ~M_MCAST;
 	} else {
 		/* default behaviour; never reply by broadcast. */
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 	}
 	/*
 	 * Turn the request into the reply in place: the old sender becomes
 	 * the target and the requested address becomes the sender.
 	 */
 	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
 	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = NULL;
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, NULL);
 	ARPSTAT_INC(txreplies);
 	return;
 
 drop:
 	m_freem(m);
 }
 #endif
 
 /*
  * Announce a newly configured interface address and install a static
  * link-layer entry for it.  Does nothing for CARP-managed addresses.
  * For any address other than INADDR_ANY an ARP request is sent whose
  * sender and target protocol addresses are both the interface address.
  * ifa->ifa_rtrequest is cleared unless the address is CARP-managed.
  */
 void
 arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	struct llentry *lle;
 
 	if (ifa->ifa_carp != NULL)
 		return;
 
 	if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) {
 		arprequest(ifp, &IA_SIN(ifa)->sin_addr,
 				&IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
 		/*
 		 * interface address is considered static entry
 		 * because the output of the arp utility shows
 		 * that L2 entry as permanent
 		 */
 		IF_AFDATA_LOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC),
 				 (struct sockaddr *)IA_SIN(ifa));
 		IF_AFDATA_UNLOCK(ifp);
 		if (lle == NULL)
 			log(LOG_INFO, "arp_ifinit: cannot create arp "
 			    "entry for interface address\n");
 		else
 			/* Looked up without LLE_EXCLUSIVE: read-locked. */
 			LLE_RUNLOCK(lle);
 	}
 	ifa->ifa_rtrequest = NULL;
 }
 
 /*
  * Variant of arp_ifinit() that takes the source hardware address from
  * the caller (enaddr).  It sends the same self-addressed ARP request for
  * any address other than INADDR_ANY, but creates no link-layer entry
  * and makes no CARP check.  ifa->ifa_rtrequest is always cleared.
  */
 void
 arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr)
 {
 	if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
 		arprequest(ifp, &IA_SIN(ifa)->sin_addr,
 				&IA_SIN(ifa)->sin_addr, enaddr);
 	ifa->ifa_rtrequest = NULL;
 }
 
 /*
  * Startup hook: registers the ARP netisr handler (arp_nh).  Run via the
  * SYSINIT below at SI_SUB_PROTO_DOMAIN.
  */
 static void
 arp_init(void)
 {
 
 	netisr_register(&arp_nh);
 }
 SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
Index: head/sys/netinet/igmp.c
===================================================================
--- head/sys/netinet/igmp.c	(revision 276691)
+++ head/sys/netinet/igmp.c	(revision 276692)
@@ -1,3655 +1,3655 @@
 /*-
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 /*
  * Internet Group Management Protocol (IGMP) routines.
  * [RFC1112, RFC2236, RFC3376]
  *
  * Written by Steve Deering, Stanford, May 1988.
  * Modified by Rosen Sharma, Stanford, Aug 1994.
  * Modified by Bill Fenner, Xerox PARC, Feb 1995.
  * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
  * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
  *
  * MULTICAST Revision: 3.5.1.4
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/condvar.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/igmp.h>
 #include <netinet/igmp_var.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 static struct igmp_ifinfo *
 		igi_alloc_locked(struct ifnet *);
 static void	igi_delete_locked(const struct ifnet *);
 static void	igmp_dispatch_queue(struct ifqueue *, int, const int);
 static void	igmp_fasttimo_vnet(void);
 static void	igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
 static int	igmp_handle_state_change(struct in_multi *,
 		    struct igmp_ifinfo *);
 static int	igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
 static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
 		    /*const*/ struct igmpv3 *);
 static int	igmp_input_v3_group_query(struct in_multi *,
 		    struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
 static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static void	igmp_intr(struct mbuf *);
 static int	igmp_isgroupreported(const struct in_addr);
 static struct mbuf *
 		igmp_ra_alloc(void);
 #ifdef KTR
 static char *	igmp_rec_type_to_str(const int);
 #endif
 static void	igmp_set_version(struct igmp_ifinfo *, const int);
 static void	igmp_slowtimo_vnet(void);
 static int	igmp_v1v2_queue_report(struct in_multi *, const int);
 static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
 static void	igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
 static void	igmp_v2_update_group(struct in_multi *, const int);
 static void	igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
 static void	igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
 static struct mbuf *
 		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
 static int	igmp_v3_enqueue_group_record(struct ifqueue *,
 		    struct in_multi *, const int, const int, const int);
 static int	igmp_v3_enqueue_filter_change(struct ifqueue *,
 		    struct in_multi *);
 static void	igmp_v3_process_group_timers(struct igmp_ifinfo *,
 		    struct ifqueue *, struct ifqueue *, struct in_multi *,
 		    const int);
 static int	igmp_v3_merge_state_changes(struct in_multi *,
 		    struct ifqueue *);
 static void	igmp_v3_suppress_group_record(struct in_multi *);
 static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
 
 static const struct netisr_handler igmp_nh = {
 	.nh_name = "igmp",
 	.nh_handler = igmp_intr,
 	.nh_proto = NETISR_IGMP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * System-wide globals.
  *
  * Unlocked access to these is OK, except for the global IGMP output
  * queue. The IGMP subsystem lock ends up being system-wide for the moment,
  * because all VIMAGEs have to share a global output queue, as netisrs
  * themselves are not virtualized.
  *
  * Locking:
  *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * All output is delegated to the netisr.
  *    Now that Giant has been eliminated, the netisr may be inlined.
  *  * IN_MULTI_LOCK covers in_multi.
  *  * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
  *    including the output queue.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *  * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
  *    therefore it is not refcounted.
  *    We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
  *
  * Reference counting
  *  * IGMP acquires its own reference every time an in_multi is passed to
  *    it and the group is being joined for the first time.
  *  * IGMP releases its reference(s) on in_multi in a deferred way,
  *    because the operations which process the release run as part of
  *    a loop whose control variables are directly affected by the release
  *    (that, and not recursing on the IF_ADDR_LOCK).
  *
  * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
  * to a vnet in ifp->if_vnet.
  *
  * SMPng: XXX We may potentially race operations on ifma_protospec.
  * The problem is that we currently lack a clean way of taking the
  * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
  * as anything which modifies ifma needs to be covered by that lock.
  * So check for ifma_protospec being NULL before proceeding.
  */
 struct mtx		 igmp_mtx;
 
 struct mbuf		*m_raopt;		 /* Router Alert option */
 static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
 
 /*
  * VIMAGE-wide globals.
  *
  * The IGMPv3 timers themselves need to run per-image, however,
  * protosw timers run globally (see tcp).
  * An ifnet can only be in one vimage at a time, and the loopback
  * ifnet, loif, is itself virtualized.
  * It would otherwise be possible to seriously hose IGMP state,
  * and create inconsistencies in upstream multicast routing, if you have
  * multiple VIMAGEs running on the same link joining different multicast
  * groups, UNLESS the "primary IP address" is different. This is because
  * IGMP for IPv4 does not force link-local addresses to be used for each
  * node, unlike MLD for IPv6.
  * Obviously the IGMPv3 per-interface state has per-vimage granularity
  * also as a result.
  *
  * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
  * policy to control the address used by IGMP on the link.
  */
 static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
 							 * query response */
 static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
 							 * retransmit */
 static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
 							 * report; IGMPv3 g/sg
 							 * query response */
 
 #define	V_interface_timers_running	VNET(interface_timers_running)
 #define	V_state_change_timers_running	VNET(state_change_timers_running)
 #define	V_current_state_timers_running	VNET(current_state_timers_running)
 
 static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
 static VNET_DEFINE(struct igmpstat, igmpstat) = {
 	.igps_version = IGPS_VERSION_3,
 	.igps_len = sizeof(struct igmpstat),
 };
 static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
 
 #define	V_igi_head			VNET(igi_head)
 #define	V_igmpstat			VNET(igmpstat)
 #define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
 
 static VNET_DEFINE(int, igmp_recvifkludge) = 1;
 static VNET_DEFINE(int, igmp_sendra) = 1;
 static VNET_DEFINE(int, igmp_sendlocal) = 1;
 static VNET_DEFINE(int, igmp_v1enable) = 1;
 static VNET_DEFINE(int, igmp_v2enable) = 1;
 static VNET_DEFINE(int, igmp_legacysupp);
 static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
 
 #define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
 #define	V_igmp_sendra			VNET(igmp_sendra)
 #define	V_igmp_sendlocal		VNET(igmp_sendlocal)
 #define	V_igmp_v1enable			VNET(igmp_v1enable)
 #define	V_igmp_v2enable			VNET(igmp_v2enable)
 #define	V_igmp_legacysupp		VNET(igmp_legacysupp)
 #define	V_igmp_default_version		VNET(igmp_default_version)
 
 /*
  * Virtualized sysctls.
  */
 SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmpstat), igmpstat, "");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_recvifkludge), 0,
     "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendra), 0,
     "Send IP Router Alert option in IGMPv2/v3 messages");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendlocal), 0,
     "Send IGMP membership reports for 224.0.0.0/24 groups");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v1enable), 0,
     "Enable backwards compatibility with IGMPv1");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v2enable), 0,
     "Enable backwards compatibility with IGMPv2");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_legacysupp), 0,
     "Allow v1/v2 reports to suppress v3 group responses");
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
     "Default version of IGMP to run on each interface");
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
     "Rate limit for IGMPv3 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
     "Per-interface IGMPv3 state");
 
 /*
  * Record output context in the packet header of a queued IGMP mbuf:
  * the interface's vnet (VIMAGE kernels only) in PH_loc.ptr and the
  * interface index in flowid.  igmp_restore_context() reads the index
  * back from flowid.
  */
 static __inline void
 igmp_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 	m->m_pkthdr.flowid = ifp->if_index;
 }
 
 /*
  * Clear the context fields written by igmp_save_context().
  * Both PH_loc.ptr and flowid are reset unconditionally, including on
  * kernels built without VIMAGE.
  */
 static __inline void
 igmp_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.PH_loc.ptr = NULL;
 	m->m_pkthdr.flowid = 0;
 }
 
 #ifdef KTR
 /*
  * Format an IPv4 address given in host byte order.
  * The value is converted to network order with htonl() and handed to
  * inet_ntoa(); the returned pointer is whatever inet_ntoa() returns.
  * Only compiled when KTR tracing is enabled.
  */
 static __inline char *
 inet_ntoa_haddr(in_addr_t haddr)
 {
 	struct in_addr ia;
 
 	ia.s_addr = htonl(haddr);
 	return (inet_ntoa(ia));
 }
 #endif
 
 /*
  * Restore context from a queued IGMP output chain.
  * Return saved ifindex, i.e. the value igmp_save_context() stored in
  * the packet header's flowid field.
  *
  * VIMAGE: The assertion is there to make sure that we
  * actually called CURVNET_SET() with what's in the mbuf chain.
  * It is currently compiled out: it sits under "#ifdef notyet".
  */
 static __inline uint32_t
 igmp_restore_context(struct mbuf *m)
 {
 
 #ifdef notyet
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * Retrieve or set default IGMP version.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
 {
 	int	 error;
 	int	 new;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	new = V_igmp_default_version;
 
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
 	     V_igmp_default_version, new);
 
 	V_igmp_default_version = new;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	i = V_igmp_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
 	     V_igmp_gsrdelay.tv_sec, i);
 	V_igmp_gsrdelay.tv_sec = i;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Copy the struct igmp_ifinfo of one interface out to userland.
  * The interface is selected by ifindex, passed as the single trailing
  * name component.  For use by ifmcstat(8).
  *
  * Returns EPERM on any write attempt, EINVAL when the name does not
  * consist of exactly one component, ENOENT when the index is out of
  * range or no matching interface/igmp_ifinfo exists, and otherwise the
  * result of SYSCTL_OUT().
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	struct igmp_ifinfo	*igi;
 	struct ifnet		*ifp;
 	u_int			 namelen;
 	int			*name;
 	int			 error;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	/* Read-only node keyed by exactly one component. */
 	if (req->newptr != NULL)
 		return (EPERM);
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
 	if (error != 0)
 		return (error);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	/* ENOENT stands unless a matching record is found and copied out. */
 	error = ENOENT;
 	ifp = NULL;
 	if (name[0] > 0 && name[0] <= V_if_index)
 		ifp = ifnet_byindex(name[0]);
 
 	if (ifp != NULL) {
 		LIST_FOREACH(igi, &V_igi_head, igi_link) {
 			if (igi->igi_ifp != ifp)
 				continue;
 			error = SYSCTL_OUT(req, igi,
 			    sizeof(struct igmp_ifinfo));
 			break;
 		}
 	}
 
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains
  * using the netisr.
  *
  * ifq:   queue to drain; packets are removed with _IF_DEQUEUE().
  * limit: maximum number of packets to dispatch.  The counter is
  *        pre-decremented and compared against zero only, so a limit of
  *        zero or less never matches and the queue is drained completely.
  * loop:  when non-zero, M_IGMP_LOOP is set on each packet before it is
  *        handed to the netisr.
  *
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
 {
 	struct mbuf *m;
 
 	for (;;) {
 		_IF_DEQUEUE(ifq, m);
 		if (m == NULL)
 			break;
 		/* Argument order follows the format: packet, then queue. */
 		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, m, ifq);
 		if (loop)
 			m->m_flags |= M_IGMP_LOOP;
 		netisr_dispatch(NETISR_IGMP, m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Decide whether IGMP reports may be sent for a group address.
  *
  * Reports are never sent for ALL-HOSTS (224.0.0.1).  When the
  * net.inet.igmp.sendlocal sysctl is 0, reports are also withheld for
  * every group in the 224.0.0.0/24 link-local scope; note that this may
  * break IGMP snooping switches which rely on the old report behaviour.
  *
  * Returns 0 when reports for the group must be suppressed and 1 when
  * they should be issued.
  */
 static __inline int
 igmp_isgroupreported(const struct in_addr addr)
 {
 
 	/* The all-hosts group is never reported. */
 	if (in_allhosts(addr))
 		return (0);
 
 	/* Link-local groups are reported only when sendlocal is enabled. */
 	if (!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Build the IP Router Alert option used in outgoing packets.
  *
  * The option is stored in a freshly obtained mbuf as a struct ipoption:
  * a zero first-hop destination followed by the four option bytes.
  * m_len covers the destination field plus the option length byte's
  * value.  The mbuf is requested with M_WAITOK.
  */
 static struct mbuf *
 igmp_ra_alloc(void)
 {
 	struct ipoption	*opt;
 	struct mbuf	*ra;
 
 	ra = m_get(M_WAITOK, MT_DATA);
 	opt = mtod(ra, struct ipoption *);
 
 	opt->ipopt_dst.s_addr = INADDR_ANY;
 
 	opt->ipopt_list[0] = IPOPT_RA;	/* option type: Router Alert */
 	opt->ipopt_list[1] = 0x04;	/* option length: 4 bytes */
 	opt->ipopt_list[2] = IPOPT_EOL;	/* end of IP option list */
 	opt->ipopt_list[3] = 0x00;	/* padding */
 
 	ra->m_len = sizeof(opt->ipopt_dst) + opt->ipopt_list[1];
 
 	return (ra);
 }
 
 /*
  * Attach IGMP when PF_INET is attached to an interface.
  */
 struct igmp_ifinfo *
 igmp_domifattach(struct ifnet *ifp)
 {
 	struct igmp_ifinfo *igi;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = igi_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		igi->igi_flags |= IGIF_SILENT;
 
 	IGMP_UNLOCK();
 
 	return (igi);
 }
 
 /*
  * Allocate and initialise the IGMP state for an interface and link it
  * onto the per-vnet igi_head list.  The IGMP lock must be held.
  *
  * The allocation does not sleep (M_NOWAIT); NULL is returned when it
  * fails, in which case nothing is linked.
  *
  * VIMAGE: assume curvnet set by caller.
  */
 static struct igmp_ifinfo *
 igi_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct igmp_ifinfo *igi;
 
 	IGMP_LOCK_ASSERT();
 
 	igi = malloc(sizeof(*igi), M_IGMP, M_NOWAIT | M_ZERO);
 	if (igi == NULL)
 		return (NULL);
 
 	/* Protocol defaults for a freshly attached interface. */
 	igi->igi_ifp = ifp;
 	igi->igi_version = V_igmp_default_version;
 	igi->igi_flags = 0;
 	igi->igi_rv = IGMP_RV_INIT;
 	igi->igi_qi = IGMP_QI_INIT;
 	igi->igi_qri = IGMP_QRI_INIT;
 	igi->igi_uri = IGMP_URI_INIT;
 
 	SLIST_INIT(&igi->igi_relinmhead);
 
 	/* Responses to general queries are subject to bounds. */
 	IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
 
 	CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
 	     ifp, ifp->if_xname);
 
 	return (igi);
 }
 
 /*
  * Hook for ifdetach.
  *
  * When the interface is running IGMPv3, walk its IPv4 multicast
  * memberships, clear any recorded source state on each, and release the
  * in_multi references of groups that are in the LEAVING state.
  * Interfaces in v1/v2 mode are left untouched.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  *
  * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
  * XXX This is also bitten by unlocked ifma_protospec access.
  */
 void
 igmp_ifdetach(struct ifnet *ifp)
 {
 	struct igmp_ifinfo	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
 	    ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	if (igi->igi_version == IGMP_VERSION_3) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			/* Skip non-IPv4 entries and those without IGMP state. */
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 #if 0
 			KASSERT(ifma->ifma_protospec != NULL,
 			    ("%s: ifma_protospec is NULL", __func__));
 #endif
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			/*
 			 * Queue leaving groups for release; the release
 			 * itself is deferred until the address lock has
 			 * been dropped below.
 			 */
 			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 			inm_clear_recorded(inm);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		/*
 		 * Free the in_multi reference(s) for this IGMP lifecycle.
 		 */
 		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
 		    tinm) {
 			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 			inm_release_locked(inm);
 		}
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  *
  * Releases the per-interface IGMP state belonging to ifp while holding
  * the IGMP lock.  The record is located by igi_delete_locked() from the
  * ifnet pointer.
  */
 void
 igmp_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 	igi_delete_locked(ifp);
 	IGMP_UNLOCK();
 }
 
 /*
  * Find the igmp_ifinfo bound to ifp on the per-vnet list, drain its
  * deferred general-query queue, unlink it and free it.  The IGMP lock
  * must be held.
  *
  * On INVARIANTS kernels, failing to find a record for ifp panics.
  */
 static void
 igi_delete_locked(const struct ifnet *ifp)
 {
 	struct igmp_ifinfo *cur, *next;
 
 	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(cur, &V_igi_head, igi_link, next) {
 		if (cur->igi_ifp != ifp)
 			continue;
 
 		/* Free deferred General Query responses. */
 		_IF_DRAIN(&cur->igi_gq);
 
 		LIST_REMOVE(cur, igi_link);
 
 		/* All deferred in_multi releases must already be done. */
 		KASSERT(SLIST_EMPTY(&cur->igi_relinmhead),
 		    ("%s: there are dangling in_multi references",
 		    __func__));
 
 		free(cur, M_IGMP);
 		return;
 	}
 
 #ifdef INVARIANTS
 	panic("%s: igmp_ifinfo not found for ifp %p\n", __func__,  ifp);
 #endif
 }
 
 /*
  * Process a received IGMPv1 query.
  * Return non-zero if the message should be dropped.
  * NOTE: every return path in this function currently returns 0.
  *
  * A valid query switches the interface to IGMPv1 compatibility mode and
  * starts a randomised report timer on every eligible IPv4 group joined
  * on ifp whose timer is not already running.
  *
  * VIMAGE: The curvnet pointer is derived from the input ifp.
  */
 static int
 igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifinfo	*igi;
 	struct in_multi		*inm;
 
 	/*
 	 * IGMPv1 Host Membership Queries SHOULD always be addressed to
 	 * 224.0.0.1. They are always treated as General Queries.
 	 * igmp_group is always ignored. Do not drop it as a userland
 	 * daemon may wish to see it.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
 		IGMPSTAT_INC(igps_rcv_badqueries);
 		return (0);
 	}
 	IGMPSTAT_INC(igps_rcv_gen_queries);
 
 	/* Lock order: IN_MULTI_LOCK, then IGMP_LOCK, then IF_ADDR_LOCK. */
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Switch to IGMPv1 host compatibility mode.
 	 */
 	igmp_set_version(igi, IGMP_VERSION_1);
 
 	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/*
 	 * Start the timers in all of our group records
 	 * for the interface on which the query arrived,
 	 * except those which are already running.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/* Skip non-IPv4 entries and those without IGMP state. */
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		/* A non-zero timer is already running; leave it alone. */
 		if (inm->inm_timer != 0)
 			continue;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			/* No report is scheduled for these states. */
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/* Become the reporter with a randomised delay. */
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(
 			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 			V_current_state_timers_running = 1;
 			break;
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 general or group-specific query.
  *
  * A query with a null group address is a General Query and is honoured
  * only when addressed to the all-hosts group; any other group address
  * makes it a Group-Specific Query.  Queries are ignored on loopback
  * interfaces and while the interface is in IGMPv1 compatibility mode.
  * Every return path in this function returns 0.
  */
 static int
 igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifinfo	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 
 	is_general_query = 0;
 
 	/*
 	 * Validate address fields upfront.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (in_nullhost(igmp->igmp_group)) {
 		/*
 		 * IGMPv2 General Query.
 		 * If this was not sent to the all-hosts group, ignore it.
 		 */
 		if (!in_allhosts(ip->ip_dst))
 			return (0);
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		is_general_query = 1;
 	} else {
 		/* IGMPv2 Group-Specific Query. */
 		IGMPSTAT_INC(igps_rcv_group_queries);
 	}
 
 	/* Lock order: IN_MULTI_LOCK, then IGMP_LOCK, then IF_ADDR_LOCK. */
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Ignore v2 query if in v1 Compatibility Mode.
 	 */
 	if (igi->igi_version == IGMP_VERSION_1)
 		goto out_locked;
 
 	igmp_set_version(igi, IGMP_VERSION_2);
 
 	/*
 	 * Scale the advertised response code by PR_FASTHZ /
 	 * IGMP_TIMER_SCALE; never use a delay of zero.
 	 */
 	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			/* Skip non-IPv4 entries and those without IGMP state. */
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			igmp_v2_update_group(inm, timer);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	} else {
 		/*
 		 * Group-specific IGMPv2 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = inm_lookup(ifp, igmp->igmp_group);
 		if (inm != NULL) {
 			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			igmp_v2_update_group(inm, timer);
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an IGMPv2 query.
  *
  * If we are becoming the reporting member for this group, start the timer.
  * If we already are the reporting member for this group and a timer is
  * running that does not exceed the new value, leave it alone; otherwise
  * reset it.
  *
  * We may be updating the group for the first time since we switched
  * to IGMPv3. If we are, then we must clear any recorded source lists,
  * and transition to REPORTING state; the group timer is overloaded
  * for group and group-source query responses.
  *
  * Unlike IGMPv3, the delay per group should be jittered
  * to avoid bursts of IGMPv2 reports.
  *
  * inm:   group to update; the IN_MULTI lock must be held.
  * timer: upper bound handed to IGMP_RANDOM_DELAY() for the new delay.
  */
 static void
 igmp_v2_update_group(struct in_multi *inm, const int timer)
 {
 
 	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
 	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
 
 	IN_MULTI_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		/* A running timer no later than the new one is kept. */
 		if (inm->inm_timer != 0 &&
 		    inm->inm_timer <= timer) {
 			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		break;
 	case IGMP_SLEEPING_MEMBER:
 		/* Only the state changes here; no timer is started. */
 		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
 		inm->inm_state = IGMP_AWAKENING_MEMBER;
 		break;
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Process a received IGMPv3 general, group-specific or
  * group-and-source-specific query.
  * Assumes m has already been pulled up to the full IGMP message length.
  * Return 0 if successful, otherwise an appropriate error code is returned.
  * NOTE: every return path in this function currently returns 0.
  *
  * The query's robustness, query-interval and max-response values are
  * decoded first; they are stored in the interface state only once the
  * query has been accepted (not on loopback, interface in v3 mode).
  */
 static int
 igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
     /*const*/ struct igmpv3 *igmpv3)
 {
 	struct igmp_ifinfo	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint32_t		 maxresp, nsrc, qqi;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 
 	is_general_query = 0;
 
 	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/* Codes of 128 and above are expanded as mantissa << (exp + 3). */
 	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
 	if (maxresp >= 128) {
 		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
 			  (IGMP_EXP(igmpv3->igmp_code) + 3);
 	}
 
 	/*
 	 * Robustness must never be less than 2 for on-wire IGMPv3.
 	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
 	 * an exception for interfaces whose IGMPv3 state changes
 	 * are redirected to loopback (e.g. MANET).
 	 */
 	qrv = IGMP_QRV(igmpv3->igmp_misc);
 	if (qrv < 2) {
 		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
 		    qrv, IGMP_RV_INIT);
 		qrv = IGMP_RV_INIT;
 	}
 
 	/* The query interval code uses the same expansion as maxresp. */
 	qqi = igmpv3->igmp_qqi;
 	if (qqi >= 128) {
 		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
 		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
 	}
 
 	/*
 	 * Scale the response time by PR_FASTHZ / IGMP_TIMER_SCALE;
 	 * never use a delay of zero.
 	 */
 	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Validate address fields and versions upfront before
 	 * accepting v3 query.
 	 * XXX SMPng: Unlocked access to igmpstat counters here.
 	 */
 	if (in_nullhost(igmpv3->igmp_group)) {
 		/*
 		 * IGMPv3 General Query.
 		 *
 		 * General Queries SHOULD be directed to 224.0.0.1.
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
 			IGMPSTAT_INC(igps_rcv_badqueries);
 			return (0);
 		}
 		is_general_query = 1;
 	} else {
 		/* Group or group-source specific query. */
 		if (nsrc == 0)
 			IGMPSTAT_INC(igps_rcv_group_queries);
 		else
 			IGMPSTAT_INC(igps_rcv_gsr_queries);
 	}
 
 	/* Lock order: IN_MULTI_LOCK, then IGMP_LOCK. */
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Discard the v3 query if we're in Compatibility Mode.
 	 * The RFC is not obviously worded that hosts need to stay in
 	 * compatibility mode until the Old Version Querier Present
 	 * timer expires.
 	 */
 	if (igi->igi_version != IGMP_VERSION_3) {
 		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
 		    igi->igi_version, ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/* Query accepted: record the querier's parameters. */
 	igmp_set_version(igi, IGMP_VERSION_3);
 	igi->igi_rv = qrv;
 	igi->igi_qi = qqi;
 	igi->igi_qri = maxresp;
 
 	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
 	    maxresp);
 
 	if (is_general_query) {
 		/*
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
 			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
 			V_interface_timers_running = 1;
 		}
 	} else {
 		/*
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		inm = inm_lookup(ifp, igmpv3->igmp_group);
 		if (inm == NULL)
 			goto out_locked;
 		if (nsrc > 0) {
 			/* Rate-limit against the igmp_gsrdelay interval. */
 			if (!ratecheck(&inm->inm_lastgsrtv,
 			    &V_igmp_gsrdelay)) {
 				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
 				    __func__);
 				IGMPSTAT_INC(igps_drop_gsr_queries);
 				goto out_locked;
 			}
 		}
 		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
 		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
 			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a recieved IGMPv3 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occured. Currently this is ignored.
  */
 static int
 igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
     int timer, /*const*/ struct igmpv3 *igmpv3)
 {
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	retval = 0;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
 			inm_clear_recorded(inm);
 			timer = min(inm->inm_timer, timer);
 		}
 		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->inm_timer, timer);
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 * FIXME: Handling source lists larger than 1 mbuf requires that
 	 * we pass the mbuf chain pointer down to this function, and use
 	 * m_getptr() to walk the chain.
 	 */
 	if (inm->inm_nsrc > 0) {
 		const struct in_addr	*ap;
 		int			 i, nrecorded;
 
 		ap = (const struct in_addr *)(igmpv3 + 1);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++, ap++) {
 			retval = inm_record_source(inm, ap->s_addr);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_IGMPV3,
 			    "%s: schedule response to SG query", __func__);
 			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 			V_current_state_timers_running = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received IGMPv1 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		IFP_TO_IA(ifp, ia);
 		if (ia != NULL) {
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, stop our group timer and transition to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifinfo *igi;
 
 		igi = inm->inm_igi;
 		if (igi == NULL) {
 			KASSERT(igi != NULL,
 			    ("%s: no igi for ifp %p", __func__, ifp));
 			goto out_locked;
 		}
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_SLEEPING_MEMBER:
 			inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_REPORTING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			if (igi->igi_version == IGMP_VERSION_1)
 				inm->inm_state = IGMP_LAZY_MEMBER;
 			else if (igi->igi_version == IGMP_VERSION_2)
 				inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	/*
 	 * Make sure we don't hear our own membership report.  Fast
 	 * leave requires knowing that we are the only member of a
 	 * group.
 	 */
 	IFP_TO_IA(ifp, ia);
 	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
 		ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		if (ia != NULL)
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 
 	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv2 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifinfo *igi;
 
 		igi = inm->inm_igi;
 		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_LAZY_MEMBER:
 			inm->inm_state = IGMP_LAZY_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 int
 igmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	int iphlen;
 	struct ifnet *ifp;
 	struct igmp *igmp;
 	struct ip *ip;
 	struct mbuf *m;
 	int igmplen;
 	int minlen;
 	int queryver;
 
 	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp);
 
 	m = *mp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 
 	IGMPSTAT_INC(igps_rcv_total);
 
 	ip = mtod(m, struct ip *);
 	iphlen = *offp;
 	igmplen = ntohs(ip->ip_len) - iphlen;
 
 	/*
 	 * Validate lengths.
 	 */
 	if (igmplen < IGMP_MINLEN) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Always pullup to the minimum size for v1/v2 or v3
 	 * to amortize calls to m_pullup().
 	 */
 	minlen = iphlen;
 	if (igmplen >= IGMP_V3_QUERY_MINLEN)
 		minlen += IGMP_V3_QUERY_MINLEN;
 	else
 		minlen += IGMP_MINLEN;
 	if ((!M_WRITABLE(m) || m->m_len < minlen) &&
 	    (m = m_pullup(m, minlen)) == 0) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Validate checksum.
 	 */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	igmp = mtod(m, struct igmp *);
 	if (in_cksum(m, igmplen)) {
 		IGMPSTAT_INC(igps_rcv_badsum);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	/*
 	 * IGMP control traffic is link-scope, and must have a TTL of 1.
 	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
 	 * probe packets may come from beyond the LAN.
 	 */
 	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
 		IGMPSTAT_INC(igps_rcv_badttl);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	switch (igmp->igmp_type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
 		if (igmplen == IGMP_MINLEN) {
 			if (igmp->igmp_code == 0)
 				queryver = IGMP_VERSION_1;
 			else
 				queryver = IGMP_VERSION_2;
 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
 			queryver = IGMP_VERSION_3;
 		} else {
 			IGMPSTAT_INC(igps_rcv_tooshort);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		switch (queryver) {
 		case IGMP_VERSION_1:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v1enable)
 				break;
 			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_2:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v2enable)
 				break;
 			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_3: {
 				struct igmpv3 *igmpv3;
 				uint16_t igmpv3len;
 				uint16_t srclen;
 				int nsrc;
 
 				IGMPSTAT_INC(igps_rcv_v3_queries);
 				igmpv3 = (struct igmpv3 *)igmp;
 				/*
 				 * Validate length based on source count.
 				 */
 				nsrc = ntohs(igmpv3->igmp_numsrc);
 				srclen = sizeof(struct in_addr) * nsrc;
 				if (nsrc * sizeof(in_addr_t) > srclen) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				/*
 				 * m_pullup() may modify m, so pullup in
 				 * this scope.
 				 */
 				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
 				    srclen;
 				if ((!M_WRITABLE(m) ||
 				     m->m_len < igmpv3len) &&
 				    (m = m_pullup(m, igmpv3len)) == NULL) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
 				    + iphlen);
 				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
 					m_freem(m);
 					return (IPPROTO_DONE);
 				}
 			}
 			break;
 		}
 		break;
 
 	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v1enable)
 			break;
 		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v2enable)
 			break;
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
 		/*
 		 * Hosts do not need to process IGMPv3 membership reports,
 		 * as report suppression is no longer required.
 		 */
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		break;
 
 	default:
 		break;
 	}
 
 	/*
 	 * Pass all valid IGMP packets up to any process(es) listening on a
 	 * raw IGMP socket.
 	 */
 	*mp = m;
 	return (rip_input(mp, offp, proto));
 }
 
 
 /*
  * Fast timeout handler (global).
  * VIMAGE: Timeout handlers are expected to service all vimages.
  *
  * Iterates over every vnet with the vnet list read-locked and runs the
  * per-vnet fast timeout handler once for each, with that vnet set as
  * the current one for the duration of the call.
  */
 void
 igmp_fasttimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		/* igmp_fasttimo_vnet() relies on curvnet being set up. */
 		CURVNET_SET(vnet_iter);
 		igmp_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Fast timeout handler (per-vnet).
  * Sends are shuffled off to a netisr to deal with Giant.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  *
  * Runs in two phases under the IN_MULTI and IGMP locks: first the
  * per-interface IGMPv3 general query response timers, then the
  * per-group report and state-change timers.  The three global
  * "timers running" flags are cleared before the corresponding walk and
  * set again by whichever timer is still counting down, so they reflect
  * whether the next tick has any work to do.
  */
 static void
 igmp_fasttimo_vnet(void)
 {
 	struct ifqueue		 scq;	/* State-change packets */
 	struct ifqueue		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct igmp_ifinfo	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm;
 	int			 loop, uri_fasthz;
 
 	loop = 0;
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running &&
 	    !V_interface_timers_running &&
 	    !V_state_change_timers_running)
 		return;
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	/*
 	 * IGMPv3 General Query response timer processing.
 	 */
 	if (V_interface_timers_running) {
 		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
 
 		/* Cleared here; re-armed below by any timer still pending. */
 		V_interface_timers_running = 0;
 		LIST_FOREACH(igi, &V_igi_head, igi_link) {
 			if (igi->igi_v3_timer == 0) {
 				/* Do nothing. */
 			} else if (--igi->igi_v3_timer == 0) {
 				/* Timer just expired: answer the query. */
 				igmp_v3_dispatch_general_query(igi);
 			} else {
 				V_interface_timers_running = 1;
 			}
 		}
 	}
 
 	/* Skip the per-group walk when no group timer is pending. */
 	if (!V_current_state_timers_running &&
 	    !V_state_change_timers_running)
 		goto out_locked;
 
 	/*
 	 * Cleared here; the per-group timer routines called below set
 	 * these again for any timer that has not yet expired.
 	 */
 	V_current_state_timers_running = 0;
 	V_state_change_timers_running = 0;
 
 	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
 
 	/*
 	 * IGMPv1/v2/v3 host report and state-change timer processing.
 	 * Note: Processing a v3 group timer may remove a node.
 	 */
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		ifp = igi->igi_ifp;
 
 		/*
 		 * The local output queues and the retransmit interval are
 		 * only prepared, and only used, for interfaces in v3 mode.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
 			    PR_FASTHZ);
 
 			memset(&qrq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);
 
 			memset(&scq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		/* Tick the timers of every IPv4 membership on this link. */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			switch (igi->igi_version) {
 			case IGMP_VERSION_1:
 			case IGMP_VERSION_2:
 				igmp_v1v2_process_group_timer(inm,
 				    igi->igi_version);
 				break;
 			case IGMP_VERSION_3:
 				igmp_v3_process_group_timers(igi, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			struct in_multi		*tinm;
 
 			/* Send whatever the group walk queued up. */
 			igmp_dispatch_queue(&qrq, 0, loop);
 			igmp_dispatch_queue(&scq, 0, loop);
 
 			/*
 			 * Free the in_multi reference(s) for this
 			 * IGMP lifecycle.
 			 * Each pass unlinks the list head, which is the
 			 * element currently being visited, and then
 			 * releases it; the release was deferred until
 			 * the membership walk above had finished.
 			 */
 			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
 			    inm_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
 				    inm_nrele);
 				inm_release_locked(inm);
 			}
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Tick the host report timer of one group in IGMPv1/v2 mode.
  *
  * A timer that is still counting down after the decrement re-arms the
  * global current-state flag.  A timer that expires on this tick while
  * the group is in the reporting state causes a membership report of
  * the given version to be queued and the group to become idle.
  */
 static void
 igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
 {
 	int expired, type;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	expired = 0;
 	if (inm->inm_timer != 0) {
 		if (--inm->inm_timer != 0) {
 			/* Still pending; make sure fasttimo keeps running. */
 			V_current_state_timers_running = 1;
 			return;
 		}
 		expired = 1;
 	}
 
 	/* Only an expired timer on a reporting member sends anything. */
 	if (inm->inm_state != IGMP_REPORTING_MEMBER || !expired)
 		return;
 
 	inm->inm_state = IGMP_IDLE_MEMBER;
 	if (version == IGMP_VERSION_2)
 		type = IGMP_v2_HOST_MEMBERSHIP_REPORT;
 	else
 		type = IGMP_v1_HOST_MEMBERSHIP_REPORT;
 	(void)igmp_v1v2_queue_report(inm, type);
 }
 
 /*
  * Update a group's timers for IGMPv3.
  * Will update the global pending timer flags.
  * Note: Unlocked read from igi.
  *
  * igi:        interface state; its release list receives memberships
  *             whose final leave has completed.
  * qrq:        queue which receives query response records.
  * scq:        queue which receives state-change reports.
  * inm:        the membership whose timers are ticked.
  * uri_fasthz: value the state-change timer is reloaded with when more
  *             retransmissions are pending.
  */
 static void
 igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
     struct ifqueue *qrq, struct ifqueue *scq,
     struct in_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from v1/v2 compatibility mode back to v3,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	/* Tick the query response timer. */
 	if (inm->inm_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 	}
 
 	/* Tick the state-change retransmission timer. */
 	if (inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 		/* Nothing is sent for a membership in these states. */
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval;
 
 			/*
 			 * The last argument is non-zero only when the
 			 * pending query was group-and-source-specific.
 			 * The result is traced but otherwise unused.
 			 */
 			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			/* XXX Clear recorded sources for next time. */
 			inm_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->inm_scrv > 0) {
 				inm->inm_sctimer = uri_fasthz;
 				V_state_change_timers_running = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)igmp_v3_merge_state_changes(inm, scq);
 
 			inm_commit(inm);
 			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release IGMP's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
 			    inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 		}
 		break;
 	}
 }
 
 
 /*
  * Suppress a group's pending response to a group or source/group query.
  *
  * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
  * Do NOT update ST1/ST0 as this operation merely suppresses
  * the currently pending group record.
  * Do NOT suppress the response to a general query. It is possible but
  * it would require adding another state or flag.
  *
  * A membership which has no group or group-and-source query response
  * pending is left untouched.  Otherwise any recorded sources are
  * discarded, the group timer is stopped and the membership returns to
  * the reporting state.
  */
 static void
 igmp_v3_suppress_group_record(struct in_multi *inm)
 {
 
 	IN_MULTI_LOCK_ASSERT();
 
 	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
 		("%s: not IGMPv3 mode on link", __func__));
 
 	/*
 	 * Bail out unless a query response is pending.  The two tests
 	 * must be joined with &&: a state cannot equal both constants at
 	 * once, so joining them with || is true for every state and the
 	 * code below would never be reached.
 	 */
 	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
 	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
 		return;
 
 	/* Only a group-and-source response has sources recorded for it. */
 	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 		inm_clear_recorded(inm);
 
 	inm->inm_timer = 0;
 	inm->inm_state = IGMP_REPORTING_MEMBER;
 }
 
 /*
  * Move the given interface to a different IGMP version, as described
  * in Section 7.2.1.
  *
  * Asking for v1 or v2 (re)starts the matching "Older Version Querier
  * Present" timer and stops the other one.  The interface's operating
  * version is then derived from whichever timer is running, and the v3
  * link timers are cancelled if that changes the version.
  */
 static void
 igmp_set_version(struct igmp_ifinfo *igi, const int version)
 {
 	int ovqp_timer;
 
 	IGMP_LOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	switch (version) {
 	case IGMP_VERSION_1:
 	case IGMP_VERSION_2:
 		/*
 		 * The "Older Version Querier Present" interval of
 		 * Section 8.12, expressed in slow timeout ticks.
 		 */
 		ovqp_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
 		ovqp_timer *= PR_SLOWHZ;
 
 		/* Exactly one of the two timers runs afterwards. */
 		igi->igi_v1_timer =
 		    (version == IGMP_VERSION_1) ? ovqp_timer : 0;
 		igi->igi_v2_timer =
 		    (version == IGMP_VERSION_2) ? ovqp_timer : 0;
 		break;
 	default:
 		break;
 	}
 
 	/* A running v1 timer takes precedence over a running v2 timer. */
 	if (igi->igi_v1_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_1) {
 			igi->igi_version = IGMP_VERSION_1;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_2) {
 			igi->igi_version = IGMP_VERSION_2;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	}
 }
 
 /*
  * Cancel pending IGMPv3 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Only ever called on a transition from v3 to Compatibility mode. Kill
  * the timers stone dead (this may be expensive for large N groups), they
  * will be restarted if Compatibility Mode deems that they must be due to
  * query processing.
  *
  * Both the IN_MULTI and IGMP locks are asserted on entry.
  */
 static void
 igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
 	    igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * Stop the v3 General Query Response on this link stone dead.
 	 * If fasttimo is woken up due to V_interface_timers_running,
 	 * the flag will be cleared if there are no pending link timers.
 	 */
 	igi->igi_v3_timer = 0;
 
 	/*
 	 * Now clear the current-state and state-change report timers
 	 * for all memberships scoped to this link.
 	 */
 	ifp = igi->igi_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/* Only IPv4 memberships with protocol state attached. */
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/*
 			 * These states are either not relevant in v3 mode,
 			 * or are unreported. Do nothing.
 			 */
 			break;
 		case IGMP_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching to
 			 * compatibility mode, we need to release the final
 			 * reference held for issuing the INCLUDE {}, and
 			 * transition to REPORTING to ensure the host leave
 			 * message is sent upstream to the old querier --
 			 * transition to NOT would lose the leave and race.
 			 */
 			/* Queued here; released after the walk, below. */
 			SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele);
 			/* FALLTHROUGH */
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 			inm_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case IGMP_REPORTING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			break;
 		}
 		/*
 		 * Always clear state-change and group report timers.
 		 * Free any pending IGMPv3 state-change records.
 		 */
 		inm->inm_sctimer = 0;
 		inm->inm_timer = 0;
 		_IF_DRAIN(&inm->inm_scq);
 	}
 	IF_ADDR_RUNLOCK(ifp);
 	/*
 	 * Drop the references queued above.  Each pass unlinks the list
 	 * head, which is the element currently being visited, and then
 	 * releases it.
 	 */
 	SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 		inm_release_locked(inm);
 	}
 }
 
 /*
  * Tick the Older Version Querier Present timers of one link and adjust
  * the link's operating IGMP version to match.
  * See Section 7.2.1 of RFC 3376.
  */
 static void
 igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
 {
 
 	IGMP_LOCK_ASSERT();
 
 	if (igi->igi_v1_timer > 0) {
 		/*
 		 * An IGMPv1 querier is present.  Keep counting down while
 		 * IGMPv1 is enabled; if it has been disabled since the
 		 * last timeout, stop the timer and revert to IGMPv3.
 		 */
 		if (V_igmp_v1enable) {
 			--igi->igi_v1_timer;
 		} else {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v1_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		}
 		/* The IGMPv2 timer does not run alongside the v1 timer. */
 		if (igi->igi_v2_timer > 0) {
 			CTR3(KTR_IGMPV3,
 			    "%s: cancel v2 timer on %p(%s)",
 			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 		}
 	} else if (igi->igi_v1_timer == 0) {
 		if (igi->igi_v2_timer == 0) {
 			/*
 			 * Neither older-version querier is present any
 			 * more: revert to IGMPv3.
 			 */
 			if (igi->igi_version != IGMP_VERSION_3) {
 				CTR5(KTR_IGMPV3,
 				    "%s: transition from v%d -> v%d on %p(%s)",
 				    __func__, igi->igi_version, IGMP_VERSION_3,
 				    igi->igi_ifp, igi->igi_ifp->if_xname);
 				igi->igi_version = IGMP_VERSION_3;
 			}
 		} else if (igi->igi_v2_timer > 0) {
 			/*
 			 * Only the IGMPv2 querier timer is running.
 			 * While IGMPv2 is enabled keep counting down and
 			 * make sure the link operates in IGMPv2 mode; if
 			 * it has been disabled since the last timeout,
 			 * stop the timer and revert to IGMPv3.
 			 */
 			if (V_igmp_v2enable) {
 				--igi->igi_v2_timer;
 				if (igi->igi_version != IGMP_VERSION_2) {
 					CTR5(KTR_IGMPV3,
 					    "%s: transition from v%d -> v%d on %p(%s)",
 					    __func__, igi->igi_version,
 					    IGMP_VERSION_2, igi->igi_ifp,
 					    igi->igi_ifp->if_xname);
 					igi->igi_version = IGMP_VERSION_2;
 					igmp_v3_cancel_link_timers(igi);
 				}
 			} else {
 				CTR5(KTR_IGMPV3,
 				    "%s: transition from v%d -> v%d on %p(%s)",
 				    __func__, igi->igi_version, IGMP_VERSION_3,
 				    igi->igi_ifp, igi->igi_ifp->if_xname);
 				igi->igi_v2_timer = 0;
 				igi->igi_version = IGMP_VERSION_3;
 			}
 		}
 	}
 }
 
 /*
  * Global slowtimo handler.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  *
  * Iterates over every vnet with the vnet list read-locked and runs the
  * per-vnet slow timeout handler once for each, with that vnet set as
  * the current one for the duration of the call.
  */
 void
 igmp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		/* Make this vnet current while its handler runs. */
 		CURVNET_SET(vnet_iter);
 		igmp_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 igmp_slowtimo_vnet(void)
 {
 	struct igmp_ifinfo *igi;
 
 	IGMP_LOCK();
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		igmp_v1v2_process_querier_timers(igi);
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Dispatch an IGMPv1/v2 host report or leave message.
  * These are always small enough to fit inside a single mbuf.
  *
  * inm:  the membership the message is about.
  * type: the IGMP message type to put in the header.
  *
  * Returns ENOMEM if no mbuf could be obtained, otherwise 0.
  */
 static int
 igmp_v1v2_queue_report(struct in_multi *inm, const int type)
 {
 	struct ifnet		*ifp;
 	struct igmp		*igmp;
 	struct ip		*ip;
 	struct mbuf		*m;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	ifp = inm->inm_ifp;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOMEM);
-	MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
+	M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
 
 	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
 
 	/*
 	 * Step over the space reserved for the IP header so that the
 	 * IGMP header can be built, and checksummed, on its own.
 	 */
 	m->m_data += sizeof(struct ip);
 	m->m_len = sizeof(struct igmp);
 
 	igmp = mtod(m, struct igmp *);
 	igmp->igmp_type = type;
 	igmp->igmp_code = 0;
 	igmp->igmp_group = inm->inm_addr;
 	/* The checksum field must be zero while the sum is computed. */
 	igmp->igmp_cksum = 0;
 	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
 
 	/* Step back so that the mbuf covers the IP header as well. */
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	/*
 	 * Only part of the IP header is filled in here.
 	 * NOTE(review): the remaining fields are presumably completed
 	 * on the output side of the NETISR_IGMP handler -- confirm.
 	 */
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = 0;
 	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
 	ip->ip_off = 0;
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	/*
 	 * A leave message is addressed to the all-routers group;
 	 * anything else is addressed to the group being reported.
 	 */
 	if (type == IGMP_HOST_LEAVE_MESSAGE)
 		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
 	else
 		ip->ip_dst = inm->inm_addr;
 
 	igmp_save_context(m, ifp);
 
 	/* Tag the packet as v1/v2 traffic, looped back if so configured. */
 	m->m_flags |= M_IGMPV2;
 	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
 		m->m_flags |= M_IGMP_LOOP;
 
 	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
 	netisr_dispatch(NETISR_IGMP, m);
 
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv4 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to the group
  * state; it is now up to IGMP to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the IGMPv3 state machine at group level. The IGMP module
  * however makes the decision as to which IGMP protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
  * save ourselves a bunch of work; any exclusive mode groups need not
  * compute source filter lists.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  *
  * Returns the result of the initial-join or state-change handler, or 0
  * for a final leave.
  */
 int
 igmp_change_state(struct in_multi *inm)
 {
 	struct igmp_ifinfo *igi;
 	struct ifnet *ifp;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 
 	/*
 	 * Try to catch a request for an interface which has gone away,
 	 * and check that netinet and net agree on which ifp this is.
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (inm->inm_st[1].iss_fmode == inm->inm_st[0].iss_fmode) {
 		/* Same filter mode at T0 and T1: only the sources changed. */
 		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
 		error = igmp_handle_state_change(inm, igi);
 	} else {
 		/*
 		 * A filter mode transition to or from MCAST_UNDEFINED
 		 * starts or finishes an IGMP life cycle for this group;
 		 * any other transition is an ordinary state change.
 		 */
 		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
 		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
 		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
 			error = igmp_initial_join(inm, igi);
 		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
 			igmp_final_leave(inm, igi);
 		} else {
 			error = igmp_handle_state_change(inm, igi);
 		}
 	}
 
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an IGMP group.
  *
  * When joining a group:
  *  If the group should have its IGMP traffic suppressed, do nothing.
  *  IGMPv1 starts sending IGMPv1 host membership reports.
  *  IGMPv2 starts sending IGMPv2 host membership reports.
  *  IGMPv3 will schedule an IGMPv3 state-change report containing the
  *  initial state of the membership.
  */
 static int
 igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
 {
 	struct ifnet		*ifp;
 	struct ifqueue		*ifq;
 	int			 error, retval, syncstates;
 
 	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
 	 * are never reported in any IGMP protocol exchanges.
 	 * All other groups enter the appropriate IGMP state machine
 	 * for the version in use on this link.
 	 * A link marked as IGIF_SILENT causes IGMP to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr)) {
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		inm->inm_state = IGMP_SILENT_MEMBER;
 		inm->inm_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3 &&
 		    inm->inm_state == IGMP_LEAVING_MEMBER)
 			inm_release_locked(inm);
 
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 
 		switch (igi->igi_version) {
 		case IGMP_VERSION_1:
 		case IGMP_VERSION_2:
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			error = igmp_v1v2_queue_report(inm,
 			    (igi->igi_version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 			if (error == 0) {
 				inm->inm_timer = IGMP_RANDOM_DELAY(
 				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 				V_current_state_timers_running = 1;
 			}
 			break;
 
 		case IGMP_VERSION_3:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			ifq = &inm->inm_scq;
 			_IF_DRAIN(ifq);
 			retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
 			    0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next igmp_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 */
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				KASSERT(igi->igi_rv > 1,
 				   ("%s: invalid robustness %d", __func__,
 				    igi->igi_rv));
 				inm->inm_scrv = igi->igi_rv;
 			}
 			inm->inm_sctimer = 1;
 			V_state_change_timers_running = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the IGMP life-cycle.
  */
 static int
 igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
 {
 	struct ifnet		*ifp;
 	int			 nqueued, reportable;
 
 	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * A state-change report is only generated for a reported group
 	 * on a non-loopback, non-silent link which is speaking IGMPv3.
 	 */
 	reportable = !(ifp->if_flags & IFF_LOOPBACK) &&
 	    !(igi->igi_flags & IGIF_SILENT) &&
 	    igmp_isgroupreported(inm->inm_addr) &&
 	    (igi->igi_version == IGMP_VERSION_3);
 
 	if (!reportable) {
 		if (!igmp_isgroupreported(inm->inm_addr)) {
 			CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
 		/* Nothing goes on the wire, so commit T1 to T0 now. */
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	/* Discard anything still pending and queue the new record(s). */
 	_IF_DRAIN(&inm->inm_scq);
 
 	nqueued = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
 	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, nqueued);
 	if (nqueued <= 0)
 		return (-nqueued);
 
 	/*
 	 * Something was queued: set the retransmission count (one for
 	 * IGIF_LOOPBACK links, else the link's robustness variable) and
 	 * start the state-change timer for this group.
 	 */
 	if (igi->igi_flags & IGIF_LOOPBACK)
 		inm->inm_scrv = 1;
 	else
 		inm->inm_scrv = igi->igi_rv;
 	inm->inm_sctimer = 1;
 	V_state_change_timers_running = 1;
 
 	return (0);
 }
 
 /*
  * Perform the final leave for an IGMP group.
  *
  * When leaving a group:
  *  IGMPv1 does nothing.
  *  IGMPv2 sends a host leave message, if and only if we are the reporter.
  *  IGMPv3 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
 {
 	int syncstates;
 
 	/* Commit T1 to T0 on exit unless a report is left pending. */
 	syncstates = 1;
 
 	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		if (igi->igi_version == IGMP_VERSION_2) {
 #ifdef INVARIANTS
 			/* Query-pending states are only entered under IGMPv3. */
 			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
 				panic("%s: IGMPv3 state reached, not IGMPv3 mode",
 				     __func__);
 			}
 #endif
 			/*
 			 * Best effort: the result of queueing the leave
 			 * message is deliberately not checked.
 			 */
 			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
 			inm->inm_state = IGMP_NOT_MEMBER;
 		} else if (igi->igi_version == IGMP_VERSION_3) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			_IF_DRAIN(&inm->inm_scq);
 			inm->inm_timer = 0;
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				inm->inm_scrv = igi->igi_rv;
 			}
 			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    inet_ntoa(inm->inm_addr),
 			    inm->inm_ifp->if_xname, inm->inm_scrv);
 			if (inm->inm_scrv == 0) {
 				/* No transmissions requested; leave at once. */
 				inm->inm_state = IGMP_NOT_MEMBER;
 				inm->inm_sctimer = 0;
 			} else {
 				int retval;
 
 				/*
 				 * Hold a reference on the group while the
 				 * final record is pending transmission.
 				 */
 				inm_acquire_locked(inm);
 
 				retval = igmp_v3_enqueue_group_record(
 				    &inm->inm_scq, inm, 1, 0, 0);
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 				/*
 				 * retval is only examined by the assertion
 				 * above; keep it referenced so that kernels
 				 * built without INVARIANTS do not warn about
 				 * a variable which is set but never used.
 				 */
 				(void)retval;
 
 				inm->inm_state = IGMP_LEAVING_MEMBER;
 				inm->inm_sctimer = 1;
 				V_state_change_timers_running = 1;
 				/* T0 is updated after the report is sent. */
 				syncstates = 0;
 			}
 		}
 		/* Any other version (IGMPv1) sends nothing on leave. */
 		break;
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		/* After the commit, force the T1 mode back to undefined. */
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 }
 
 /*
  * Enqueue an IGMPv3 group record to the given output queue.
  *
  * XXX This function could do with having the allocation code
  * split out, and the multiple-tree-walks coalesced into a single
  * routine as has been done in igmp_v3_enqueue_filter_change().
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IP/IGMP header to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query)
 {
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ifnet		*ifp;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 error, is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	in_addr_t		 naddr;
 	uint8_t			 mode;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 	ifp = inm->inm_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;		/* running total returned to the caller */
 	nims = NULL;		/* next source node to visit */
 	record_has_sources = 1;
 	pig = NULL;
 	type = IGMP_DO_NOTHING;
 	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
 	    inm->inm_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 */
 		if (mode != inm->inm_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_INCLUDE_MODE;
 				if (mode == MCAST_UNDEFINED)
 					record_has_sources = 0;
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = IGMP_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = IGMP_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = IGMP_MODE_IS_INCLUDE;
 			KASSERT(inm->inm_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->inm_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (igmp_v3_enqueue_filter_change(ifq, inm));
 
 	if (type == IGMP_DO_NOTHING) {
 		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr),
 		    inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct igmp_grouprec);
 	if (record_has_sources)
 		minrec0len += sizeof(in_addr_t);
 
 	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
 	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
 	    inm->inm_ifp->if_xname);
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = ifq->ifq_tail;
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		m = m0;
 		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
 	} else {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		/* A cluster is only tried first for current-state records. */
 		if (!is_state_change && !is_group_query) {
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			if (m)
 				m->m_data += IGMP_LEADINGSPACE;
 		}
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
-				MH_ALIGN(m, IGMP_LEADINGSPACE);
+				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 
 		igmp_save_context(m, ifp);
 
 		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	ig.ig_type = type;
 	ig.ig_datalen = 0;
 	ig.ig_numsrc = 0;
 	ig.ig_group = inm->inm_addr;
 	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 		/* Only free packets we allocated; m0 is still queued. */
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct igmp_grouprec);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, do not
 	 * include source entries.
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		/*
 		 * At this point nbytes is exactly the size of the group
 		 * record header just appended, which is what the tail
 		 * offset computation below relies on.
 		 */
 		if (m == m0) {
 			md = m_last(m);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			/*
 			 * The bytes for the sources are accounted for once,
 			 * after the loop, as in the continuation-packet
 			 * loop further down. Adding them here as well would
 			 * count every source address twice in the total
 			 * returned to the caller.
 			 */
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		_IF_ENQUEUE(ifq, m);
 	} else
 		m->m_pkthdr.PH_vt.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m)
 			m->m_data += IGMP_LEADINGSPACE;
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
-				MH_ALIGN(m, IGMP_LEADINGSPACE);
+				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 		igmp_save_context(m, ifp);
 		/* Remember where this packet's group record will start. */
 		md = m_getptr(m, 0, &off);
 		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		nbytes += sizeof(struct igmp_grouprec);
 
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 
 		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
 		_IF_ENQUEUE(ifq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip_msource node.
  */
 typedef enum {
 	REC_NONE	= 0,		/* no record; pairs with MCAST_UNDEFINED */
 	REC_ALLOW	= 1 << 0,	/* pairs with MCAST_INCLUDE */
 	REC_BLOCK	= 1 << 1,	/* pairs with MCAST_EXCLUDE */
 	REC_FULL	= (1 << 0) | (1 << 1)	/* both record passes done */
 } rectype_t;
 
 /*
  * Enqueue an IGMPv3 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
 {
 	/* Smallest useful record: a group record header plus one source. */
 	static const int MINRECLEN =
 	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
 	struct ifnet		*ifp;
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	in_addr_t		 naddr;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	int			 nallow, nblock;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	/*
 	 * Nothing to report when the group has no source nodes, or when
 	 * there are ASM listeners at both t0 and t1.
 	 */
 	if (inm->inm_nsrc == 0 ||
 	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->inm_ifp;			/* interface */
 	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	npbytes = 0;	/* # of bytes appended this packet */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		/*
 		 * Inner loop: one iteration per group record emitted for
 		 * the current record type. It repeats while the tree walk
 		 * stopped early (nims != NULL) because the packet filled up.
 		 */
 		do {
 			/*
 			 * Reuse the packet at the tail of the queue if one
 			 * more record and at least one source still fit,
 			 * otherwise start a new packet.
 			 */
 			m0 = ifq->ifq_tail;
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
 			     IGMP_V3_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				CTR1(KTR_IGMPV3,
 				    "%s: use previous packet", __func__);
 			} else {
 				/* Try a cluster first, then a plain header mbuf. */
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m)
 					m->m_data += IGMP_LEADINGSPACE;
 				if (m == NULL) {
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 					if (m)
-						MH_ALIGN(m, IGMP_LEADINGSPACE);
+						M_ALIGN(m, IGMP_LEADINGSPACE);
 				}
 				if (m == NULL) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.PH_vt.vt_nrecs = 0;
 				igmp_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 				    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				npbytes = 0;
 				CTR1(KTR_IGMPV3,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the IGMP group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&ig, 0, sizeof(ig));
 			ig.ig_group = inm->inm_addr;
 			if (!m_append(m, sizeof(ig), (void *)&ig)) {
 				/* m0 is still on the queue; only free our own. */
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct igmp_grouprec);
 			if (m != m0) {
 				/* new packet; offset in chain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct igmp_grouprec), &off);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct igmp_grouprec));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL)
 				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
 			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 				CTR2(KTR_IGMPV3, "%s: visit node %s",
 				    __func__, inet_ntoa_haddr(ims->ims_haddr));
 				now = ims_get_mode(inm, ims, 1);
 				then = ims_get_mode(inm, ims, 0);
 				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				/*
 				 * Map the node's t1 mode onto a record type;
 				 * an undefined node takes the inverse of the
 				 * group's filter mode.
 				 */
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				/*
 				 * The first changed node seen fixes the
 				 * record type of the first pass; nodes of
 				 * the other type wait for the next pass.
 				 */
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				naddr = htonl(ims->ims_haddr);
 				if (!m_append(m, sizeof(in_addr_t),
 				    (void *)&naddr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_IGMPV3,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 				/* Stop once this packet has no room left. */
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct igmp_grouprec);
 				if (m != m0) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					/* Trim the header we just appended. */
 					CTR1(KTR_IGMPV3,
 					    "%s: m_adj(m, -ig)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct igmp_grouprec)));
 				}
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(in_addr_t));
 			/* Fill in the record header now the count is known. */
 			if (crt == REC_ALLOW)
 				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
 			pig->ig_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.PH_vt.vt_nrecs++;
 			if (m != m0)
 				_IF_ENQUEUE(ifq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		/* Mark this record type done and switch to the other one. */
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 /*
  * Move, or copy, the pending state-change packets queued on a group
  * (inm->inm_scq) onto the interface-wide state-change queue ifscq.
  *
  * While retransmissions remain (inm_scrv > 0) the group's packets are
  * duplicated and the originals stay queued on the group; otherwise
  * they are dequeued and handed over. A packet is concatenated onto
  * the packet at the tail of ifscq when the combined record count and
  * length allow it, otherwise it is enqueued as a packet of its own.
  *
  * Returns 0 on success, or ENOMEM if a packet could not be duplicated.
  */
 static int
 igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
 {
 	struct ifqueue	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->inm_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->inm_scq;
 #ifdef KTR
 	if (gq->ifq_head == NULL) {
 		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = gq->ifq_head;
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an IGMPv3 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatenation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = ifscq->ifq_tail;
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
 			    m->m_pkthdr.PH_vt.vt_nrecs <=
 			    IGMP_V3_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
 				domerge = 1;
 		}
 
 		/*
 		 * NOTE(review): this tests the group's own queue (gq), not
 		 * the destination queue (ifscq) that a non-merged packet is
 		 * about to be placed on; confirm which one was intended.
 		 */
 		if (!domerge && _IF_QFULL(gq)) {
 			CTR2(KTR_IGMPV3,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy) {
 				/*
 				 * When not copying, every packet ahead of m
 				 * has already been dequeued, so m is the
 				 * head of gq. Unlink it before freeing it;
 				 * freeing it in place would leave gq pointing
 				 * at freed memory and it would be freed a
 				 * second time when the queue is drained.
 				 */
 				_IF_DEQUEUE(gq, m0);
 				m_freem(m0);
 			}
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
 			_IF_DEQUEUE(gq, m0);
 			m = m0->m_nextpkt;
 		} else {
 			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)",
 			    __func__, m0, ifscq);
 			_IF_ENQUEUE(ifscq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)",
 			    __func__, m0, mt);
 
 			mtl = m_last(mt);
 			/*
 			 * Fold m0's length and record count into the tail
 			 * packet's header while m0 still carries a packet
 			 * header, then demote m0 to a plain data mbuf and
 			 * chain it on.
 			 */
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.PH_vt.vt_nrecs +=
 			    m0->m_pkthdr.PH_vt.vt_nrecs;
 			m0->m_flags &= ~M_PKTHDR;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending IGMPv3 General Query.
  */
 static void
 igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm;
 	int			 nqueued, use_loop;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi->igi_version == IGMP_VERSION_3,
 	    ("%s: called when version %d", __func__, igi->igi_version));
 
 	ifp = igi->igi_ifp;
 
 	/*
 	 * Walk the interface's multicast memberships and queue a
 	 * current-state record on the interface's general query queue
 	 * for each IPv4 group which is in a reportable state.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/* Only IPv4 memberships with an attached in_multi. */
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		KASSERT(ifp == inm->inm_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->inm_state) {
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			nqueued = igmp_v3_enqueue_group_record(&igi->igi_gq,
 			    inm, 0, 0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, nqueued);
 			break;
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			/* Nothing is queued for groups in these states. */
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/* Send at most one burst of the queued responses now. */
 	use_loop = (igi->igi_flags & IGIF_LOOPBACK) != 0;
 	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, use_loop);
 
 	/*
 	 * Anything still queued is sent by later bursts: re-arm the
 	 * interface timer with a randomised delay.
 	 */
 	if (igi->igi_gq.ifq_head != NULL) {
 		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
 		    IGMP_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running = 1;
 	}
 }
 
 /*
  * Transmit the next pending IGMP message in the output queue.
  *
  * We get called from netisr_processqueue(). A mutex private to igmpoq
  * will be acquired and released around this routine.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as IGMP traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 igmp_intr(struct mbuf *m)
 {
 	struct ip_moptions	 imo;
 	struct ifnet		*ifp;
 	struct mbuf		*ipopts, *m0;
 	int			 error;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
 	ifindex = igmp_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IPSTAT_INC(ips_noroute);
 		goto out;
 	}
 
 	/* Attach the Router Alert option only when the sysctl asks for it. */
 	ipopts = V_igmp_sendra ? m_raopt : NULL;
 
 	imo.imo_multicast_ttl  = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
 
 	/*
 	 * If the user requested that IGMP traffic be explicitly
 	 * redirected to the loopback interface (e.g. they are running a
 	 * MANET interface and the routing protocol needs to see the
 	 * updates), handle this now.
 	 */
 	if (m->m_flags & M_IGMP_LOOP)
 		imo.imo_multicast_ifp = V_loif;
 	else
 		imo.imo_multicast_ifp = ifp;
 
 	if (m->m_flags & M_IGMPV2) {
 		m0 = m;
 	} else {
 		m0 = igmp_v3_encap_report(ifp, m);
 		if (m0 == NULL) {
 			/*
 			 * The only failure in igmp_v3_encap_report() is
 			 * M_PREPEND() running out of mbufs, and M_PREPEND()
 			 * frees the chain itself when that happens.  'm' is
 			 * therefore stale here and must not be freed again;
 			 * it is only printed, never dereferenced.
 			 */
 			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
 			IPSTAT_INC(ips_odropped);
 			goto out;
 		}
 	}
 
 	igmp_scrub_context(m0);
 	/*
 	 * Clear the IGMP-private protocol flags starting at the real chain
 	 * head: after a prepend m0 may be a new mbuf in front of 'm'.
 	 */
 	m_clrprotoflags(m0);
 	m0->m_pkthdr.rcvif = V_loif;
 #ifdef MAC
 	mac_netinet_igmp_send(ifp, m0);
 #endif
 	/* Ownership of m0 passes to ip_output() here; it is not freed below. */
 	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
 	if (error) {
 		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
 		goto out;
 	}
 
 	IGMPSTAT_INC(igps_snd_reports);
 
 out:
 	/*
 	 * We must restore the existing vnet pointer before
 	 * continuing as we are run from netisr context.
 	 */
 	CURVNET_RESTORE();
 }
 
 /*
  * Encapsulate an IGMPv3 report.
  *
  * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
  * chain has already had its IP/IGMPv3 header prepended. In this case
  * the function will not attempt to prepend; the lengths and checksums
  * will however be re-computed.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  */
 static struct mbuf *
 igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct igmp_report	*igmp;
 	struct ip		*ip;
 	int			 hdrlen, igmpreclen;
 
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	/* Start from the whole chain length; corrected below if needed. */
 	igmpreclen = m_length(m, NULL);
 	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
 
 	if (m->m_flags & M_IGMPV3_HDR) {
 		/* Headers are already part of the chain: count records only. */
 		igmpreclen -= hdrlen;
 	} else {
 		/* M_PREPEND() may replace m; a NULL m means it failed. */
 		M_PREPEND(m, hdrlen, M_NOWAIT);
 		if (m == NULL)
 			return (NULL);
 		m->m_flags |= M_IGMPV3_HDR;
 	}
 
 	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
 
 	/*
 	 * Temporarily step over the IP header so that mtod() yields the
 	 * IGMP report header and in_cksum() covers only the IGMP part.
 	 */
 	m->m_data += sizeof(struct ip);
 	m->m_len -= sizeof(struct ip);
 
 	igmp = mtod(m, struct igmp_report *);
 	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
 	igmp->ir_rsv1 = 0;
 	igmp->ir_rsv2 = 0;
 	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
 	/* The checksum field must be zero while the sum is computed. */
 	igmp->ir_cksum = 0;
 	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
 	/* The record count now lives in the header; clear the scratch copy. */
 	m->m_pkthdr.PH_vt.vt_nrecs = 0;
 
 	/* Undo the adjustment so the chain starts at the IP header again. */
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
 	ip->ip_len = htons(hdrlen + igmpreclen);
 	ip->ip_off = htons(IP_DF);
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_sum = 0;
 
 	/* Source stays INADDR_ANY unless the loopback case below fills it. */
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (m->m_flags & M_IGMP_LOOP) {
 		struct in_ifaddr *ia;
 
 		/*
 		 * NOTE(review): IFP_TO_IA() presumably returns a referenced
 		 * address, which is why ifa_free() follows -- confirm.
 		 */
 		IFP_TO_IA(ifp, ia);
 		if (ia != NULL) {
 			ip->ip_src = ia->ia_addr.sin_addr;
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
 
 	return (m);
 }
 
 #ifdef KTR
 /*
  * Translate an IGMPv3 group record type into a short printable name.
  * Any value that is not one of the six known record types maps to
  * "unknown".  Only compiled when KTR is defined.
  */
 static char *
 igmp_rec_type_to_str(const int type)
 {
 
 	if (type == IGMP_CHANGE_TO_EXCLUDE_MODE)
 		return "TO_EX";
 	if (type == IGMP_CHANGE_TO_INCLUDE_MODE)
 		return "TO_IN";
 	if (type == IGMP_MODE_IS_EXCLUDE)
 		return "MODE_EX";
 	if (type == IGMP_MODE_IS_INCLUDE)
 		return "MODE_IN";
 	if (type == IGMP_ALLOW_NEW_SOURCES)
 		return "ALLOW_NEW";
 	if (type == IGMP_BLOCK_OLD_SOURCES)
 		return "BLOCK_OLD";
 
 	return "unknown";
 }
 #endif
 
 /*
  * Global (non-vnet) initialisation, run through SYSINIT: initialise the
  * IGMP lock, allocate the m_raopt mbuf with igmp_ra_alloc(), and register
  * the igmp_nh netisr handler.
  */
 static void
 igmp_init(void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 
 	IGMP_LOCK_INIT();
 
 	m_raopt = igmp_ra_alloc();
 
 	netisr_register(&igmp_nh);
 }
 SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL);
 
 /*
  * Global teardown, run through SYSUNINIT.  Undoes igmp_init() in reverse
  * order: unregister the netisr handler, free m_raopt, destroy the lock.
  */
 static void
 igmp_uninit(void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 
 	netisr_unregister(&igmp_nh);
 
 	m_free(m_raopt);
 	m_raopt = NULL;	/* do not leave a dangling pointer behind */
 
 	IGMP_LOCK_DESTROY();
 }
 SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL);
 
 /*
  * Per-vnet initialisation, run through VNET_SYSINIT: start with an empty
  * V_igi_head list.
  */
 static void
 vnet_igmp_init(const void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 
 	LIST_INIT(&V_igi_head);
 }
 VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init,
     NULL);
 
 /*
  * Per-vnet teardown, run through VNET_SYSUNINIT.  Nothing is released
  * here; the function only asserts that V_igi_head is already empty.
  */
 static void
 vnet_igmp_uninit(const void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 
 	KASSERT(LIST_EMPTY(&V_igi_head),
 	    ("%s: igi list not empty; ifnets not detached?", __func__));
 }
 VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_igmp_uninit, NULL);
 
 /*
  * Module event handler.  MOD_LOAD and MOD_UNLOAD need no work here and
  * report success; every other event type is refused with EOPNOTSUPP.
  */
 static int
 igmp_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /* Module descriptor: name, event handler, and no private data. */
 static moduledata_t igmp_mod = {
     "igmp",
     igmp_modevent,
     0
 };
 DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet/ip_carp.c
===================================================================
--- head/sys/netinet/ip_carp.c	(revision 276691)
+++ head/sys/netinet/ip_carp.c	(revision 276692)
@@ -1,2198 +1,2198 @@
 /*-
  * Copyright (c) 2002 Michael Shalayeff.
  * Copyright (c) 2003 Ryan McBride.
  * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/counter.h>
 
 #include <net/ethernet.h>
 #include <net/fddi.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/iso88025.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip.h>
 #include <machine/in_cksum.h>
 #endif
 #ifdef INET
 #include <netinet/ip_var.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <crypto/sha1.h>
 
 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
 
 /*
  * Per-vhid CARP state.  One softc exists for each virtual host id
  * configured on an interface; it is linked both on its interface's
  * carp_if list (sc_list) and on the global carp_list (sc_next).
  */
 struct carp_softc {
 	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
 	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
 	struct sockaddr_dl	sc_addr;	/* Our link level address. */
 	struct callout		sc_ad_tmo;	/* Advertising timeout. */
 #ifdef INET
 	struct callout		sc_md_tmo;	/* Master down timeout. */
 #endif
 #ifdef INET6
 	struct callout 		sc_md6_tmo;	/* XXX: Master down timeout. */
 #endif
 	struct mtx		sc_mtx;		/* Serialises input, callouts, ioctls. */
 
 	int			sc_vhid;	/* Virtual host id. */
 	int			sc_advskew;	/* Configured advertisement skew. */
 	int			sc_advbase;	/* Advertisement base interval. */
 
 	int			sc_naddrs;	/* Entries in sc_ifas counted for IPv4. */
 	int			sc_naddrs6;	/* Entries in sc_ifas counted for IPv6. */
 	int			sc_ifasiz;	/* presumably allocated size of sc_ifas -- verify */
 	/* Protocol state of this vhid. */
 	enum { INIT = 0, BACKUP, MASTER }	sc_state;
 	int			sc_suppress;
 	int			sc_sendad_errors;	/* Send failures counted so far. */
 #define	CARP_SENDAD_MAX_ERRORS	3
 	int			sc_sendad_success;	/* Successful sends since last error. */
 #define	CARP_SENDAD_MIN_SUCCESS 3
 
 	int			sc_init_counter; /* Nonzero: pick a fresh random counter. */
 	uint64_t		sc_counter;	/* Advertisement counter fed to the HMAC. */
 
 	/* authentication */
 #define	CARP_HMAC_PAD	64
 	unsigned char sc_key[CARP_KEY_LEN];	/* Shared secret. */
 	unsigned char sc_pad[CARP_HMAC_PAD];	/* HMAC pad derived from sc_key. */
 	SHA1_CTX sc_sha1;			/* Precomputed start of the inner hash. */
 
 	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
 	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
 };
 
 /*
  * Per-interface CARP state, reached through ifp->if_carp.  Holds the list
  * of softcs (one per vhid) configured on the interface together with the
  * multicast options passed to ip_output()/ip6_output() for advertisements.
  */
 struct carp_if {
 #ifdef INET
 	int	cif_naddrs;
 #endif
 #ifdef INET6
 	int	cif_naddrs6;
 #endif
 	TAILQ_HEAD(, carp_softc) cif_vrs;	/* Softcs on this interface. */
 #ifdef INET
 	struct ip_moptions 	 cif_imo;	/* IPv4 multicast send options. */
 #endif
 #ifdef INET6
 	struct ip6_moptions 	 cif_im6o;	/* IPv6 multicast send options. */
 #endif
 	struct ifnet	*cif_ifp;		/* Owning interface. */
 	struct mtx	cif_mtx;		/* Taken by CIF_LOCK(). */
 	uint32_t	cif_flags;
 #define	CIF_PROMISC	0x00000001
 };
 
 /*
  * Indices into proto_reg[], one slot per address family.  Both slots
  * start at -1.
  * NOTE(review): presumably -1 means "protocol not registered" -- confirm
  * against the module load/unload code.
  */
 #define	CARP_INET	0
 #define	CARP_INET6	1
 static int proto_reg[] = {-1, -1};
 
 /*
  * Brief design of carp(4).
  *
  * Any carp-capable ifnet may have a list of carp softcs hanging off
  * its ifp->if_carp pointer. Each softc represents one unique virtual
  * host id, or vhid. The softc has a back pointer to the ifnet. All
  * softcs are joined in a global list, which has quite limited use.
  *
  * Any interface address that takes part in CARP negotiation has a
  * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
  * AF_INET or AF_INET6 address.
  *
  * Although, one can get the softc's backpointer to ifnet and traverse
  * through its ifp->if_addrhead queue to find all interface addresses
  * involved in CARP, we keep a growable array of ifaddr pointers. This
  * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
  * do calls into the network stack, thus avoiding LORs.
  *
  * Locking:
  *
  * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
  * callout-driven events and ioctl()s.
  *
  * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to
  * traverse the global list we use the mutex carp_mtx.
  *
  * Known issues with locking:
  *
  * - There is no protection for races between two ioctl() requests,
  *   neither SIOCSVH, nor SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all
  *   interface ioctl()s should be serialized right in net/if.c.
  * - Sending ad, we put the pointer to the softc in an mtag, and no reference
  *   counting is done on the softc.
  * - On module unload we may race (?) with packet processing thread
  *   dereferencing our function pointers.
  */
 
 /* Accept incoming CARP packets. */
 static VNET_DEFINE(int, carp_allow) = 1;
 #define	V_carp_allow	VNET(carp_allow)
 
 /* Preempt slower nodes. */
 static VNET_DEFINE(int, carp_preempt) = 0;
 #define	V_carp_preempt	VNET(carp_preempt)
 
 /* Log level. */
 static VNET_DEFINE(int, carp_log) = 1;
 #define	V_carp_log	VNET(carp_log)
 
 /* Global advskew demotion. */
 static VNET_DEFINE(int, carp_demotion) = 0;
 #define	V_carp_demotion	VNET(carp_demotion)
 
 /* Send error demotion factor. */
 static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW;
 #define	V_carp_senderr_adj	VNET(carp_senderr_adj)
 
 /* Iface down demotion factor. */
 static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW;
 #define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)
 
 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net_inet, IPPROTO_CARP,	carp,	CTLFLAG_RW, 0,	"CARP");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_log), 0, "CARP log level");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
     0, 0, carp_demote_adj_sysctl, "I",
     "Adjust demotion factor (skew of advskew)");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_ifdown_adj), 0,
     "Interface down demotion factor adjustment");
 
 VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
 VNET_PCPUSTAT_SYSINIT(carpstats);
 VNET_PCPUSTAT_SYSUNINIT(carpstats);
 
 /*
  * Add 'val' to the per-vnet counter for field 'name' of struct carpstats.
  * The counter array is indexed by the field's offset divided by
  * sizeof(uint64_t), so this relies on every field being a uint64_t.
  */
 #define	CARPSTATS_ADD(name, val)	\
     counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
 	sizeof(uint64_t)], (val))
 /* Increment the named CARP statistic by one. */
 #define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)
 
 SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
     carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
 
 /* Per-softc mutex wrappers (sc_mtx). */
 #define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
 	NULL, MTX_DEF)
 #define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
 #define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 #define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
 /* Per-interface mutex wrappers (cif_mtx). */
 #define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
 	NULL, MTX_DEF)
 #define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
 #define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
 #define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
 #define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
 /*
  * Called with the cif lock held.  If no softc remains on the interface the
  * carp_if is handed to carp_free_if(); otherwise the lock is simply dropped.
  * NOTE(review): carp_free_if() is presumably responsible for the lock in
  * the empty case, since no unlock happens on that path -- confirm.
  */
 #define	CIF_FREE(cif)	do {				\
 		CIF_LOCK_ASSERT(cif);			\
 		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
 			carp_free_if(cif);		\
 		else					\
 			CIF_UNLOCK(cif);		\
 } while (0)
 
 /*
  * Log at LOG_INFO with a "carp: " prefix when the carp_log level is above 0.
  * The first argument must be a string literal, because it is concatenated
  * with the prefix.
  */
 #define	CARP_LOG(...)	do {				\
 	if (V_carp_log > 0)				\
 		log(LOG_INFO, "carp: " __VA_ARGS__);	\
 } while (0)
 
 /* Log at LOG_DEBUG (no prefix) when the carp_log level is above 1. */
 #define	CARP_DEBUG(...)	do {				\
 	if (V_carp_log > 1)				\
 		log(LOG_DEBUG, __VA_ARGS__);		\
 } while (0)
 
 /*
  * Iteration helpers.
  *
  * Each of these expands to a lock assertion STATEMENT followed by a loop
  * header, so they must only be used where two statements are legal (never
  * as the unbraced body of an if/else/for).  The statement or block that
  * follows the macro becomes the loop body.
  */
 
 /*
  * Walk the addresses of ifp that belong to CARP (ifa_carp != NULL).
  * The trailing "if" filters the loop body; ifa is NULL after a full walk.
  */
 #define	IFNET_FOREACH_IFA(ifp, ifa)					\
 	IF_ADDR_LOCK_ASSERT(ifp);					\
 	TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)		\
 		if ((ifa)->ifa_carp != NULL)
 
 /*
  * Walk the softc's own sc_ifas[] array, stopping at the first NULL slot
  * or after sc_naddrs + sc_naddrs6 entries.  Every use of the 'sc'
  * argument is parenthesised so that an expression may be passed.
  */
 #define	CARP_FOREACH_IFA(sc, ifa)					\
 	CARP_LOCK_ASSERT(sc);						\
 	for (int _i = 0;						\
 		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
 		((ifa) = (sc)->sc_ifas[_i]) != NULL;			\
 		++_i)
 
 /* Walk every softc configured on ifp; the cif lock must be held. */
 #define	IFNET_FOREACH_CARP(ifp, sc)					\
 	CIF_LOCK_ASSERT((ifp)->if_carp);				\
 	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
 
 /*
  * Effective advertisement skew: the softc's configured skew plus the
  * global demotion counter, capped at CARP_MAXSKEW.  Note that 'sc' is
  * evaluated more than once.
  */
 #define	DEMOTE_ADVSKEW(sc)					\
     (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
     CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion))
 
 static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
 static struct carp_softc
 		*carp_alloc(struct ifnet *);
 static void	carp_detach_locked(struct ifaddr *);
 static void	carp_destroy(struct carp_softc *);
 static struct carp_if
 		*carp_alloc_if(struct ifnet *);
 static void	carp_free_if(struct carp_if *);
 static void	carp_set_state(struct carp_softc *, int);
 static void	carp_sc_state(struct carp_softc *);
 static void	carp_setrun(struct carp_softc *, sa_family_t);
 static void	carp_master_down(void *);
 static void	carp_master_down_locked(struct carp_softc *);
 static void	carp_send_ad(void *);
 static void	carp_send_ad_locked(struct carp_softc *);
 static void	carp_addroute(struct carp_softc *);
 static void	carp_ifa_addroute(struct ifaddr *);
 static void	carp_delroute(struct carp_softc *);
 static void	carp_ifa_delroute(struct ifaddr *);
 static void	carp_send_ad_all(void *, int);
 static void	carp_demote_adj(int, char *);
 
 /* Global list of every softc (linked through sc_next). */
 static LIST_HEAD(, carp_softc) carp_list;
 /* Held while traversing carp_list. */
 static struct mtx carp_mtx;
 /* Task that runs carp_send_ad_all() from taskqueue context. */
 static struct task carp_sendall_task =
     TASK_INITIALIZER(0, carp_send_ad_all, NULL);
 
 /*
  * Precompute the parts of the HMAC-SHA1 that do not change between
  * advertisements and store them in the softc:
  *
  *  - sc_sha1 receives the inner hash state after absorbing the ipad
  *    block, the protocol version, type, vhid and every configured
  *    address (IPv4 first, then IPv6, each family in ascending order);
  *  - sc_pad is left holding the opad block for the outer hash.
  *
  * carp_hmac_generate() later resumes from this state.  The softc lock
  * must be held.
  */
 static void
 carp_hmac_prepare(struct carp_softc *sc)
 {
 	uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
 	uint8_t vhid = sc->sc_vhid & 0xff;
 	struct ifaddr *ifa;
 	int i, found;
 #ifdef INET
 	struct in_addr last, cur, in;
 #endif
 #ifdef INET6
 	struct in6_addr last6, cur6, in6;
 #endif
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Compute ipad from key. */
 	bzero(sc->sc_pad, sizeof(sc->sc_pad));
 	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36;
 
 	/* Precompute first part of inner hash. */
 	SHA1Init(&sc->sc_sha1);
 	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
 	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
 	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
 #ifdef INET
 	/*
 	 * Feed the IPv4 addresses in ascending (host-order) sequence without
 	 * sorting the array: each pass picks the smallest address that is
 	 * strictly greater than the one hashed on the previous pass.  The
 	 * loop ends when a pass finds no such address.  Because both bounds
 	 * are exclusive, 0.0.0.0 and 255.255.255.255 are never hashed.
 	 */
 	cur.s_addr = 0;
 	do {
 		found = 0;
 		last = cur;
 		cur.s_addr = 0xffffffff;
 		CARP_FOREACH_IFA(sc, ifa) {
 			/*
 			 * 'in' is read before the family check; it is only
 			 * used when the entry really is AF_INET.
 			 */
 			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			if (ifa->ifa_addr->sa_family == AF_INET &&
 			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
 			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
 				cur.s_addr = in.s_addr;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
 	} while (found);
 #endif /* INET */
 #ifdef INET6
 	/* Same ascending selection for IPv6, compared bytewise with memcmp(). */
 	memset(&cur6, 0, sizeof(cur6));
 	do {
 		found = 0;
 		last6 = cur6;
 		memset(&cur6, 0xff, sizeof(cur6));
 		CARP_FOREACH_IFA(sc, ifa) {
 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
 			/* Clear the second 16-bit word of scope-embedded addresses. */
 			if (IN6_IS_SCOPE_EMBED(&in6))
 				in6.s6_addr16[1] = 0;
 			if (ifa->ifa_addr->sa_family == AF_INET6 &&
 			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
 			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
 				cur6 = in6;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
 	} while (found);
 #endif /* INET6 */
 
 	/* convert ipad to opad */
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
 }
 
 /*
  * Produce the 20-byte HMAC-SHA1 digest for an advertisement carrying
  * 'counter', writing it to 'md'.
  *
  * The inner hash resumes from the state saved in sc->sc_sha1 and absorbs
  * the counter; the outer hash covers sc->sc_pad followed by the inner
  * digest.  The softc lock must be held.
  */
 static void
 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	SHA1_CTX ctx;
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Inner hash: work on a private copy of the precomputed state. */
 	ctx = sc->sc_sha1;
 	SHA1Update(&ctx, (void *)counter, sizeof(sc->sc_counter));
 	SHA1Final(md, &ctx);
 
 	/* Outer hash: pad block, then the inner digest just produced. */
 	SHA1Init(&ctx);
 	SHA1Update(&ctx, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&ctx, md, 20);
 	SHA1Final(md, &ctx);
 }
 
 /*
  * Check a received digest against the one we compute ourselves for the
  * same counter.  Returns 0 when the two match and nonzero otherwise.
  * The softc lock must be held.
  */
 static int
 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	unsigned char expected[20];
 	int differs;
 
 	CARP_LOCK_ASSERT(sc);
 
 	carp_hmac_generate(sc, counter, expected);
 	differs = bcmp(md, expected, sizeof(expected));
 
 	return (differs);
 }
 
 /*
  * process input packet.
  * we have rearranged checks order compared to the rfc,
  * but it seems more efficient this way or not possible otherwise.
  */
 #ifdef INET
 /*
  * IPv4 protocol input for CARP.
  *
  * Takes ownership of *mp (which is set to NULL), validates the TTL,
  * the length and the CARP checksum, and passes good packets on to
  * carp_input_c().  Always returns IPPROTO_DONE; the mbuf is consumed on
  * every path.  offp and proto are not needed: the IP header length is
  * taken from the header itself.
  */
 int
 carp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct carp_header *ch;
 	int iplen, len;
 
 	*mp = NULL;
 
 	CARPSTATS_INC(carps_ipackets);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255.  */
 	if (ip->ip_ttl != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip->ip_ttl,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	iplen = ip->ip_hl << 2;
 	/* Bytes we need contiguous: IP header plus the CARP header. */
 	len = iplen + sizeof(*ch);
 
 	/* The packet must be long enough to hold a complete CARP header. */
 	if (m->m_pkthdr.len < len) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
 		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Make both headers contiguous.  m_pullup() frees the chain on
 	 * failure, so there is nothing left to release in that case.
 	 */
 	if ((m = m_pullup(m, len)) == NULL) {
 		CARPSTATS_INC(carps_hdrops);
 		CARP_DEBUG("%s: pullup failed\n", __func__);
 		return (IPPROTO_DONE);
 	}
 	/* The head mbuf may have changed; recompute both header pointers. */
 	ip = mtod(m, struct ip *);
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/*
 	 * verify the CARP checksum: temporarily advance m_data past the IP
 	 * header so that in_cksum() starts at the CARP header.
 	 */
 	m->m_data += iplen;
 	if (in_cksum(m, len - iplen)) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iplen;
 
 	carp_input_c(m, ch, AF_INET);
 	return (IPPROTO_DONE);
 }
 #endif
 
 #ifdef INET6
 /*
  * IPv6 protocol input for CARP.
  *
  * Validates that the packet arrived on a CARP-enabled interface with hop
  * limit 255 and a correct CARP checksum, then passes it to carp_input_c().
  * Always returns IPPROTO_DONE.
  */
 int
 carp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct carp_header *ch;
 	u_int len;
 
 	CARPSTATS_INC(carps_ipackets6);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* check if received on a valid carp interface */
 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
 		CARPSTATS_INC(carps_badif);
 		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
 		    __func__, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255 */
 	if (ip6->ip6_hlim != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that we have a complete carp packet */
 	/* len is captured up front only for the debug message below. */
 	len = m->m_len;
 	/*
 	 * NOTE(review): on failure IP6_EXTHDR_GET() leaves ch NULL and has
 	 * presumably already freed m, which is why this path does not call
 	 * m_freem() -- confirm against the macro definition.
 	 */
 	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
 	if (ch == NULL) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
 		return (IPPROTO_DONE);
 	}
 
 
 	/*
 	 * verify the CARP checksum: temporarily advance m_data to the CARP
 	 * header so that in_cksum() covers exactly that header.
 	 */
 	m->m_data += *offp;
 	if (in_cksum(m, sizeof(*ch))) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= *offp;
 
 	carp_input_c(m, ch, AF_INET6);
 	return (IPPROTO_DONE);
 }
 #endif /* INET6 */
 
 /*
  * Address-family independent part of CARP input.
  *
  * Finds the softc whose vhid matches the advertisement on the receiving
  * interface, verifies version and HMAC, records the sender's counter and
  * then runs the state machine: a MASTER steps down when the sender's
  * timing is less than or equal to ours, a BACKUP either takes over from
  * a slower master or re-arms its master-down timer.  The mbuf is always
  * consumed.
  */
 static void
 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ifaddr *ifa;
 	struct carp_softc *sc;
 	uint64_t tmp_counter;
 	struct timeval sc_tv, ch_tv;
 
 	/* verify that the VHID is valid on the receiving interface */
 	IF_ADDR_RLOCK(ifp);
 	/*
 	 * Take a reference on the matching address so it stays valid once
 	 * the address lock is dropped.  If nothing matches, the loop ends
 	 * with ifa == NULL.
 	 */
 	IFNET_FOREACH_IFA(ifp, ifa)
 		if (ifa->ifa_addr->sa_family == af &&
 		    ifa->ifa_carp->sc_vhid == ch->carp_vhid) {
 			ifa_ref(ifa);
 			break;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	if (ifa == NULL) {
 		CARPSTATS_INC(carps_badvhid);
 		m_freem(m);
 		return;
 	}
 
 	/* verify the CARP version. */
 	if (ch->carp_version != CARP_VERSION) {
 		CARPSTATS_INC(carps_badver);
 		CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
 		    ch->carp_version);
 		ifa_free(ifa);
 		m_freem(m);
 		return;
 	}
 
 	/* Lock the softc first, then drop the address reference. */
 	sc = ifa->ifa_carp;
 	CARP_LOCK(sc);
 	ifa_free(ifa);
 
 	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
 		CARPSTATS_INC(carps_badauth);
 		CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
 		    sc->sc_vhid, ifp->if_xname);
 		goto out;
 	}
 
 	/* Reassemble the 64-bit counter from its two network-order halves. */
 	tmp_counter = ntohl(ch->carp_counter[0]);
 	tmp_counter = tmp_counter<<32;
 	tmp_counter += ntohl(ch->carp_counter[1]);
 
 	/* XXX Replay protection goes here */
 
 	/* Adopt the sender's counter; no random re-initialisation needed. */
 	sc->sc_init_counter = 0;
 	sc->sc_counter = tmp_counter;
 
 	/*
 	 * Express both our own and the sender's advertisement timing as a
 	 * timeval: advbase seconds plus advskew/256 of a second.
 	 */
 	sc_tv.tv_sec = sc->sc_advbase;
 	sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
 	ch_tv.tv_sec = ch->carp_advbase;
 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
 
 	switch (sc->sc_state) {
 	case INIT:
 		break;
 	case MASTER:
 		/*
 		 * If we receive an advertisement from a master who's going to
 		 * be more frequent than us, go into BACKUP state.
 		 * (A tie also makes us step down.)
 		 */
 		if (timevalcmp(&sc_tv, &ch_tv, >) ||
 		    timevalcmp(&sc_tv, &ch_tv, ==)) {
 			callout_stop(&sc->sc_ad_tmo);
 			CARP_LOG("VHID %u@%s: MASTER -> BACKUP "
 			    "(more frequent advertisement received)\n",
 			    sc->sc_vhid,
 			    sc->sc_carpdev->if_xname);
 			carp_set_state(sc, BACKUP);
 			carp_setrun(sc, 0);
 			carp_delroute(sc);
 		}
 		break;
 	case BACKUP:
 		/*
 		 * If we're pre-empting masters who advertise slower than us,
 		 * and this one claims to be slower, treat him as down.
 		 */
 		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
 			CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
 			    "(preempting a slower master)\n",
 			    sc->sc_vhid,
 			    sc->sc_carpdev->if_xname);
 			carp_master_down_locked(sc);
 			break;
 		}
 
 		/*
 		 * If the master is going to advertise at such a low frequency
 		 * that he's guaranteed to time out, we might as well just
 		 * treat him as timed out now.
 		 */
 		sc_tv.tv_sec = sc->sc_advbase * 3;
 		if (timevalcmp(&sc_tv, &ch_tv, <)) {
 			CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
 			    "(master timed out)\n",
 			    sc->sc_vhid,
 			    sc->sc_carpdev->if_xname);
 			carp_master_down_locked(sc);
 			break;
 		}
 
 		/*
 		 * Otherwise, we reset the counter and wait for the next
 		 * advertisement.
 		 */
 		carp_setrun(sc, af);
 		break;
 	}
 
 out:
 	CARP_UNLOCK(sc);
 	m_freem(m);
 }
 
 /*
  * Finish an outgoing advertisement: advance (or randomly initialise) the
  * softc's counter, store it in the header in network byte order, compute
  * the HMAC over it, and tag the mbuf with a pointer to the softc.
  *
  * Returns 0 on success.  If the tag cannot be allocated the mbuf is
  * freed and ENOMEM is returned.
  */
 static int
 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
 {
 	struct m_tag *tag;
 	uint64_t seed;
 
 	if (sc->sc_init_counter) {
 		/* this could also be seconds since unix epoch */
 		seed = arc4random();
 		seed <<= 32;
 		seed += arc4random();
 		sc->sc_counter = seed;
 	} else {
 		sc->sc_counter++;
 	}
 
 	/* High word first, then low word. */
 	ch->carp_counter[0] = htonl((sc->sc_counter >> 32) & 0xffffffff);
 	ch->carp_counter[1] = htonl(sc->sc_counter & 0xffffffff);
 
 	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
 
 	/* Tag packet for carp_output */
 	tag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
 	    M_NOWAIT);
 	if (tag == NULL) {
 		m_freem(m);
 		CARPSTATS_INC(carps_onomem);
 		return (ENOMEM);
 	}
 
 	/* The tag payload is the softc pointer itself. */
 	bcopy(&sc, tag + 1, sizeof(sc));
 	m_tag_prepend(m, tag);
 
 	return (0);
 }
 
 /*
  * Send an advertisement from every softc that is currently MASTER.
  *
  * To avoid LORs and possible recursions this function shouldn't
  * be called directly, but scheduled via taskqueue.
  */
 static void
 carp_send_ad_all(void *ctx __unused, int pending __unused)
 {
 	struct carp_softc *cur;
 
 	mtx_lock(&carp_mtx);
 	LIST_FOREACH(cur, &carp_list, sc_next) {
 		/* Only masters advertise. */
 		if (cur->sc_state != MASTER)
 			continue;
 
 		CARP_LOCK(cur);
 		CURVNET_SET(cur->sc_carpdev->if_vnet);
 		carp_send_ad_locked(cur);
 		CURVNET_RESTORE();
 		CARP_UNLOCK(cur);
 	}
 	mtx_unlock(&carp_mtx);
 }
 
 /*
  * Send a periodic advertisement, executed in callout context.
  *
  * 'v' is the softc.  Its lock is asserted to be held on entry and is
  * released before returning.  The interface's vnet is made current around
  * the actual send.
  */
 static void
 carp_send_ad(void *v)
 {
 	struct carp_softc *sc = v;
 
 	CARP_LOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	carp_send_ad_locked(sc);
 	CURVNET_RESTORE();
 	CARP_UNLOCK(sc);
 }
 
 /*
  * Account for the result of sending one advertisement and drive the
  * send-error demotion.
  *
  *  - Every failure bumps sc_sendad_errors and clears sc_sendad_success.
  *    On reaching exactly CARP_SENDAD_MAX_ERRORS the demotion factor is
  *    raised once by V_carp_senderr_adj.
  *  - While demoted, successes are counted; after
  *    CARP_SENDAD_MIN_SUCCESS of them in a row the demotion is reverted
  *    and the error counter is cleared.  The error counter is deliberately
  *    NOT cleared before that point, otherwise the demotion could never be
  *    undone.
  *  - A success while not demoted simply clears the error counter.
  *
  * 'error' is the return value of ip_output()/ip6_output(); 0 means success.
  */
 static void
 carp_send_ad_error(struct carp_softc *sc, int error)
 {
 
 	if (error) {
 		if (sc->sc_sendad_errors < INT_MAX)
 			sc->sc_sendad_errors++;
 		if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
 			static const char fmt[] = "send error %d on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			/* Bounded: never write past msg, whatever 'error' is. */
 			snprintf(msg, sizeof(msg), fmt, error,
 			    sc->sc_carpdev->if_xname);
 			carp_demote_adj(V_carp_senderr_adj, msg);
 		}
 		sc->sc_sendad_success = 0;
 	} else if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
 		/* Demoted: wait for enough consecutive successes. */
 		if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
 			static const char fmt[] = "send ok on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			snprintf(msg, sizeof(msg), fmt,
 			    sc->sc_carpdev->if_xname);
 			carp_demote_adj(-V_carp_senderr_adj, msg);
 			sc->sc_sendad_errors = 0;
 		}
 	} else {
 		/* Not demoted: a success wipes out earlier failures. */
 		sc->sc_sendad_errors = 0;
 	}
 }
 
 /*
  * Build and transmit one advertisement per configured address family for
  * this softc, then re-arm the advertisement callout.
  *
  * A header template is filled in once and copied into an IPv4 packet (if
  * the softc has IPv4 addresses) and an IPv6 packet (if it has IPv6
  * addresses).  Any failure while building a packet jumps straight to the
  * reschedule step, so a failure in the IPv4 half also skips the IPv6 half
  * for this round.  The softc lock must be held.
  */
 static void
 carp_send_ad_locked(struct carp_softc *sc)
 {
 	struct carp_header ch;
 	struct timeval tv;
 	struct sockaddr sa;
 	struct ifaddr *ifa;
 	struct carp_header *ch_ptr;
 	struct mbuf *m;
 	int len, advskew;
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Interval until the next advertisement: advbase s + advskew/256 s. */
 	advskew = DEMOTE_ADVSKEW(sc);
 	tv.tv_sec = sc->sc_advbase;
 	tv.tv_usec = advskew * 1000000 / 256;
 
 	/* Header template shared by both address families. */
 	ch.carp_version = CARP_VERSION;
 	ch.carp_type = CARP_ADVERTISEMENT;
 	ch.carp_vhid = sc->sc_vhid;
 	ch.carp_advbase = sc->sc_advbase;
 	ch.carp_advskew = advskew;
 	ch.carp_authlen = 7;	/* XXX DEFINE */
 	ch.carp_pad1 = 0;	/* must be zero */
 	ch.carp_cksum = 0;
 
 	/* XXXGL: OpenBSD picks first ifaddr with needed family. */
 
 #ifdef INET
 	if (sc->sc_naddrs) {
 		struct ip *ip;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
-		MH_ALIGN(m, m->m_len);
+		M_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		ip->ip_len = htons(len);
 		ip->ip_id = ip_newid();
 		ip->ip_off = htons(IP_DF);
 		ip->ip_ttl = CARP_DFLTTL;
 		ip->ip_p = IPPROTO_CARP;
 		ip->ip_sum = 0;
 
 		/*
 		 * Look up an IPv4 address on the parent interface using a
 		 * sockaddr that carries only the family; fall back to
 		 * 0.0.0.0 as source if none is found.
 		 */
 		bzero(&sa, sizeof(sa));
 		sa.sa_family = AF_INET;
 		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
 		if (ifa != NULL) {
 			ip->ip_src.s_addr =
 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			ifa_free(ifa);
 		} else
 			ip->ip_src.s_addr = 0;
 		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
 
 		/* The CARP header sits directly after the IP header. */
 		ch_ptr = (struct carp_header *)(&ip[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		/* carp_prepare_ad() frees m itself when it fails. */
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		/* Checksum the CARP header only: skip the IP header briefly. */
 		m->m_data += sizeof(*ip);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
 		m->m_data -= sizeof(*ip);
 
 		CARPSTATS_INC(carps_opackets);
 
 		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
 		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
 	}
 #endif /* INET */
 #ifdef INET6
 	if (sc->sc_naddrs6) {
 		struct ip6_hdr *ip6;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip6) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
-		MH_ALIGN(m, m->m_len);
+		M_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip6 = mtod(m, struct ip6_hdr *);
 		bzero(ip6, sizeof(*ip6));
 		ip6->ip6_vfc |= IPV6_VERSION;
 		ip6->ip6_hlim = CARP_DFLTTL;
 		ip6->ip6_nxt = IPPROTO_CARP;
 		bzero(&sa, sizeof(sa));
 
 		/* set the source address */
 		sa.sa_family = AF_INET6;
 		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
 		if (ifa != NULL) {
 			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
 			    sizeof(struct in6_addr));
 			ifa_free(ifa);
 		} else
 			/* This should never happen with IPv6. */
 			bzero(&ip6->ip6_src, sizeof(struct in6_addr));
 
 		/* Set the multicast destination. */
 		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
 		ip6->ip6_dst.s6_addr8[15] = 0x12;
 		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
 			m_freem(m);
 			CARP_DEBUG("%s: in6_setscope failed\n", __func__);
 			goto resched;
 		}
 
 		/* The CARP header sits directly after the IPv6 header. */
 		ch_ptr = (struct carp_header *)(&ip6[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		/* carp_prepare_ad() frees m itself when it fails. */
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		/* Checksum the CARP header only: skip the IPv6 header briefly. */
 		m->m_data += sizeof(*ip6);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
 		m->m_data -= sizeof(*ip6);
 
 		CARPSTATS_INC(carps_opackets6);
 
 		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
 		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
 	}
 #endif /* INET6 */
 
 resched:
 	/* Always re-arm, whether or not this round's send succeeded. */
 	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
 }
 
 static void
 carp_addroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_addroute(ifa);
 }
 
 /*
  * Add the routing state for a single address, selected by address family.
  * Addresses of a family that is not compiled in are left untouched.
  */
 static void
 carp_ifa_addroute(struct ifaddr *ifa)
 {
 
 #ifdef INET
 	if (ifa->ifa_addr->sa_family == AF_INET) {
 		in_addprefix(ifatoia(ifa), RTF_UP);
 		ifa_add_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
 		return;
 	}
 #endif
 #ifdef INET6
 	if (ifa->ifa_addr->sa_family == AF_INET6) {
 		ifa_add_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
 		nd6_add_ifa_lle(ifatoia6(ifa));
 		return;
 	}
 #endif
 }
 
 static void
 carp_delroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_delroute(ifa);
 }
 
 /*
  * Remove the routing state for a single address, selected by address
  * family.  Addresses of a family that is not compiled in are left untouched.
  */
 static void
 carp_ifa_delroute(struct ifaddr *ifa)
 {
 
 #ifdef INET
 	if (ifa->ifa_addr->sa_family == AF_INET) {
 		ifa_del_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
 		in_scrubprefix(ifatoia(ifa), LLE_STATIC);
 		return;
 	}
 #endif
 #ifdef INET6
 	if (ifa->ifa_addr->sa_family == AF_INET6) {
 		ifa_del_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
 		nd6_rem_ifa_lle(ifatoia6(ifa));
 		return;
 	}
 #endif
 }
 
 int
 carp_master(struct ifaddr *ifa)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	return (sc->sc_state == MASTER);
 }
 
 #ifdef INET
 /*
  * Broadcast a gratuitous ARP request containing
  * the virtual router MAC address for each IP address
  * associated with the virtual router.
  */
 static void
 carp_send_arp(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr));
 }
 
 /*
  * If the vhid owning this address is MASTER, store a pointer to its
  * link-level address in *enaddr and return 1.  Otherwise return 0 and
  * leave *enaddr untouched.
  */
 int
 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	if (sc->sc_state != MASTER)
 		return (0);
 
 	*enaddr = LLADDR(&sc->sc_addr);
 	return (1);
 }
 #endif
 
 #ifdef INET6
 /*
  * Send a neighbor advertisement with the override flag set to the
  * link-local all-nodes group for each IPv6 address of the virtual router,
  * pausing briefly after each one.
  */
 static void
 carp_send_na(struct carp_softc *sc)
 {
 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa) {
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			nd6_na_output(sc->sc_carpdev, &mcast, IFA_IN6(ifa),
 			    ND_NA_FLAG_OVERRIDE, 1, NULL);
 			DELAY(1000);	/* XXX */
 		}
 	}
 }
 
 /*
  * Look up taddr among the IPv6 addresses of ifp.  Only the first address
  * that compares equal is considered: it is returned, with a reference
  * taken, if it is not a carp address or if its vhid is MASTER.  In every
  * other case (no match, or a carp address that is not MASTER) the result
  * is NULL.
  */
 struct ifaddr *
 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
 {
 	struct ifaddr *ifa, *match;
 
 	match = NULL;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
 			if (ifa->ifa_carp == NULL ||
 			    ifa->ifa_carp->sc_state == MASTER) {
 				ifa_ref(ifa);
 				match = ifa;
 			}
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (match);
 }
 
 /*
  * Find the IPv6 address taddr on ifp and return the link-level address of
  * the vhid it belongs to.  On the way, try to tag mbuf m with a
  * PACKET_TAG_CARP tag carrying the softc pointer (carp_output() reads that
  * tag back); if the tag cannot be allocated the address is still returned,
  * just without the tag.
  *
  * Returns NULL when taddr is not configured on ifp, or when the matching
  * address is not attached to any vhid (ifa_carp == NULL).  The latter case
  * used to dereference a NULL softc.
  */
 caddr_t
 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
 {
 	struct ifaddr *ifa;
 
 	IF_ADDR_RLOCK(ifp);
 	IFNET_FOREACH_IFA(ifp, ifa)
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
 			struct carp_softc *sc = ifa->ifa_carp;
 			struct m_tag *mtag;
 
 			IF_ADDR_RUNLOCK(ifp);
 
 			/* Plain (non-carp) address: no virtual MAC to offer. */
 			if (sc == NULL)
 				return (NULL);
 
 			mtag = m_tag_get(PACKET_TAG_CARP,
 			    sizeof(struct carp_softc *), M_NOWAIT);
 			if (mtag == NULL)
 				/* Better a bit than nothing. */
 				return (LLADDR(&sc->sc_addr));
 
 			/* The tag payload is the softc pointer itself. */
 			bcopy(&sc, mtag + 1, sizeof(sc));
 			m_tag_prepend(m, mtag);
 
 			return (LLADDR(&sc->sc_addr));
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (NULL);
 }
 #endif /* INET6 */
 
 /*
  * Decide whether a frame addressed to dhost is for one of the vhids on
  * ifp.  The address must start with 00:00:5e:00:01 and equal the link-level
  * address of a vhid that is currently MASTER.  Returns 1 on a match and 0
  * otherwise.
  */
 int
 carp_forus(struct ifnet *ifp, u_char *dhost)
 {
 	struct carp_softc *sc;
 	uint8_t *ena = dhost;
 	int mine;
 
 	/* Cheap prefix test before taking any lock. */
 	if (!(ena[0] == 0 && ena[1] == 0 && ena[2] == 0x5e &&
 	    ena[3] == 0 && ena[4] == 1))
 		return (0);
 
 	mine = 0;
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		CARP_LOCK(sc);
 		if (sc->sc_state == MASTER &&
 		    bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN) == 0)
 			mine = 1;
 		CARP_UNLOCK(sc);
 		if (mine)
 			break;
 	}
 	CIF_UNLOCK(ifp->if_carp);
 
 	return (mine);
 }
 
 /*
  * Master down timeout event, executed in callout context.
  *
  * Entered with the softc lock held (asserted below) and returns with it
  * released: the callouts that run this handler are set up with
  * CALLOUT_RETURNUNLOCKED.  The vhid is promoted only if it is still in
  * BACKUP state when the timer fires; in any other state the handler just
  * drops the lock.
  */
 static void
 carp_master_down(void *v)
 {
 	struct carp_softc *sc = v;
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Run in the vnet of the parent interface. */
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	if (sc->sc_state == BACKUP) {
 		CARP_LOG("VHID %u@%s: BACKUP -> MASTER (master down)\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname);
 		carp_master_down_locked(sc);
 	}
 	CURVNET_RESTORE();
 
 	CARP_UNLOCK(sc);
 }
 
 /*
  * Perform the BACKUP -> MASTER transition with the softc lock held.
  *
  * In BACKUP state: switch to MASTER, call carp_send_ad_locked(), announce
  * the addresses (carp_send_arp() for INET, carp_send_na() for INET6),
  * re-arm the timers through carp_setrun() and add the routes.
  *
  * Being called in INIT or MASTER state panics when INVARIANTS is defined
  * and is a no-op otherwise.
  */
 static void
 carp_master_down_locked(struct carp_softc *sc)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	switch (sc->sc_state) {
 	case BACKUP:
 		carp_set_state(sc, MASTER);
 		carp_send_ad_locked(sc);
 #ifdef INET
 		carp_send_arp(sc);
 #endif
 #ifdef INET6
 		carp_send_na(sc);
 #endif
 		carp_setrun(sc, 0);
 		carp_addroute(sc);
 		break;
 	case INIT:
 	case MASTER:
 #ifdef INVARIANTS
 		/*
 		 * The state name is picked by truthiness of sc_state, which
 		 * presumably relies on INIT being 0 -- confirm against the
 		 * state definitions.
 		 */
 		panic("carp: VHID %u@%s: master_down event in %s state\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname,
 		    sc->sc_state ? "MASTER" : "INIT");
 #endif
 		break;
 	}
 }
 
 /*
  * (Re)arm the timers that match the current state of a vhid.  The softc
  * lock must be held.
  *
  * Nothing happens unless the parent interface is IFF_UP, its link state is
  * LINK_STATE_UP and at least one address is attached to the vhid.
  *
  *   INIT   - move to BACKUP and run again for the new state.
  *   BACKUP - stop the advertisement timer and arm master-down timer(s) for
  *            3 * advbase seconds plus advskew/256 of a second.  af selects
  *            which one: AF_INET or AF_INET6 re-arm only that family's
  *            timer; any other value (callers in this file pass 0) re-arms
  *            the timer of every family that has addresses attached.
  *   MASTER - arm the advertisement timer for advbase seconds plus
  *            advskew/256 of a second.
  */
 static void
 carp_setrun(struct carp_softc *sc, sa_family_t af)
 {
 	struct timeval tv;
 
 	CARP_LOCK_ASSERT(sc);
 
 	if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
 	    sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0))
 		return;
 
 	switch (sc->sc_state) {
 	case INIT:
 		CARP_LOG("VHID %u@%s: INIT -> BACKUP\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname);
 		carp_set_state(sc, BACKUP);
 		/* Recurse once to arm the BACKUP timers. */
 		carp_setrun(sc, 0);
 		break;
 	case BACKUP:
 		callout_stop(&sc->sc_ad_tmo);
 		tv.tv_sec = 3 * sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 		default:
 #ifdef INET
 			if (sc->sc_naddrs)
 				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 #ifdef INET6
 			if (sc->sc_naddrs6)
 				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 			break;
 		}
 		break;
 	case MASTER:
 		tv.tv_sec = sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 		    carp_send_ad, sc);
 		break;
 	}
 }
 
 /*
  * Set up the per-interface multicast structures for one address family.
  * Must be called with the CIF lock held.
  *
  * A non-NULL membership array in the interface's multicast options marks
  * the family as already set up, in which case 0 is returned right away.
  * Otherwise the array is allocated with M_NOWAIT and the CARP group is
  * joined (plus a second group for INET6).  On every failure after the
  * allocation the array is freed AND the pointer is reset to NULL, so that a
  * later call does not mistake the stale pointer for a finished setup.
  *
  * Returns 0 on success, ENOMEM if the array cannot be allocated, or the
  * error reported by in_joingroup(), in6_setscope() or in6_mc_join().
  * Families that are not handled here yield 0.
  */
 static int
 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 	int error = 0;
 
 	CIF_LOCK_ASSERT(cif);
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 	    {
 		struct ip_moptions *imo = &cif->cif_imo;
 		struct in_addr addr;
 
 		if (imo->imo_membership)
 			return (0);
 
 		imo->imo_membership = (struct in_multi **)malloc(
 		    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
 		    M_NOWAIT);
 		if (imo->imo_membership == NULL)
 			return (ENOMEM);
 		imo->imo_mfilters = NULL;
 		imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
 		imo->imo_multicast_vif = -1;
 
 		addr.s_addr = htonl(INADDR_CARP_GROUP);
 		if ((error = in_joingroup(ifp, &addr, NULL,
 		    &imo->imo_membership[0])) != 0) {
 			free(imo->imo_membership, M_CARP);
 			imo->imo_membership = NULL;
 			break;
 		}
 		imo->imo_num_memberships++;
 		imo->imo_multicast_ifp = ifp;
 		imo->imo_multicast_ttl = CARP_DFLTTL;
 		imo->imo_multicast_loop = 0;
 		break;
 	    }
 #endif
 #ifdef INET6
 	case AF_INET6:
 	    {
 		struct ip6_moptions *im6o = &cif->cif_im6o;
 		struct in6_addr in6;
 		struct in6_multi *in6m;
 
 		if (im6o->im6o_membership)
 			return (0);
 
 		im6o->im6o_membership = (struct in6_multi **)malloc(
 		    (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
 		    M_ZERO | M_NOWAIT);
 		if (im6o->im6o_membership == NULL)
 			return (ENOMEM);
 		im6o->im6o_mfilters = NULL;
 		im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
 		im6o->im6o_multicast_hlim = CARP_DFLTTL;
 		im6o->im6o_multicast_ifp = ifp;
 
 		/* Join IPv6 CARP multicast group. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr8[15] = 0x12;
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
 			free(im6o->im6o_membership, M_CARP);
 			im6o->im6o_membership = NULL;
 			break;
 		}
 		in6m = NULL;
 		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
 			free(im6o->im6o_membership, M_CARP);
 			im6o->im6o_membership = NULL;
 			break;
 		}
 		im6o->im6o_membership[0] = in6m;
 		im6o->im6o_num_memberships++;
 
 		/* Join solicited multicast address. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr32[1] = 0;
 		in6.s6_addr32[2] = htonl(1);
 		in6.s6_addr32[3] = 0;
 		in6.s6_addr8[12] = 0xff;
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
 			/* Undo the first join before giving up. */
 			in6_mc_leave(im6o->im6o_membership[0], NULL);
 			free(im6o->im6o_membership, M_CARP);
 			im6o->im6o_membership = NULL;
 			break;
 		}
 		in6m = NULL;
 		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
 			/* Undo the first join before giving up. */
 			in6_mc_leave(im6o->im6o_membership[0], NULL);
 			free(im6o->im6o_membership, M_CARP);
 			im6o->im6o_membership = NULL;
 			break;
 		}
 		im6o->im6o_membership[1] = in6m;
 		im6o->im6o_num_memberships++;
 		break;
 	    }
 #endif
 	}
 
 	return (error);
 }
 
 /*
  * Free multicast structures.
  *
  * For the given address family, once the interface-wide address count for
  * that family has dropped to zero: leave the group(s) joined by
  * carp_multicast_setup(), free the membership array and reset its pointer
  * to NULL so that a later setup starts from scratch.  With addresses of
  * that family still attached this is a no-op.  Must be called with the CIF
  * lock held.
  */
 static void
 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
 {
 
 	CIF_LOCK_ASSERT(cif);
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 		if (cif->cif_naddrs == 0) {
 			struct ip_moptions *imo = &cif->cif_imo;
 
 			in_leavegroup(imo->imo_membership[0], NULL);
 			KASSERT(imo->imo_mfilters == NULL,
 			    ("%s: imo_mfilters != NULL", __func__));
 			free(imo->imo_membership, M_CARP);
 			imo->imo_membership = NULL;
 
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (cif->cif_naddrs6 == 0) {
 			struct ip6_moptions *im6o = &cif->cif_im6o;
 
 			/* Slot 0 and slot 1 hold the two groups joined at setup. */
 			in6_mc_leave(im6o->im6o_membership[0], NULL);
 			in6_mc_leave(im6o->im6o_membership[1], NULL);
 			KASSERT(im6o->im6o_mfilters == NULL,
 			    ("%s: im6o_mfilters != NULL", __func__));
 			free(im6o->im6o_membership, M_CARP);
 			im6o->im6o_membership = NULL;
 		}
 		break;
 #endif
 	}
 }
 
 int
 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
 {
 	struct m_tag *mtag;
 	struct carp_softc *sc;
 
 	if (!sa)
 		return (0);
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		break;
 #endif
 	default:
 		return (0);
 	}
 
 	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
 	if (mtag == NULL)
 		return (0);
 
 	bcopy(mtag + 1, &sc, sizeof(sc));
 
 	/* Set the source MAC address to the Virtual Router MAC Address. */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_BRIDGE:
 	case IFT_L2VLAN: {
 			struct ether_header *eh;
 
 			eh = mtod(m, struct ether_header *);
 			eh->ether_shost[0] = 0;
 			eh->ether_shost[1] = 0;
 			eh->ether_shost[2] = 0x5e;
 			eh->ether_shost[3] = 0;
 			eh->ether_shost[4] = 1;
 			eh->ether_shost[5] = sc->sc_vhid;
 		}
 		break;
 	case IFT_FDDI: {
 			struct fddi_header *fh;
 
 			fh = mtod(m, struct fddi_header *);
 			fh->fddi_shost[0] = 0;
 			fh->fddi_shost[1] = 0;
 			fh->fddi_shost[2] = 0x5e;
 			fh->fddi_shost[3] = 0;
 			fh->fddi_shost[4] = 1;
 			fh->fddi_shost[5] = sc->sc_vhid;
 		}
 		break;
 	case IFT_ISO88025: {
  			struct iso88025_header *th;
  			th = mtod(m, struct iso88025_header *);
 			th->iso88025_shost[0] = 3;
 			th->iso88025_shost[1] = 0;
 			th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
 			th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
 			th->iso88025_shost[4] = 0;
 			th->iso88025_shost[5] = 0;
 		}
 		break;
 	default:
 		printf("%s: carp is not supported for the %d interface type\n",
 		    ifp->if_xname, ifp->if_type);
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 /*
  * Allocate and initialize a new softc on ifp, creating the interface's
  * carp_if first if it does not have one yet.  The softc starts in INIT
  * state with room for a single address, gets its lock and callouts set up,
  * and is linked onto both the interface's list and the global carp_list.
  * The allocations use M_WAITOK.
  */
 static struct carp_softc*
 carp_alloc(struct ifnet *ifp)
 {
 	struct carp_softc *sc;
 	struct carp_if *cif;
 
 	cif = ifp->if_carp;
 	if (cif == NULL)
 		cif = carp_alloc_if(ifp);
 
 	sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
 
 	sc->sc_carpdev = ifp;
 	sc->sc_state = INIT;
 	sc->sc_vhid = -1;	/* required setting */
 	sc->sc_advbase = CARP_DFLTINTV;
 	sc->sc_init_counter = 1;
 
 	/* Start with a one-slot address array; carp_grow_ifas() doubles it. */
 	sc->sc_ifasiz = sizeof(struct ifaddr *);
 	sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
 
 	CARP_LOCK_INIT(sc);
 #ifdef INET
 	callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 #ifdef INET6
 	callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 	callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 
 	CIF_LOCK(cif);
 	TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
 	CIF_UNLOCK(cif);
 
 	mtx_lock(&carp_mtx);
 	LIST_INSERT_HEAD(&carp_list, sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	return (sc);
 }
 
 /*
  * Double the size of the softc's address array, keeping its contents and
  * zero-filling the new half.  Called with the softc lock held, hence the
  * M_NOWAIT allocation.  Returns 0 on success or ENOMEM, in which case the
  * old array is left in place.
  */
 static int
 carp_grow_ifas(struct carp_softc *sc)
 {
 	struct ifaddr **bigger;
 
 	CARP_LOCK_ASSERT(sc);
 
 	bigger = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO);
 	if (bigger == NULL)
 		return (ENOMEM);
 
 	bcopy(sc->sc_ifas, bigger, sc->sc_ifasiz);
 	free(sc->sc_ifas, M_CARP);
 
 	sc->sc_ifasiz *= 2;
 	sc->sc_ifas = bigger;
 
 	return (0);
 }
 
 /*
  * Unlink a softc from its interface and from the global list, stop its
  * timers and free it.  Called with the CIF lock held and the softc lock
  * NOT held.
  *
  * The callouts were initialized with the softc mutex, so they must be
  * drained without that mutex held: callout_drain() waits for a handler
  * that is in progress, and such a handler may itself be waiting for the
  * mutex.  The softc lock is therefore taken only around the demotion
  * adjustment and released before draining.
  *
  * NOTE(review): the caller's CIF lock is still held across callout_drain();
  * confirm whether that lock may be held while draining.
  */
 static void
 carp_destroy(struct carp_softc *sc)
 {
 	struct ifnet *ifp = sc->sc_carpdev;
 	struct carp_if *cif = ifp->if_carp;
 
 	CIF_LOCK_ASSERT(cif);
 
 	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
 
 	mtx_lock(&carp_mtx);
 	LIST_REMOVE(sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	CARP_LOCK(sc);
 	/* Give back the demotion this vhid contributed while suppressed. */
 	if (sc->sc_suppress)
 		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
 	CARP_UNLOCK(sc);
 
 	callout_drain(&sc->sc_ad_tmo);
 #ifdef INET
 	callout_drain(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 	callout_drain(&sc->sc_md6_tmo);
 #endif
 	CARP_LOCK_DESTROY(sc);
 
 	free(sc->sc_ifas, M_CARP);
 	free(sc, M_CARP);
 }
 
 static struct carp_if*
 carp_alloc_if(struct ifnet *ifp)
 {
 	struct carp_if *cif;
 	int error;
 
 	cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
 
 	if ((error = ifpromisc(ifp, 1)) != 0)
 		printf("%s: ifpromisc(%s) failed: %d\n",
 		    __func__, ifp->if_xname, error);
 	else
 		cif->cif_flags |= CIF_PROMISC;
 
 	CIF_LOCK_INIT(cif);
 	cif->cif_ifp = ifp;
 	TAILQ_INIT(&cif->cif_vrs);
 
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = cif;
 	if_ref(ifp);
 	IF_ADDR_WUNLOCK(ifp);
 
 	return (cif);
 }
 
 /*
  * Tear down the per-interface carp state.  Called with the CIF lock held
  * and with no softc left on the interface (asserted).  The lock is
  * destroyed here, so the caller must not unlock it afterwards.
  */
 static void
 carp_free_if(struct carp_if *cif)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 
 	CIF_LOCK_ASSERT(cif);
 	KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
 	    __func__));
 
 	/* Unpublish first so that nobody finds the dying structure. */
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = NULL;
 	IF_ADDR_WUNLOCK(ifp);
 
 	CIF_LOCK_DESTROY(cif);
 
 	/* Undo promiscuous mode only if carp_alloc_if() managed to set it. */
 	if (cif->cif_flags & CIF_PROMISC)
 		ifpromisc(ifp, 0);
 	/* Drop the interface reference taken in carp_alloc_if(). */
 	if_rele(ifp);
 
 	free(cif, M_CARP);
 }
 
 /*
  * Copy the user-visible settings of a softc into a carpreq, under the
  * softc lock.  The key is copied out only when priv is non-zero; otherwise
  * the key field of the request is zeroed.
  */
 static void
 carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
 {
 
 	CARP_LOCK(sc);
 	carpr->carpr_vhid = sc->sc_vhid;
 	carpr->carpr_state = sc->sc_state;
 	carpr->carpr_advbase = sc->sc_advbase;
 	carpr->carpr_advskew = sc->sc_advskew;
 	if (!priv)
 		bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
 	else
 		bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
 	CARP_UNLOCK(sc);
 }
 
 /*
  * Handle the CARP ioctls on the interface named in ifr.
  *
  * SIOCSVH creates the vhid given in the request if it does not exist yet
  * and applies advbase, advskew, key and (unless the vhid is still in INIT)
  * a requested state change.  It requires PRIV_NETINET_CARP.
  *
  * SIOCGVH copies out either one vhid (carpr_vhid != 0) or all vhids of the
  * interface (carpr_vhid == 0), the latter as an array of carpreq entries
  * in which carpr_count holds the number of vhids.  Keys are copied out
  * only to callers holding PRIV_NETINET_CARP.
  *
  * Returns 0 or an errno value: the copyin/copyout error, ENXIO for an
  * unknown interface, EOPNOTSUPP for an unsupported interface type,
  * EADDRNOTAVAIL for a non-multicast interface, EINVAL for bad parameters
  * or an unknown command, ENOENT when nothing matches and EMSGSIZE when the
  * caller's buffer is too small.
  */
 int
 carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
 {
 	struct carpreq carpr;
 	struct ifnet *ifp;
 	struct carp_softc *sc = NULL;
 	int error = 0, locked = 0;
 
 	if ((error = copyin(ifr->ifr_data, &carpr, sizeof(carpr))))
 		return (error);
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL)
 		return (ENXIO);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_FDDI:
 	case IFT_ISO88025:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto out;
 	}
 
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		error = EADDRNOTAVAIL;
 		goto out;
 	}
 
 	switch (cmd) {
 	case SIOCSVH:
 		if ((error = priv_check(td, PRIV_NETINET_CARP)))
 			break;
 		if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
 		    carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
 			error = EINVAL;
 			break;
 		}
 
 		/* Look for an existing vhid; sc ends up NULL if none matches. */
 		if (ifp->if_carp) {
 			CIF_LOCK(ifp->if_carp);
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 			CIF_UNLOCK(ifp->if_carp);
 		}
 		if (sc == NULL) {
 			sc = carp_alloc(ifp);
 			CARP_LOCK(sc);
 			sc->sc_vhid = carpr.carpr_vhid;
 			/* Virtual MAC address: 00:00:5e:00:01:<vhid>. */
 			LLADDR(&sc->sc_addr)[0] = 0;
 			LLADDR(&sc->sc_addr)[1] = 0;
 			LLADDR(&sc->sc_addr)[2] = 0x5e;
 			LLADDR(&sc->sc_addr)[3] = 0;
 			LLADDR(&sc->sc_addr)[4] = 1;
 			LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
 		} else
 			CARP_LOCK(sc);
 		/* From here on the softc lock is dropped at "out". */
 		locked = 1;
 		if (carpr.carpr_advbase > 0) {
 			if (carpr.carpr_advbase > 255 ||
 			    carpr.carpr_advbase < CARP_DFLTINTV) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advbase = carpr.carpr_advbase;
 		}
 		if (carpr.carpr_advskew > 0) {
 			if (carpr.carpr_advskew >= 255) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advskew = carpr.carpr_advskew;
 		}
 		if (carpr.carpr_key[0] != '\0') {
 			bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
 			carp_hmac_prepare(sc);
 		}
 		if (sc->sc_state != INIT &&
 		    carpr.carpr_state != sc->sc_state) {
 			switch (carpr.carpr_state) {
 			case BACKUP:
 				callout_stop(&sc->sc_ad_tmo);
 				carp_set_state(sc, BACKUP);
 				carp_setrun(sc, 0);
 				carp_delroute(sc);
 				break;
 			case MASTER:
 				carp_master_down_locked(sc);
 				break;
 			default:
 				break;
 			}
 		}
 		break;
 
 	case SIOCGVH:
 	    {
 		int privileged;
 
 		if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
 			error = EINVAL;
 			break;
 		}
 		if (carpr.carpr_count < 1) {
 			error = EMSGSIZE;
 			break;
 		}
 		if (ifp->if_carp == NULL) {
 			error = ENOENT;
 			break;
 		}
 
 		privileged = (priv_check(td, PRIV_NETINET_CARP) == 0);
 		if (carpr.carpr_vhid != 0) {
 			CIF_LOCK(ifp->if_carp);
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 			CIF_UNLOCK(ifp->if_carp);
 			if (sc == NULL) {
 				error = ENOENT;
 				break;
 			}
 			carp_carprcp(&carpr, sc, privileged);
 			error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
 		} else  {
 			int i, count;
 
 			count = 0;
 			CIF_LOCK(ifp->if_carp);
 			IFNET_FOREACH_CARP(ifp, sc)
 				count++;
 
 			if (count > carpr.carpr_count) {
 				CIF_UNLOCK(ifp->if_carp);
 				error = EMSGSIZE;
 				break;
 			}
 
 			/*
 			 * NOTE(review): copyout() runs with the CIF lock
 			 * held here; confirm that this lock may be held
 			 * across a user-space copy.
 			 */
 			i = 0;
 			IFNET_FOREACH_CARP(ifp, sc) {
 				carp_carprcp(&carpr, sc, privileged);
 				carpr.carpr_count = count;
 				error = copyout(&carpr, ifr->ifr_data +
 				    (i * sizeof(carpr)), sizeof(carpr));
 				/*
 				 * Just leave the loop on error: the single
 				 * unlock below covers both exits.  Unlocking
 				 * here as well would release the lock twice.
 				 */
 				if (error)
 					break;
 				i++;
 			}
 			CIF_UNLOCK(ifp->if_carp);
 		}
 		break;
 	    }
 	default:
 		error = EINVAL;
 	}
 
 out:
 	if (locked)
 		CARP_UNLOCK(sc);
 	if_rele(ifp);
 
 	return (error);
 }
 
 /*
  * Return the vhid an address is attached to, or 0 when ifa is NULL or the
  * address is not attached to any vhid.
  */
 static int
 carp_get_vhid(struct ifaddr *ifa)
 {
 
 	if (ifa != NULL && ifa->ifa_carp != NULL)
 		return (ifa->ifa_carp->sc_vhid);
 
 	return (0);
 }
 
 /*
  * Attach an interface address to the vhid with the given number.
  *
  * Returns ENOPROTOOPT if the interface has no carp state, EPROTOTYPE for
  * an address family other than the compiled-in INET/INET6, ENOENT if the
  * vhid does not exist on the interface, or the error from
  * carp_multicast_setup() / carp_grow_ifas().  Returns 0 on success, and
  * also when the address is already attached to that very vhid.  An address
  * attached to a different vhid is detached from it first.
  *
  * On success a reference is taken on ifa, it is stored in the softc's
  * address array, and the HMAC and the state of the vhid are recomputed.
  */
 int
 carp_attach(struct ifaddr *ifa, int vhid)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc;
 	int index, error;
 
 	if (ifp->if_carp == NULL)
 		return (ENOPROTOOPT);
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 #endif
 #ifdef INET6
 	case AF_INET6:
 #endif
 		break;
 	default:
 		return (EPROTOTYPE);
 	}
 
 	CIF_LOCK(cif);
 	/* sc ends up NULL when no vhid on the interface matches. */
 	IFNET_FOREACH_CARP(ifp, sc)
 		if (sc->sc_vhid == vhid)
 			break;
 	if (sc == NULL) {
 		CIF_UNLOCK(cif);
 		return (ENOENT);
 	}
 
 	if (ifa->ifa_carp) {
 		if (ifa->ifa_carp->sc_vhid != vhid)
 			carp_detach_locked(ifa);
 		else {
 			CIF_UNLOCK(cif);
 			return (0);
 		}
 	}
 
 	error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
 	if (error) {
 		/*
 		 * CIF_FREE() is used instead of CIF_UNLOCK() on the error
 		 * paths; presumably it also disposes of a cif that has no
 		 * vhids left -- confirm against the macro definition.
 		 */
 		CIF_FREE(cif);
 		return (error);
 	}
 
 	CARP_LOCK(sc);
 	/* index is the 1-based slot the new address will occupy. */
 	index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
 	if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
 		if ((error = carp_grow_ifas(sc)) != 0) {
 			carp_multicast_cleanup(cif,
 			    ifa->ifa_addr->sa_family);
 			CARP_UNLOCK(sc);
 			CIF_FREE(cif);
 			return (error);
 		}
 
 	/* Account for the address both per interface and per vhid. */
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs++;
 		sc->sc_naddrs++;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6++;
 		sc->sc_naddrs6++;
 		break;
 #endif
 	}
 
 	ifa_ref(ifa);
 	sc->sc_ifas[index - 1] = ifa;
 	ifa->ifa_carp = sc;
 
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 
 	CARP_UNLOCK(sc);
 	CIF_UNLOCK(cif);
 
 	return (0);
 }
 
 /*
  * Detach an address from its vhid: take the CIF lock of the address's
  * interface, do the work in carp_detach_locked() and finish with
  * CIF_FREE().
  */
 void
 carp_detach(struct ifaddr *ifa)
 {
 	struct carp_if *cif;
 
 	cif = ifa->ifa_ifp->if_carp;
 
 	CIF_LOCK(cif);
 	carp_detach_locked(ifa);
 	CIF_FREE(cif);
 }
 
 /*
  * Detach an address from the vhid it is attached to.  Called with the CIF
  * lock held; the softc lock is taken and released here.
  *
  * The address is removed from the softc's array (later entries move down
  * one slot), the address counters are decremented, its routes are removed,
  * multicast state is cleaned up if this was the last address of its
  * family on the interface, and the reference held on ifa is dropped.  The
  * vhid's HMAC and state are then recomputed.  If no address is left on the
  * vhid, the vhid itself is destroyed.
  */
 static void
 carp_detach_locked(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc = ifa->ifa_carp;
 	int i, index;
 
 	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
 
 	CIF_LOCK_ASSERT(cif);
 	CARP_LOCK(sc);
 
 	/* Shift array. */
 	index = sc->sc_naddrs + sc->sc_naddrs6;
 	for (i = 0; i < index; i++)
 		if (sc->sc_ifas[i] == ifa)
 			break;
 	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
 	for (; i < index - 1; i++)
 		sc->sc_ifas[i] = sc->sc_ifas[i+1];
 	sc->sc_ifas[index - 1] = NULL;
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs--;
 		sc->sc_naddrs--;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6--;
 		sc->sc_naddrs6--;
 		break;
 #endif
 	}
 
 	carp_ifa_delroute(ifa);
 	/* Only acts once the per-interface count of this family is zero. */
 	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
 
 	ifa->ifa_carp = NULL;
 	ifa_free(ifa);
 
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 
 	/* The softc lock is released in both cases before going on. */
 	if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) {
 		CARP_UNLOCK(sc);
 		carp_destroy(sc);
 	} else
 		CARP_UNLOCK(sc);
 }
 
 /*
  * Change the state of a vhid.  When the state actually changes, a devctl
  * notification is sent for system "CARP" with subsystem "<vhid>@<ifname>"
  * and the new state's name as the type.  Setting the current state again
  * does nothing.  The softc lock must be held.
  */
 static void
 carp_set_state(struct carp_softc *sc, int state)
 {
 	const char *carp_states[] = { CARP_STATES };
 	char subsys[IFNAMSIZ+5];
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_state == state)
 		return;
 
 	sc->sc_state = state;
 
 	snprintf(subsys, sizeof(subsys), "%u@%s", sc->sc_vhid,
 	    sc->sc_carpdev->if_xname);
 	devctl_notify("CARP", subsys, carp_states[state], NULL);
 }
 
 /*
  * Re-evaluate the state of every vhid on ifp, each under its own softc
  * lock, with the CIF lock held for the whole walk.
  */
 static void
 carp_linkstate(struct ifnet *ifp)
 {
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc;
 
 	CIF_LOCK(cif);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		CARP_LOCK(sc);
 		carp_sc_state(sc);
 		CARP_UNLOCK(sc);
 	}
 	CIF_UNLOCK(cif);
 }
 
 /*
  * Bring a vhid in line with the state of its parent interface.  The softc
  * lock must be held.
  *
  * The vhid is always put back to INIT and carp_setrun() is called.  When
  * the interface is down (link not up or IFF_UP clear) all timers are
  * stopped first and the vhid becomes suppressed; otherwise suppression is
  * lifted.  The demotion counter is adjusted only when the suppression flag
  * actually flips.
  */
 static void
 carp_sc_state(struct carp_softc *sc)
 {
 	int down;
 
 	CARP_LOCK_ASSERT(sc);
 
 	down = (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    !(sc->sc_carpdev->if_flags & IFF_UP));
 
 	if (down) {
 		callout_stop(&sc->sc_ad_tmo);
 #ifdef INET
 		callout_stop(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 		callout_stop(&sc->sc_md6_tmo);
 #endif
 	}
 
 	carp_set_state(sc, INIT);
 	carp_setrun(sc, 0);
 
 	if (down) {
 		if (!sc->sc_suppress)
 			carp_demote_adj(V_carp_ifdown_adj, "interface down");
 		sc->sc_suppress = 1;
 	} else {
 		if (sc->sc_suppress)
 			carp_demote_adj(-V_carp_ifdown_adj, "interface up");
 		sc->sc_suppress = 0;
 	}
 }
 
 /*
  * Adjust the demotion counter by adj (which may be negative), log the
  * adjustment together with the resulting value and the given reason, and
  * enqueue carp_sendall_task on taskqueue_swi.
  */
 static void
 carp_demote_adj(int adj, char *reason)
 {
 	atomic_add_int(&V_carp_demotion, adj);
 	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
 	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
 }
 
 static int
 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 
 	new = V_carp_demotion;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	carp_demote_adj(new, "sysctl");
 
 	return (0);
 }
 
 #ifdef INET
 extern  struct domain inetdomain;
 /*
  * Protocol switch entry for IPPROTO_CARP in the INET domain: a raw socket
  * type whose input goes to carp_input() and whose output, control and user
  * requests reuse the rip_* handlers.  Registered in carp_mod_load().
  */
 static struct protosw in_carp_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_CARP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		carp_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 #endif
 
 #ifdef INET6
 extern	struct domain inet6domain;
 /*
  * Protocol switch entry for IPPROTO_CARP in the INET6 domain: a raw socket
  * type whose input goes to carp6_input() and whose output, control and
  * user requests reuse the rip6_* handlers.  Registered in carp_mod_load().
  */
 static struct protosw in6_carp_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_CARP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		carp6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_usrreqs =		&rip6_usrreqs
 };
 #endif
 
 /*
  * Undo carp_mod_load(): unregister the protocol for each family whose
  * registration succeeded (proto_reg[] == 0), clear every carp_*_p hook,
  * drain the send-all task and destroy carp_mtx.
  *
  * Expects carp_mtx to be held on entry: it is unlocked here before the
  * task is drained and the mutex destroyed.
  */
 static void
 carp_mod_cleanup(void)
 {
 
 #ifdef INET
 	if (proto_reg[CARP_INET] == 0) {
 		(void)ipproto_unregister(IPPROTO_CARP);
 		pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
 		proto_reg[CARP_INET] = -1;
 	}
 	carp_iamatch_p = NULL;
 #endif
 #ifdef INET6
 	if (proto_reg[CARP_INET6] == 0) {
 		(void)ip6proto_unregister(IPPROTO_CARP);
 		pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
 		proto_reg[CARP_INET6] = -1;
 	}
 	carp_iamatch6_p = NULL;
 	carp_macmatch6_p = NULL;
 #endif
 	carp_ioctl_p = NULL;
 	carp_attach_p = NULL;
 	carp_detach_p = NULL;
 	carp_get_vhid_p = NULL;
 	carp_linkstate_p = NULL;
 	carp_forus_p = NULL;
 	carp_output_p = NULL;
 	carp_demote_adj_p = NULL;
 	carp_master_p = NULL;
 	mtx_unlock(&carp_mtx);
 	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
 	mtx_destroy(&carp_mtx);
 }
 
 /*
  * Module load: initialize the global mutex and list, install the carp_*_p
  * hooks and register IPPROTO_CARP for each compiled-in address family.
  *
  * Returns 0 on success.  If any registration fails, the error is logged,
  * everything done so far is undone through carp_mod_cleanup() and the
  * error is returned.  carp_mod_cleanup() unlocks carp_mtx before
  * destroying it, so the failure paths acquire the mutex first; calling it
  * with the mutex unowned would unlock a mutex nobody holds.
  */
 static int
 carp_mod_load(void)
 {
 	int err;
 
 	mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
 	LIST_INIT(&carp_list);
 	carp_get_vhid_p = carp_get_vhid;
 	carp_forus_p = carp_forus;
 	carp_output_p = carp_output;
 	carp_linkstate_p = carp_linkstate;
 	carp_ioctl_p = carp_ioctl;
 	carp_attach_p = carp_attach;
 	carp_detach_p = carp_detach;
 	carp_demote_adj_p = carp_demote_adj;
 	carp_master_p = carp_master;
 #ifdef INET6
 	carp_iamatch6_p = carp_iamatch6;
 	carp_macmatch6_p = carp_macmatch6;
 	proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
 	    (struct protosw *)&in6_carp_protosw);
 	if (proto_reg[CARP_INET6]) {
 		printf("carp: error %d attaching to PF_INET6\n",
 		    proto_reg[CARP_INET6]);
 		mtx_lock(&carp_mtx);
 		carp_mod_cleanup();
 		/* Cleanup only resets entries that were 0, so this is intact. */
 		return (proto_reg[CARP_INET6]);
 	}
 	err = ip6proto_register(IPPROTO_CARP);
 	if (err) {
 		printf("carp: error %d registering with INET6\n", err);
 		mtx_lock(&carp_mtx);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 #ifdef INET
 	carp_iamatch_p = carp_iamatch;
 	proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
 	if (proto_reg[CARP_INET]) {
 		printf("carp: error %d attaching to PF_INET\n",
 		    proto_reg[CARP_INET]);
 		mtx_lock(&carp_mtx);
 		carp_mod_cleanup();
 		/* Cleanup only resets entries that were 0, so this is intact. */
 		return (proto_reg[CARP_INET]);
 	}
 	err = ipproto_register(IPPROTO_CARP);
 	if (err) {
 		printf("carp: error %d registering with INET\n", err);
 		mtx_lock(&carp_mtx);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Module event handler.  MOD_LOAD runs carp_mod_load().  MOD_UNLOAD is
  * refused with EBUSY while any vhid is still on carp_list; otherwise the
  * module is cleaned up with carp_mtx held on entry to carp_mod_cleanup().
  * Every other event yields EINVAL.
  */
 static int
 carp_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		error = carp_mod_load();
 		break;
 	case MOD_UNLOAD:
 		mtx_lock(&carp_mtx);
 		if (!LIST_EMPTY(&carp_list)) {
 			mtx_unlock(&carp_mtx);
 			error = EBUSY;
 		} else
 			carp_mod_cleanup();
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 /* Module description: name, event handler and an unused third field. */
 static moduledata_t carp_mod = {
 	"carp",
 	carp_modevent,
 	0
 };
 
 /* Declare the module at SI_SUB_PROTO_DOMAIN with SI_ORDER_ANY. */
 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
Index: head/sys/netinet/sctp_os_bsd.h
===================================================================
--- head/sys/netinet/sctp_os_bsd.h	(revision 276691)
+++ head/sys/netinet/sctp_os_bsd.h	(revision 276692)
@@ -1,504 +1,500 @@
 /*-
  * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *   this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *   the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_OS_BSD_H_
 #define _NETINET_SCTP_OS_BSD_H_
 /*
  * includes
  */
 #include "opt_ipsec.h"
 #include "opt_compat.h"
 #include "opt_inet6.h"
 #include "opt_inet.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/jail.h>
 #include <sys/sysctl.h>
 #include <sys/resourcevar.h>
 #include <sys/uio.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/kthread.h>
 #include <sys/priv.h>
 #include <sys/random.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <machine/cpu.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif				/* IPSEC */
 
 #ifdef INET6
 #include <sys/domain.h>
 #ifdef IPSEC
 #include <netipsec/ipsec6.h>
 #endif
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 #endif				/* INET6 */
 
 
 #include <netinet/ip_options.h>
 
 #include <crypto/sha1.h>
 #include <crypto/sha2/sha2.h>
 
 #ifndef in6pcb
 #define in6pcb		inpcb
 #endif
 /* Declare all the malloc names for all the various mallocs */
 MALLOC_DECLARE(SCTP_M_MAP);
 MALLOC_DECLARE(SCTP_M_STRMI);
 MALLOC_DECLARE(SCTP_M_STRMO);
 MALLOC_DECLARE(SCTP_M_ASC_ADDR);
 MALLOC_DECLARE(SCTP_M_ASC_IT);
 MALLOC_DECLARE(SCTP_M_AUTH_CL);
 MALLOC_DECLARE(SCTP_M_AUTH_KY);
 MALLOC_DECLARE(SCTP_M_AUTH_HL);
 MALLOC_DECLARE(SCTP_M_AUTH_IF);
 MALLOC_DECLARE(SCTP_M_STRESET);
 MALLOC_DECLARE(SCTP_M_CMSG);
 MALLOC_DECLARE(SCTP_M_COPYAL);
 MALLOC_DECLARE(SCTP_M_VRF);
 MALLOC_DECLARE(SCTP_M_IFA);
 MALLOC_DECLARE(SCTP_M_IFN);
 MALLOC_DECLARE(SCTP_M_TIMW);
 MALLOC_DECLARE(SCTP_M_MVRF);
 MALLOC_DECLARE(SCTP_M_ITER);
 MALLOC_DECLARE(SCTP_M_SOCKOPT);
 MALLOC_DECLARE(SCTP_M_MCORE);
 
 #if defined(SCTP_LOCAL_TRACE_BUF)
 
 #define SCTP_GET_CYCLECOUNT get_cyclecount()
 #define SCTP_CTR6 sctp_log_trace
 
 #else
 #define SCTP_CTR6 CTR6
 #endif
 
 /*
  * Macros to expand out globals defined by various modules
  * to either a real global or a virtualized instance of one,
  * depending on whether VIMAGE is defined.
  */
 /* then define the macro(s) that hook into the vimage macros */
 #define MODULE_GLOBAL(__SYMBOL) V_##__SYMBOL
 
 #define V_system_base_info VNET(system_base_info)
 #define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m
 #define SCTP_BASE_STATS V_system_base_info.sctpstat
 #define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m
 #define SCTP_BASE_SYSCTL(__m) V_system_base_info.sctpsysctl.__m
 #define SCTP_BASE_VAR(__m) V_system_base_info.__m
 
 #define SCTP_PRINTF(params...)	printf(params)
 #if defined(SCTP_DEBUG)
 /*
  * Debug output macros, active when SCTP_DEBUG is defined.
  *
  * SCTPDBG(level, fmt, ...) calls SCTP_PRINTF() with the remaining arguments
  * when any bit of "level" is set in the sctp_debug_on sysctl value;
  * SCTPDBG_ADDR(level, addr) calls sctp_print_address(addr) under the same
  * condition.
  *
  * "level" is parenthesized so that a compound mask such as (A | B) is
  * tested as a whole; unparenthesized, "x & A | B" parses as "(x & A) | B"
  * and is true whenever B is non-zero.  The outer brace form is kept as-is
  * so existing call sites expand to the same statement shape.
  */
 #define SCTPDBG(level, params...)					\
 {									\
 	do {								\
 		if (SCTP_BASE_SYSCTL(sctp_debug_on) & (level)) {	\
 			SCTP_PRINTF(params);				\
 		}							\
 	} while (0);							\
 }
 #define SCTPDBG_ADDR(level, addr)					\
 {									\
 	do {								\
 		if (SCTP_BASE_SYSCTL(sctp_debug_on) & (level)) {	\
 			sctp_print_address(addr);			\
 		}							\
 	} while (0);							\
 }
 #else
 #define SCTPDBG(level, params...)
 #define SCTPDBG_ADDR(level, addr)
 #endif
 
 #ifdef SCTP_LTRACE_CHUNKS
 #define SCTP_LTRACE_CHK(a, b, c, d) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_CHUNK_ENABLE) SCTP_CTR6(KTR_SUBSYS, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_CHUNK_PROC, 0, a, b, c, d)
 #else
 #define SCTP_LTRACE_CHK(a, b, c, d)
 #endif
 
 #ifdef SCTP_LTRACE_ERRORS
 #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) \
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
 		SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
 		            m, inp, stcb, net, file, __LINE__, err);
 #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) \
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
 		SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
 		            inp, stcb, net, file, __LINE__, err);
 #else
 #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err)
 #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err)
 #endif
 
 
 /*
  * Local address and interface list handling
  */
 #define SCTP_MAX_VRF_ID		0
 #define SCTP_SIZE_OF_VRF_HASH	3
 #define SCTP_IFNAMSIZ		IFNAMSIZ
 #define SCTP_DEFAULT_VRFID	0
 #define SCTP_VRF_ADDR_HASH_SIZE	16
 #define SCTP_VRF_IFN_HASH_SIZE	3
 #define	SCTP_INIT_VRF_TABLEID(vrf)
 
 #define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP)
 #define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifa && (ro)->ro_rt->rt_ifa->ifa_ifp && (ro)->ro_rt->rt_ifa->ifa_ifp->if_type == IFT_LOOP)
 
 /*
  * Access to IFN's to help with src-addr-selection
  */
 /* This could return VOID if the index works but for BSD we provide both. */
 #define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_rt->rt_ifp
 #define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_rt->rt_ifp->if_index
 #define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifp)
 
 /*
  * general memory allocation
  */
 #define SCTP_MALLOC(var, type, size, name) \
 	do { \
 		var = (type)malloc(size, name, M_NOWAIT); \
 	} while (0)
 
 #define SCTP_FREE(var, type)	free(var, type)
 
 #define SCTP_MALLOC_SONAME(var, type, size) \
 	do { \
 		var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \
 	} while (0)
 
 #define SCTP_FREE_SONAME(var)	free(var, M_SONAME)
 
 #define SCTP_PROCESS_STRUCT struct proc *
 
 /*
  * zone allocation functions
  */
 #include <vm/uma.h>
 
 /* SCTP_ZONE_INIT: initialize the zone */
 typedef struct uma_zone *sctp_zone_t;
 
 #define SCTP_ZONE_INIT(zone, name, size, number) { \
 	zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,\
 		0); \
 	uma_zone_set_max(zone, number); \
 }
 
 #define SCTP_ZONE_DESTROY(zone) uma_zdestroy(zone)
 
 /* SCTP_ZONE_GET: allocate element from the zone */
 #define SCTP_ZONE_GET(zone, type) \
 	(type *)uma_zalloc(zone, M_NOWAIT);
 
 /* SCTP_ZONE_FREE: free element from the zone */
 #define SCTP_ZONE_FREE(zone, element) \
 	uma_zfree(zone, element);
 
 #define SCTP_HASH_INIT(size, hashmark) hashinit_flags(size, M_PCB, hashmark, HASH_NOWAIT)
 #define SCTP_HASH_FREE(table, hashmark) hashdestroy(table, M_PCB, hashmark)
 
 #define SCTP_M_COPYM	m_copym
 
 /*
  * timers
  */
 #include <sys/callout.h>
 typedef struct callout sctp_os_timer_t;
 
 
 #define SCTP_OS_TIMER_INIT(tmr)	callout_init(tmr, 1)
 #define SCTP_OS_TIMER_START	callout_reset
 #define SCTP_OS_TIMER_STOP	callout_stop
 #define SCTP_OS_TIMER_STOP_DRAIN callout_drain
 #define SCTP_OS_TIMER_PENDING	callout_pending
 #define SCTP_OS_TIMER_ACTIVE	callout_active
 #define SCTP_OS_TIMER_DEACTIVATE callout_deactivate
 
 #define sctp_get_tick_count() (ticks)
 
 #define SCTP_UNUSED __attribute__((unused))
 
 /*
  * Functions
  */
 /* Mbuf manipulation and access macros  */
 /*
  * Mbuf field accessors.  Each macro argument and each expression-valued
  * expansion is parenthesized so that expression arguments (for example
  * "m + 1" or "a ? b : c") and surrounding operators bind as intended.
  * The field accessors remain usable as lvalues.
  */
 #define SCTP_BUF_LEN(m) ((m)->m_len)
 #define SCTP_BUF_NEXT(m) ((m)->m_next)
 #define SCTP_BUF_NEXT_PKT(m) ((m)->m_nextpkt)
 /* Advance the data pointer by "size". */
 #define SCTP_BUF_RESV_UF(m, size) ((m)->m_data += (size))
 /* Address "size" past the current data pointer. */
 #define SCTP_BUF_AT(m, size) ((m)->m_data + (size))
 #define SCTP_BUF_IS_EXTENDED(m) ((m)->m_flags & M_EXT)
 #define SCTP_BUF_EXTEND_SIZE(m) ((m)->m_ext.ext_size)
 #define SCTP_BUF_TYPE(m) ((m)->m_type)
 #define SCTP_BUF_RECVIF(m) ((m)->m_pkthdr.rcvif)
 #define SCTP_BUF_PREPEND	M_PREPEND
 
-#define SCTP_ALIGN_TO_END(m, len) if(m->m_flags & M_PKTHDR) { \
-                                     MH_ALIGN(m, len); \
-                                  } else if ((m->m_flags & M_EXT) == 0) { \
-                                     M_ALIGN(m, len); \
-                                  }
+#define SCTP_ALIGN_TO_END(m, len) M_ALIGN(m, len)
 
 /* We make it so if you have up to 4 threads
  * writing based on the default size of
  * the packet log 65 k, that would be
  * 4 16k packets before we would hit
  * a problem.
  */
 #define SCTP_PKTLOG_WRITERS_NEED_LOCK 3
 
 /*************************/
 /*      MTU              */
 /*************************/
 /*
  * MTU accessors.  Arguments are parenthesized so that expression arguments
  * (e.g. "&obj" or "rt + 1") expand correctly.
  *  - FROM_IFN_INFO: if_mtu of "ifn" viewed as a struct ifnet pointer.
  *  - FROM_ROUTE:    rt_mtu of "rt" as uint32_t, or 0 when "rt" is NULL.
  *  - FROM_INTFC:    if_mtu behind sctp_ifn->ifn_p, or 0 when that is NULL.
  *  - SET_MTU_OF_ROUTE: store "mtu" into rt->rt_mtu unless "rt" is NULL.
  */
 #define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) (((struct ifnet *)(ifn))->if_mtu)
 #define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((uint32_t)(((rt) != NULL) ? (rt)->rt_mtu : 0))
 #define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) (((sctp_ifn)->ifn_p != NULL) ? ((struct ifnet *)((sctp_ifn)->ifn_p))->if_mtu : 0)
 #define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \
                                               if ((rt) != NULL) \
                                                  (rt)->rt_mtu = (mtu); \
                                            } while(0)
 
 /* (de-)register interface event notifications */
 #define SCTP_REGISTER_INTERFACE(ifhandle, af)
 #define SCTP_DEREGISTER_INTERFACE(ifhandle, af)
 
 
 /*************************/
 /* These are for logging */
 /*************************/
 /* return the base ext data pointer */
 #define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf)
  /* return the refcnt of the data pointer */
 #define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ext_cnt)
 /* return any buffer related flags, this is
  * used beyond logging for apple only.
  */
 #define SCTP_BUF_GET_FLAGS(m) (m->m_flags)
 
 /* For BSD this just accesses the M_PKTHDR length
  * so it operates on an mbuf with hdr flag. Other
  * O/S's may have separate packet header and mbuf
  * chain pointers.. thus the macro.
  */
 #define SCTP_HEADER_TO_CHAIN(m) (m)
 #define SCTP_DETACH_HEADER_FROM_CHAIN(m)
 #define SCTP_HEADER_LEN(m) ((m)->m_pkthdr.len)
 #define SCTP_GET_HEADER_FOR_OUTPUT(o_pak) 0
 #define SCTP_RELEASE_HEADER(m)
 #define SCTP_RELEASE_PKT(m)	sctp_m_freem(m)
 #define SCTP_ENABLE_UDP_CSUM(m) do { \
 					m->m_pkthdr.csum_flags = CSUM_UDP; \
 					m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); \
 				} while (0)
 
 #define SCTP_GET_PKT_VRFID(m, vrf_id)  ((vrf_id = SCTP_DEFAULT_VRFID) != SCTP_DEFAULT_VRFID)
 
 
 
 /* Attach the chain of data into the sendable packet. */
 #define SCTP_ATTACH_CHAIN(pak, m, packet_length) do { \
                                                  pak = m; \
                                                  pak->m_pkthdr.len = packet_length; \
                          } while(0)
 
 /* Other m_pkthdr type things */
 #define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_broadcast(dst, m->m_pkthdr.rcvif) : 0)
 #define SCTP_IS_IT_LOOPBACK(m) ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.rcvif == NULL) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP)))
 
 
 /* This converts any input packet header
  * into the chain of data holders, for BSD
  * it's a NOP.
  */
 
 /* get the v6 hop limit */
 #define SCTP_GET_HLIM(inp, ro)	in6_selecthlim((struct in6pcb *)&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL)));
 
 /* is the endpoint v6only? */
 #define SCTP_IPV6_V6ONLY(inp)	(((struct inpcb *)inp)->inp_flags & IN6P_IPV6_V6ONLY)
 /* is the socket non-blocking? */
 #define SCTP_SO_IS_NBIO(so)	((so)->so_state & SS_NBIO)
 #define SCTP_SET_SO_NBIO(so)	((so)->so_state |= SS_NBIO)
 #define SCTP_CLEAR_SO_NBIO(so)	((so)->so_state &= ~SS_NBIO)
 /* get the socket type */
 #define SCTP_SO_TYPE(so)	((so)->so_type)
 /* Use a macro for renaming sb_cc to sb_ccc */
 #define sb_cc sb_ccc
 /* reserve sb space for a socket */
 #define SCTP_SORESERVE(so, send, recv)	soreserve(so, send, recv)
 /* wakeup a socket */
 #define SCTP_SOWAKEUP(so)	wakeup(&(so)->so_timeo)
 /* clear the socket buffer state */
 /*
  * Zero sb_cc and sb_mbcnt and set sb_mb to NULL in the given sockbuf.
  * The three stores are joined with the comma operator so the macro is a
  * single statement: under an unbraced "if" all three are guarded, not
  * just the first.  The trailing semicolon is kept so call sites expand as
  * before.
  */
 #define SCTP_SB_CLEAR(sb)	\
 	((sb).sb_cc = 0,	\
 	 (sb).sb_mb = NULL,	\
 	 (sb).sb_mbcnt = 0);
 
 #define SCTP_SB_LIMIT_RCV(so) so->so_rcv.sb_hiwat
 #define SCTP_SB_LIMIT_SND(so) so->so_snd.sb_hiwat
 
 /*
  * routes, output, etc.
  */
 typedef struct route sctp_route_t;
 typedef struct rtentry sctp_rtentry_t;
 
 /*
  * XXX multi-FIB support was backed out in r179783 and it seems clear that the
  * VRF support as currently in FreeBSD is not ready to support multi-FIB.
  * It might be best to implement multi-FIB support for both v4 and v6 independently
  * of VRFs and leave those to a real MPLS stack.
  */
 #define SCTP_RTALLOC(ro, vrf_id) rtalloc_ign((struct route *)ro, 0UL)
 
 /* Future zero copy wakeup/send  function */
 #define SCTP_ZERO_COPY_EVENT(inp, so)
 /* This is re-pulse ourselves for sendbuf */
 #define SCTP_ZERO_COPY_SENDQ_EVENT(inp, so)
 
 /*
  * SCTP protocol specific mbuf flags.
  */
 #define	M_NOTIFICATION		M_PROTO1	/* SCTP notification */
 
 /*
  * IP output routines
  */
 #define SCTP_IP_OUTPUT(result, o_pak, ro, stcb, vrf_id) \
 { \
 	int o_flgs = IP_RAWOUTPUT; \
 	struct sctp_tcb *local_stcb = stcb; \
 	if (local_stcb && \
 	    local_stcb->sctp_ep && \
 	    local_stcb->sctp_ep->sctp_socket) \
 		o_flgs |= local_stcb->sctp_ep->sctp_socket->so_options & SO_DONTROUTE; \
 	m_clrprotoflags(o_pak); \
 	result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \
 }
 
 #define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, stcb, vrf_id) \
 { \
 	struct sctp_tcb *local_stcb = stcb; \
 	m_clrprotoflags(o_pak); \
 	if (local_stcb && local_stcb->sctp_ep) \
 		result = ip6_output(o_pak, \
 				    ((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \
 				    (ro), 0, 0, ifp, NULL); \
 	else \
 		result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \
 }
 
 struct mbuf *
 sctp_get_mbuf_for_msg(unsigned int space_needed,
     int want_header, int how, int allonebuf, int type);
 
 
 /*
  * SCTP AUTH
  */
 #define SCTP_READ_RANDOM(buf, len)	read_random(buf, len)
 
 /* map standard crypto API names */
 #define SCTP_SHA1_CTX		SHA1_CTX
 #define SCTP_SHA1_INIT		SHA1Init
 #define SCTP_SHA1_UPDATE	SHA1Update
 #define SCTP_SHA1_FINAL(x,y)	SHA1Final((caddr_t)x, y)
 
 #define SCTP_SHA256_CTX		SHA256_CTX
 #define SCTP_SHA256_INIT	SHA256_Init
 #define SCTP_SHA256_UPDATE	SHA256_Update
 #define SCTP_SHA256_FINAL(x,y)	SHA256_Final((caddr_t)x, y)
 
 #endif
 
 /*
  * Atomic counter helpers built on atomic_fetchadd_int().
  *
  * SCTP_DECREMENT_AND_CHECK_REFCOUNT(addr): subtract one and evaluate to
  * true when the value before the subtraction was 1.
  *
  * SCTP_SAVE_ATOMIC_DECREMENT(addr, val): subtract "val"; if the value before
  * the subtraction was smaller than "val", panic under INVARIANTS, otherwise
  * store 0 into *addr.
  *
  * "addr" and "val" are parenthesized so that an expression argument such as
  * "a - b" is negated and compared as a whole ("-a - b" is not "-(a - b)").
  */
 #define SCTP_DECREMENT_AND_CHECK_REFCOUNT(addr) (atomic_fetchadd_int((addr), -1) == 1)
 #if defined(INVARIANTS)
 #define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \
 { \
 	int32_t oldval; \
 	oldval = atomic_fetchadd_int((addr), -(val)); \
 	if (oldval < (val)) { \
 		panic("Counter goes negative"); \
 	} \
 }
 #else
 #define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \
 { \
 	int32_t oldval; \
 	oldval = atomic_fetchadd_int((addr), -(val)); \
 	if (oldval < (val)) { \
 		*(addr) = 0; \
 	} \
 }
 #endif
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c	(revision 276691)
+++ head/sys/netinet/tcp_output.c	(revision 276692)
@@ -1,1684 +1,1684 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, path_mtu_discovery) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(path_mtu_discovery), 1,
 	"Enable Path MTU Discovery");
 
 VNET_DEFINE(int, tcp_do_tso) = 1;
 #define	V_tcp_do_tso		VNET(tcp_do_tso)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_tso), 0,
 	"Enable TCP Segmentation Offload");
 
 VNET_DEFINE(int, tcp_sendspace) = 1024*32;
 #define	V_tcp_sendspace	VNET(tcp_sendspace)
 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
 #define	V_tcp_do_autosndbuf	VNET(tcp_do_autosndbuf)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_autosndbuf), 0,
 	"Enable automatic send buffer sizing");
 
 VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
 #define	V_tcp_autosndbuf_inc	VNET(tcp_autosndbuf_inc)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_inc), 0,
 	"Incrementor step size of automatic send buffer");
 
 VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
 #define	V_tcp_autosndbuf_max	VNET(tcp_autosndbuf_max)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_max), 0,
 	"Max size of automatic send buffer");
 
 static void inline	hhook_run_tcp_est_out(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to,
 			    long len, int tso);
 static void inline	cc_after_idle(struct tcpcb *tp);
 
 /*
  * Wrapper for the TCP established output helper hook.
  */
 static void inline
 hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
     struct tcpopt *to, long len, int tso)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 		hhook_data.len = len;
 		hhook_data.tso = tso;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
 		    tp->osd);
 	}
 }
 
 /*
  * CC wrapper hook functions
  *
  * cc_after_idle: invoke the after_idle callback of the connection's
  * congestion control algorithm (CC_ALGO(tp)) with tp->ccv, if the algorithm
  * provides one.  Asserts that the inpcb write lock is held.
  */
 static void inline
 cc_after_idle(struct tcpcb *tp)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* The callback is optional; nothing to do when it is absent. */
 	if (CC_ALGO(tp)->after_idle == NULL)
 		return;
 
 	CC_ALGO(tp)->after_idle(tp->ccv);
 }
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
 int
 tcp_output(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 	long len, recwin, sendwin;
 	int off, flags, error = 0;	/* Keep compiler happy */
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #ifdef IPSEC
 	unsigned ipsec_optlen = 0;
 #endif
 	int idle, sendalot;
 	int sack_rxmit, sack_bytes_rxmt;
 	struct sackhole *p;
 	int tso, mtu;
 	struct tcpopt to;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
 		cc_after_idle(tp);
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_LT(tp->snd_nxt, tp->snd_max))
 		tcp_sack_adjust(tp);
 	sendalot = 0;
 	tso = 0;
 	mtu = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
 	 * to send out new data (when sendalot is 1), bypass this function.
 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 	 * we're replacing a (future) new transmission with a retransmission
 	 * now, and we previously incremented snd_cwnd in tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	sack_rxmit = 0;
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
 	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		long cwin;
 		
 		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 		if (cwin < 0)
 			cwin = 0;
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
 			 * (At least) part of sack hole extends beyond
 			 * snd_recover. Check to see if we can rexmit data
 			 * for this hole.
 			 */
 			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 				/*
 				 * Can't rexmit any more data for this hole.
 				 * That data will be rexmitted in the next
 				 * sack recovery episode, when snd_recover
 				 * moves past p->rxmit.
 				 */
 				p = NULL;
 				goto after_sack_rexmit;
 			} else
 				/* Can rexmit part of the current hole */
 				len = ((long)ulmin(cwin,
 						   tp->snd_recover - p->rxmit));
 		} else
 			len = ((long)ulmin(cwin, p->end - p->rxmit));
 		off = p->rxmit - tp->snd_una;
 		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 		    __func__, off));
 		if (len > 0) {
 			sack_rxmit = 1;
 			sendalot = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 		}
 	}
 after_sack_rexmit:
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then
 			 * clear the FIN bit.  Usually this would
 			 * happen below when it realizes that we
 			 * aren't sending all the data.  However,
 			 * if we have exactly 1 byte of unsent data,
 			 * then it won't clear the FIN bit below,
 			 * and if we are in persist state, we wind
 			 * up sending the packet without recording
 			 * that we sent the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit,
 			 * because if we don't have any more data
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
 			if (off < sbused(&so->so_snd))
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			tcp_timer_activate(tp, TT_PERSIST, 0);
 			tp->t_rxtshift = 0;
 		}
 	}
 
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 	 * a negative length.  This can also occur when TCP opens up
 	 * its congestion window while receiving additional duplicate
 	 * acks after fast-retransmit because TCP will reset snd_nxt
 	 * to snd_max after the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
 			len = ((long)ulmin(sbavail(&so->so_snd), sendwin) -
 			    off);
 		else {
 			long cwin;
 
                         /*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
 			len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) -
 			    off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it 
 			 * isn't really necessary), to work around a gcc 
 			 * optimization issue - to force gcc to compute
 			 * len above. Without this check, the computation
 			 * of len is bungled by the optimizer.
 			 */
 			if (len > 0) {
 				cwin = tp->snd_cwnd - 
 					(tp->snd_nxt - tp->sack_newdata) -
 					sack_bytes_rxmt;
 				if (cwin < 0)
 					cwin = 0;
 				len = lmin(len, cwin);
 			}
 		}
 	}
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
 	 * is SYN-SENT state and if segment contains data and if we don't
 	 * know that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		off--, len++;
 	}
 
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 
 	if (len < 0) {
 		/*
 		 * If FIN has been sent but not acked,
 		 * but we haven't been called to retransmit,
 		 * len will be < 0.  Otherwise, window shrank
 		 * after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back
 		 * to (closed) window, and set the persist timer
 		 * if it isn't already going.  If the window didn't
 		 * close completely, just wait for an ACK.
 		 */
 		len = 0;
 		if (sendwin == 0) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tp->t_rxtshift = 0;
 			tp->snd_nxt = tp->snd_una;
 			if (!tcp_timer_active(tp, TT_PERSIST))
 				tcp_setpersist(tp);
 		}
 	}
 
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and congestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * The remote host receive window scaling factor may limit the
 	 * growing of the send buffer before it reaches its allowed
 	 * maximum.
 	 *
 	 * It scales directly with slow start or congestion window
 	 * and does at most one step per received ACK.  This fast
 	 * scaling has the drawback of growing the send buffer beyond
 	 * what is strictly necessary to make full use of a given
 	 * delay*bandwidth product.  However testing has shown this not
 	 * to be much of a problem.  At worst we are trading wasting
 	 * of available bandwidth (the non-use of it) for wasting some
 	 * socket buffer memory.
 	 *
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
 	 *
 	 * XXXGL: should there be used sbused() or sbavail()?
 	 */
 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) &&
 		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
 		    sendwin >= (sbused(&so->so_snd) -
 		    (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
 			     V_tcp_autosndbuf_max), so, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
 	 * IP options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per generated
 	 * segment or packet.
 	 */
 #ifdef IPSEC
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 #ifdef IPSEC
 	    ipsec_optlen == 0 &&
 #endif
 	    tp->t_inpcb->inp_options == NULL &&
 	    tp->t_inpcb->in6p_options == NULL)
 		tso = 1;
 
 	if (sack_rxmit) {
 		if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
 			flags &= ~TH_FIN;
 	} else {
 		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
 		    sbused(&so->so_snd)))
 			flags &= ~TH_FIN;
 	}
 
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 *	- We have a full segment (or more with TSO)
 	 *	- This is the last buffer in a write()/send() and we are
 	 *	  either idle or running NODELAY
 	 *	- we've timed out (e.g. persist timer)
 	 *	- we have more than 1/2 the maximum send window's worth of
 	 *	  data (receiver may be limiting the window size)
 	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause
 		 * us to flush a buffer queued with moretocome.  XXX
 		 *
 		 * note: the len + off check is almost certainly unnecessary.
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    len + off >= sbavail(&so->so_snd) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 		if (sack_rxmit)
 			goto send;
 	}
 
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read
 	 * from the receive buffer, no matter how small, causes a window
 	 * update to be sent.  We also should avoid sending a flurry of
 	 * window updates when the socket buffer had queued a lot of data
 	 * and the application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending
 	 * an update when we can increase the advertized window by more
 	 * than 1/4th of the socket buffer capacity.  When the buffer is
 	 * getting full or is very small be more aggressive and send an
 	 * update whenever we can increase by two mss sized segments.
 	 * In all other situations the ACK's to new incoming data will
 	 * carry further window increases.
 	 *
 	 * Don't send an independent window update if a delayed
 	 * ACK is pending (it will get piggy-backed on it) or the
 	 * remote side already has done a half-close and won't send
 	 * more data.  Skip this if the connection is in T/TCP
 	 * half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window,
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
 		long adv;
 		int oldwin;
 
 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/* 
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (long)(2 * tp->t_maxseg) &&
 		    (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
 		     recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
 		     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW)
 		goto send;
 	if ((flags & TH_RST) ||
 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 		goto send;
 	if (SEQ_GT(tp->snd_up, tp->snd_una))
 		goto send;
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 	/*
 	 * In SACK, it is possible for tcp_output to fail to send a segment
 	 * after the retransmission timer has been turned off.  Make sure
 	 * that the retransmission timer is set.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		goto just_return;
 	} 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
 	 * updates.  The three ``states'' for the output side are:
 	 *	idle			not doing retransmits or persists
 	 *	persisting		to move a small or zero window
 	 *	(re)transmitting	and thereby not persisting
 	 *
 	 * tcp_timer_active(tp, TT_PERSIST)
 	 *	is true when we are in persist state.
 	 * (tp->t_flags & TF_FORCEDATA)
 	 *	is set when we are called to send a persist packet.
 	 * tcp_timer_active(tp, TT_REXMT)
 	 *	is set when we are retransmitting
 	 * The output side is idle when both timers are zero.
 	 *
 	 * If send window is too small, there is data to transmit, and no
 	 * retransmit or persist is pending, then go to persist state.
 	 * If nothing happens soon, send when timer expires:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
 	if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
 	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	if (len > 0) {
 		if (len >= tp->t_maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options
 	 * unless TCP set not to do any options.
 	 * NOTE: we assume that the IP/TCP header plus TCP options
 	 * always fit in a single mbuf, leaving room for a maximum
 	 * link header, i.e.
 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof (struct tcpiphdr);
 
 	/*
 	 * Compute options for segment.
 	 * We only have to care about SYN and established connection
 	 * segments.  Options for SYN-ACK segments are handled in TCP
 	 * syncache.
 	 */
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		to.to_flags = 0;
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
 			to.to_flags |= TOF_MSS;
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			/* Set receive buffer autosizing timestamp. */
 			if (tp->rfbuf_ts == 0 &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 				tp->rfbuf_ts = tcp_ts_getticks();
 		}
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
 			    tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #ifdef TCP_SIGNATURE
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 	}
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #ifdef IPSEC
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxopd length.
 	 * Clear the FIN bit because we cut off the tail of
 	 * the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxopd) {
 		flags &= ~TH_FIN;
 
 		if (tso) {
 			u_int if_hw_tsomax;
 			u_int if_hw_tsomaxsegcount;
 			u_int if_hw_tsomaxsegsize;
 			struct mbuf *mb;
 			u_int moff;
 			int max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 
 			/*
 			 * Limit a TSO burst to prevent it from
 			 * overflowing or exceeding the maximum length
 			 * allowed by the network interface:
 			 */
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 
 			/*
 			 * Check if we should limit by maximum segment
 			 * size and count:
 			 */
 			if (if_hw_tsomaxsegcount != 0 &&
 			    if_hw_tsomaxsegsize != 0) {
 				max_len = 0;
 				mb = sbsndmbuf(&so->so_snd, off, &moff);
 
 				while (mb != NULL && max_len < len) {
 					u_int mlen;
 					u_int frags;
 
 					/*
 					 * Get length of mbuf fragment
 					 * and how many hardware frags,
 					 * rounded up, it would use:
 					 */
 					mlen = (mb->m_len - moff);
 					frags = howmany(mlen,
 					    if_hw_tsomaxsegsize);
 
 					/* Handle special case: Zero Length Mbuf */
 					if (frags == 0)
 						frags = 1;
 
 					/*
 					 * Check if the fragment limit
 					 * will be reached or exceeded:
 					 */
 					if (frags >= if_hw_tsomaxsegcount) {
 						max_len += min(mlen,
 						    if_hw_tsomaxsegcount *
 						    if_hw_tsomaxsegsize);
 						break;
 					}
 					max_len += mlen;
 					if_hw_tsomaxsegcount -= frags;
 					moff = 0;
 					mb = mb->m_next;
 				}
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 
 			/*
 			 * Prevent the last segment from being
 			 * fractional unless the send sockbuf can be
 			 * emptied:
 			 */
 			max_len = (tp->t_maxopd - optlen);
 			if ((off + len) < sbavail(&so->so_snd)) {
 				moff = len % max_len;
 				if (moff != 0) {
 					len -= moff;
 					sendalot = 1;
 				}
 			}
 
 			/*
 			 * In case there are too many small fragments
 			 * don't use TSO:
 			 */
 			if (len <= max_len) {
 				len = max_len;
 				sendalot = 1;
 				tso = 0;
 			}
 
 			/*
 			 * Send the FIN in a separate segment
 			 * after the bulk sending is done.
 			 * We don't trust the TSO implementations
 			 * to clear the FIN flag on all but the
 			 * last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 
 		} else {
 			len = tp->t_maxopd - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	} else
 		tso = 0;
 
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 
 /*#ifdef DIAGNOSTIC*/
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 /*#endif*/
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Grab a header mbuf, attaching a copy of data to
 	 * be transmitted, and initialize the header from
 	 * the template for sends on this connection.
 	 */
 	if (len) {
 		struct mbuf *mb;
 		u_int moff;
 
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
 			TCPSTAT_INC(tcps_sndprobe);
 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			tp->t_sndrexmitpack++;
 			TCPSTAT_INC(tcps_sndrexmitpack);
 			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf
 		 * to the offset in the socket buffer chain.
 		 */
 		mb = sbsndptr(&so->so_snd, off, len, &moff);
 
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t) + hdrlen);
 			m->m_len += len;
 		} else {
 			m->m_next = m_copy(mb, moff, (int)len);
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
 		if (off + len == sbused(&so->so_snd))
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
-			MH_ALIGN(m, hdrlen);
+			M_ALIGN(m, hdrlen);
 		} else
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(tp->t_inpcb, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip6, th);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		ipov = (struct ipovly *)ip;
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip, th);
 	}
 
 	/*
 	 * Fill in fields, remembering maximum advertised
 	 * window for use in delaying messages about window sizes.
 	 * If resending a FIN, be sure not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup
 	 * SYN packet. If we are on a retransmit, we may
 	 * resend those bits a number of times as per
 	 * RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE|TH_CWR;
 		} else
 			flags |= TH_ECE|TH_CWR;
 	}
 	
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags & TF_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with
 		 * ECN capable transmission (ECT).
 		 * Ignore pure ack packets, retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 		}
 		
 		/*
 		 * Reply with proper ECN notifications.
 		 */
 		if (tp->t_flags & TF_ECN_SND_CWR) {
 			flags |= TH_CWR;
 			tp->t_flags &= ~TF_ECN_SND_CWR;
 		} 
 		if (tp->t_flags & TF_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 	
 	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
 	 * retransmitted packet, we want the sequence number
 	 * of the next unsent octet.  So, if there is no data
 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
 	 * when filling in ti_seq.  But if we are in persist
 	 * state, snd_max might reflect one byte beyond the
 	 * right edge of the window, so use snd_nxt in that
 	 * case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN|TH_FIN)) ||
 		    tcp_timer_active(tp, TT_PERSIST))
 			th->th_seq = htonl(tp->snd_nxt);
 		else
 			th->th_seq = htonl(tp->snd_max);
 	} else {
 		th->th_seq = htonl(p->rxmit);
 		p->rxmit += len;
 		tp->sackhint.sack_bytes_rexmit += len;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window,
 	 * but avoid silly window syndrome.
 	 */
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
 	 * case is handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 				(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
 	 * a 0 window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is attempting
 	 * to read more data than can be buffered prior to transmitting on
 	 * the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull
 		 * the urgent pointer to the left edge of the send window
 		 * so that it doesn't drift into the send window on sequence
 		 * number wraparound.
 		 */
 		tp->snd_up = tp->snd_una;		/* drag it along */
 
 #ifdef TCP_SIGNATURE
 	if (tp->t_flags & TF_SIGNATURE) {
 		int sigoff = to.to_signature - opt;
 		tcp_signature_compute(m, 0, len, optlen,
 		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen does not need to be filled in now; it will be
 		 * filled in ip6_output.
 		 */
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 		    optlen + len, IPPROTO_TCP, 0);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments.
 	 * The TCP pseudo header checksum is always provided.
 	 * XXX: Fixme: This is currently not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxopd - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
 	}
 
 #ifdef IPSEC
 	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
 	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
 #else
 	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
 	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
 #endif
 
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 		ipov->ih_len = save;
 	}
 #endif /* TCPDEBUG */
 
 	/*
 	 * Fill in IP length and desired time to live and
 	 * send to IP level.  There should be a better way
 	 * to handle ttl and tos; we could keep them in
 	 * the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before checksum calculation,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		struct route_in6 ro;
 
 		bzero(&ro, sizeof(ro));
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt.
 		 * Also, desired default hop limit might be changed via
 		 * Neighbor Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace probes.
 		 * ip6_output() will set it properly; it's supposed to include
 		 * the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
 		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
 		    NULL, NULL, tp->t_inpcb);
 
 		if (error == EMSGSIZE && ro.ro_rt != NULL)
 			mtu = ro.ro_rt->rt_mtu;
 		RO_RTFREE(&ro);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
     {
 	struct route ro;
 
 	bzero(&ro, sizeof(ro));
 	ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 	if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
 		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
 #endif /* INET6 */
 	/*
 	 * If we do path MTU discovery, then we set DF on every packet.
 	 * This might not be the best thing to do according to RFC3390
 	 * Section 2. However the tcp hostcache mitigates the problem
 	 * so it affects only the first tcp connection with a host.
 	 *
 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
 	 */
 	if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) {
 		ip->ip_off |= htons(IP_DF);
 		tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 	} else {
 		tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
 
 	if (tp->t_state == TCPS_SYN_SENT)
 		TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 	TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 	error = ip_output(m, tp->t_inpcb->inp_options, &ro,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
 	if (error == EMSGSIZE && ro.ro_rt != NULL)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
     }
 #endif /* INET */
 
 out:
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
 	 */
 	if ((tp->t_flags & TF_FORCEDATA) == 0 || 
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_seq startseq = tp->snd_nxt;
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (flags & (TH_SYN|TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit)
 			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				TCPSTAT_INC(tcps_segstimed);
 			}
 		}
 
 		/*
 		 * Set retransmit timer if not currently set,
 		 * and not doing a pure ack or a keep-alive probe.
 		 * Initial value for retransmit timer is smoothed
 		 * round-trip time + 2 * round-trip time variance.
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
 timer:
 		if (!tcp_timer_active(tp, TT_REXMT) &&
 		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 		     (tp->snd_nxt != tp->snd_una))) {
 			if (tcp_timer_active(tp, TT_PERSIST)) {
 				tcp_timer_activate(tp, TT_PERSIST, 0);
 				tp->t_rxtshift = 0;
 			}
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in
 		 * persist mode (no window) we do not update snd_nxt.
 		 */
 		int xlen = len;
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + len;
 	}
 
 	if (error) {
 
 		/*
 		 * We know that the packet was lost, so back out the
 		 * sequence number advance, if any.
 		 *
 		 * If the error is EPERM the packet got blocked by the
 		 * local firewall.  Normally we should terminate the
 		 * connection but the blocking may have been spurious
 		 * due to a firewall reconfiguration cycle.  So we treat
 		 * it like a packet loss and let the retransmit timer and
 		 * timeouts do their work over time.
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !tcp_timer_active(tp, TT_PERSIST)) &&
 		    ((flags & TH_SYN) == 0) &&
 		    (error != EPERM)) {
 			if (sack_rxmit) {
 				p->rxmit -= len;
 				tp->sackhint.sack_bytes_rexmit -= len;
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 				    ("sackhint bytes rtx >= 0"));
 			} else
 				tp->snd_nxt -= len;
 		}
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 	                if (!tcp_timer_active(tp, TT_REXMT) &&
 			    !tcp_timer_active(tp, TT_PERSIST))
 	                        tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 			tp->snd_cwnd = tp->t_maxseg;
 			return (0);
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU.
 			 * If TSO was active we either got an interface
 			 * without TSO capabilities or TSO was turned off.
 			 * If we obtained mtu from ip_output() then update
 			 * it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			return (error);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 				return (0);
 			}
 			/* FALLTHROUGH */
 		default:
 			return (error);
 		}
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell).
 	 * If this advertises a larger window than any other segment,
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
 	if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	if (tcp_timer_active(tp, TT_DELACK))
 		tcp_timer_activate(tp, TT_DELACK, 0);
 #if 0
 	/*
 	 * This completely breaks TCP if newreno is turned on.  What happens
 	 * is that if delayed-acks are turned on on the receiver, this code
 	 * on the transmitter effectively destroys the TCP window, forcing
 	 * it to four packets (1.5Kx4 = 6K window).
 	 */
 	if (sendalot && --maxburst)
 		goto again;
 #endif
 	if (sendalot)
 		goto again;
 	return (0);
 }
 
 /*
  * Arm (or re-arm) the persist timer for the connection `tp' and advance
  * the backoff shift, saturating at TCP_MAXRXTSHIFT.
  *
  * The timeout is computed from t_srtt and t_rttvar, multiplied by the
  * tcp_backoff[] entry selected by the current t_rxtshift, and passed
  * through TCPT_RANGESET with TCPTV_PERSMIN/TCPTV_PERSMAX as its bounds.
  *
  * Panics if the retransmit timer is still active when called.
  */
 void
 tcp_setpersist(struct tcpcb *tp)
 {
 	int base;
 	int timeout;
 
 	base = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 
 	tp->t_flags &= ~TF_PREVVALID;
 
 	/* Callers must have stopped the retransmit timer beforehand. */
 	if (tcp_timer_active(tp, TT_REXMT))
 		panic("tcp_setpersist: retransmit pending");
 
 	/* Start/restart the persistence timer with the backed-off value. */
 	TCPT_RANGESET(timeout, base * tcp_backoff[tp->t_rxtshift],
 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
 	tcp_timer_activate(tp, TT_PERSIST, timeout);
 
 	/* Use the next backoff step on the following call, up to the cap. */
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 }
 
 /*
  * Insert TCP options according to the supplied parameters to the place
  * optp in a consistent way.  Can handle unaligned destinations.
  *
  * The order of the option processing is crucial for optimal packing and
  * alignment for the scarce option space.
  *
  * The optimal order for a SYN/SYN-ACK segment is:
  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
  *
  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
  * At minimum we need 10 bytes (to generate 1 SACK block).  If both
  * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
  * we only have 10 bytes for SACK options (40 - (12 + 18)).
  *
  * Arguments:
  *   to    Describes the options to emit; only options whose TOF_* bit is
  *         set in to->to_flags are considered.  The structure is modified:
  *         to_mss, to_tsval and to_tsecr are converted to network byte
  *         order in place, and to_signature is set to point at the
  *         zero-filled signature bytes inside the output buffer.
  *   optp  Destination for the encoded options.  At most TCP_MAXOLEN bytes
  *         are written; the buffer size is not checked here, so the caller
  *         presumably has to provide that much room.
  *
  * Returns the number of option bytes written, which is always a multiple
  * of 4 because of the trailing EOL/PAD fill.
  *
  * An option that does not fit into the remaining space is skipped.  In
  * that case any NOP padding already emitted while aligning for it stays
  * in the buffer, and the corresponding bit in to->to_flags is left set.
  * A set flag bit below TOF_MAXOPT that matches no known option panics.
  */
 int
 tcp_addoptions(struct tcpopt *to, u_char *optp)
 {
 	u_int mask, optlen = 0;
 
 	/*
 	 * Walk the flag bits in ascending order, so the emission order is
 	 * determined by the TOF_* bit values and not by the caller.
 	 */
 	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
 		if ((to->to_flags & mask) != mask)
 			continue;
 		/* Option space is completely used up; nothing more fits. */
 		if (optlen == TCP_MAXOLEN)
 			break;
 		switch (to->to_flags & mask) {
 		case TOF_MSS:
 			/* Start the MSS option on a multiple of 4 bytes. */
 			while (optlen % 4) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			/* Not enough room left: skip to the next flag bit. */
 			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
 				continue;
 			optlen += TCPOLEN_MAXSEG;
 			*optp++ = TCPOPT_MAXSEG;
 			*optp++ = TCPOLEN_MAXSEG;
 			/* Byte-swapped in place; bcopy tolerates unaligned optp. */
 			to->to_mss = htons(to->to_mss);
 			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
 			optp += sizeof(to->to_mss);
 			break;
 		case TOF_SCALE:
 			/*
 			 * Pad to an odd offset so that the 3-byte window scale
 			 * option ends on an even one.
 			 */
 			while (!optlen || optlen % 2 != 1) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
 				continue;
 			optlen += TCPOLEN_WINDOW;
 			*optp++ = TCPOPT_WINDOW;
 			*optp++ = TCPOLEN_WINDOW;
 			*optp++ = to->to_wscale;
 			break;
 		case TOF_SACKPERM:
 			/* Start the 2-byte SACK-permitted option on an even offset. */
 			while (optlen % 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
 				continue;
 			optlen += TCPOLEN_SACK_PERMITTED;
 			*optp++ = TCPOPT_SACK_PERMITTED;
 			*optp++ = TCPOLEN_SACK_PERMITTED;
 			break;
 		case TOF_TS:
 			/*
 			 * Pad until optlen % 4 == 2, so that after the 2-byte
 			 * kind/length header the two timestamp values start
 			 * on a multiple of 4 bytes.
 			 */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
 				continue;
 			optlen += TCPOLEN_TIMESTAMP;
 			*optp++ = TCPOPT_TIMESTAMP;
 			*optp++ = TCPOLEN_TIMESTAMP;
 			/* Both values are byte-swapped in place in *to. */
 			to->to_tsval = htonl(to->to_tsval);
 			to->to_tsecr = htonl(to->to_tsecr);
 			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
 			optp += sizeof(to->to_tsval);
 			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
 			optp += sizeof(to->to_tsecr);
 			break;
 		case TOF_SIGNATURE:
 			{
 			/* Payload length: the option minus kind and length bytes. */
 			int siglen = TCPOLEN_SIGNATURE - 2;
 
 			/* Same alignment rule as for the timestamp option. */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
 				continue;
 			optlen += TCPOLEN_SIGNATURE;
 			*optp++ = TCPOPT_SIGNATURE;
 			*optp++ = TCPOLEN_SIGNATURE;
 			/*
 			 * Remember where the signature lives and zero it; the
 			 * actual digest is not computed in this function.
 			 */
 			to->to_signature = optp;
 			while (siglen--)
 				 *optp++ = 0;
 			break;
 			}
 		case TOF_SACK:
 			{
 			int sackblks = 0;
 			struct sackblk *sack = (struct sackblk *)to->to_sacks;
 			tcp_seq sack_seq;
 
 			/*
 			 * Pad until optlen % 4 == 2, so that the SACK blocks
 			 * following the 2-byte header start on a multiple of 4.
 			 */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			/* Need room for the header plus at least one block. */
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
 			/* Emit as many of the requested blocks as still fit. */
 			sackblks = min(to->to_nsacks,
 					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
 			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
 			while (sackblks--) {
 				/* Each block is start then end, network order. */
 				sack_seq = htonl(sack->start);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				sack_seq = htonl(sack->end);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				optlen += TCPOLEN_SACK;
 				sack++;
 			}
 			TCPSTAT_INC(tcps_sack_send_blocks);
 			break;
 			}
 		default:
 			/* A flag bit was set that no case above knows about. */
 			panic("%s: unknown TCP option type", __func__);
 			break;
 		}
 	}
 
 	/* Terminate and pad TCP options to a 4 byte boundary. */
 	if (optlen % 4) {
 		optlen += TCPOLEN_EOL;
 		*optp++ = TCPOPT_EOL;
 	}
 	/*
 	 * According to RFC 793 (STD0007):
 	 *   "The content of the header beyond the End-of-Option option
 	 *    must be header padding (i.e., zero)."
 	 *   and later: "The padding is composed of zeros."
 	 */
 	while (optlen % 4) {
 		optlen += TCPOLEN_PAD;
 		*optp++ = TCPOPT_PAD;
 	}
 
 	KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
 	return (optlen);
 }
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c	(revision 276691)
+++ head/sys/netinet6/ip6_output.c	(revision 276692)
@@ -1,2978 +1,2978 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_sctp.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <machine/in_cksum.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/in_rss.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #include <netipsec/key.h>
 #include <netinet6/ip6_ipsec.h>
 #endif /* IPSEC */
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/scope6_var.h>
 
 #ifdef FLOWTABLE
 #include <net/flowtable.h>
 #endif
 
 extern int in6_mcast_loop;
 
 /*
  * Working set of mbufs used by ip6_output() while it assembles the
  * extension header chain of an outgoing packet.  Unused members are NULL
  * (ip6_output() zeroes the whole structure before filling it in).
  */
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;		/* IPv6 header, set by ip6_splithdr() */
 	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
 	struct mbuf *ip6e_dest1;	/* Destination options header (1st part) */
 	struct mbuf *ip6e_rthdr;	/* Routing header */
 	struct mbuf *ip6e_dest2;	/* Destination options header (2nd part) */
 };
 
 /*
  * Forward declarations of the file-local (static) helpers.
  */
 /* Socket/packet option handling. */
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *);
 static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int);
 
 /* Header construction, fragmentation and path MTU helpers. */
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **);
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, struct route_in6 *,
 	struct ifnet *, struct in6_addr *, u_long *, int *, u_int);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 
 /*
  * Make an extension header from option data.  hp is the source, and
  * mp is the destination.
  *
  * When hp is non-NULL, the header length is taken from its ip6e_len field
  * ((ip6e_len + 1) << 3 bytes) and that many bytes are copied into a newly
  * allocated mbuf by ip6_copyexthdr(); the mbuf is stored through mp.
  * When hp is NULL the macro does nothing.
  *
  * NOTE: this macro is not self-contained.  The expansion site must provide
  * an lvalue named "error" (assigned the result of ip6_copyexthdr()) and a
  * label named "freehdrs", which is jumped to when the copy fails.
  */
 #define MAKE_EXTHDR(hp, mp)						\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers.
  * m is the extension header mbuf
  * mp is the previous mbuf in the chain
  * p is the next header
  * i is the type of option.
  *
  * When m is non-NULL:
  *  - the byte currently at *p (the previous "next header" value) is copied
  *    into the first byte of m's data, and *p is overwritten with i;
  *  - p is re-pointed at the first byte of m's data, so that the next
  *    expansion patches this header's "next header" field;
  *  - m is linked into the chain right after mp, and mp is advanced to m.
  * When m is NULL the macro does nothing.
  *
  * NOTE: mp and p are assigned to, so both must be modifiable lvalues.  The
  * expansion site must also have a variable named "hdrsplit" in scope; the
  * macro panics if it is zero while m is non-NULL.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("assumption failed: hdr not split"); \
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Compute a delayed (software) checksum for an outgoing IPv6 packet and
  * store it into the packet.
  *
  * m      - mbuf chain holding the packet; m->m_pkthdr.csum_data is added to
  *          "offset" to locate the 16-bit checksum field.
  * plen   - length handed to in_cksum_skip() as (offset + plen).
  * offset - number of leading bytes in_cksum_skip() is told to skip.
  *
  * For packets flagged CSUM_UDP_IPV6 a computed checksum of 0 is stored as
  * 0xffff instead.
  *
  * If the checksum field is not entirely contained in the first mbuf, the
  * value is written with m_copyback(), which walks the chain to the right
  * mbuf.  Previously this case only logged a message and returned, so the
  * packet went out without the computed checksum.
  */
 void
 in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
 {
 	u_short csum;
 
 	csum = in_cksum_skip(m, offset + plen, offset);
 	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
 		csum = 0xffff;
 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
 
 	if (offset + sizeof(csum) > m->m_len) {
 		/*
 		 * The checksum field lives in (or straddles into) a later
 		 * mbuf of the chain; let m_copyback() locate it.
 		 */
 		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
 		return;
 	}
 	/* Fast path: the field is in the first mbuf, store it directly. */
 	*(u_short *)(m->m_data + offset) = csum;
 }
 
 /*
  * IP6 output. The packet in mbuf chain m contains a skeletal IP6
  * header (with pri, len, nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route_in6 ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and
  * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
  * which is rt_mtu.
  *
  * Parameters:
  *  m0    - the packet to send.
  *  opt   - packet options (extension headers, traffic class, hop limit,
  *          minimum-MTU and don't-fragment settings); may be NULL.
  *  ro    - route to use/cache; when NULL a route local to this function is
  *          used and released before returning.
  *  flags - IPV6_UNSPECSRC, IPV6_FORWARDING, IPV6_MINMTU and
  *          IP_NODEFAULTFLOWID are examined here.
  *  im6o  - multicast options (hop limit, loopback); may be NULL.
  *  ifpp  - XXX: just for statistics.  When non-NULL, receives the outgoing
  *          interface.
  *  inp   - owning PCB; when non-NULL it supplies the FIB number and,
  *          unless IP_NODEFAULTFLOWID is set, the flow id/hash type.
  *
  * Returns 0 on success or an errno value.
  */
 /*
  * XXX TODO: no flowid is assigned for outbound flows?
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	struct ip6_hdr *ip6, *mhip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev = NULL;
 	int hlen, tlen, len, off;
 	struct route_in6 ip6route;
 	struct rtentry *rt = NULL;
 	struct sockaddr_in6 *dst, src_sa, dst_sa;
 	struct in6_addr odst;
 	int error = 0;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr finaldst, src0, dst0;
 	u_int32_t zone;
 	struct route_in6 *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	int sw_csum, tso;
 	int needfiblookup;
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6 == NULL) {
 		printf ("ip6 is NULL");
 		goto bad;
 	}
 
 	if (inp != NULL) {
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			/* unconditionally set flowid */
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 	}
 
 	/* Remember the original destination; it is passed to ip6_getpmtu(). */
 	finaldst = ip6->ip6_dst;
 	bzero(&exthdrs, sizeof(exthdrs));
 	/*
 	 * Copy each requested extension header into its own mbuf.
 	 * MAKE_EXTHDR() sets "error" and jumps to "freehdrs" on failure.
 	 */
 	if (opt) {
 		/* Hop-by-Hop options header */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
 		/* Destination options header(1st part) */
 		if (opt->ip6po_rthdr) {
 			/*
 			 * Destination options header(1st part)
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
 		}
 		/* Routing header */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
 		/* Destination options header(2nd part) */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
 	}
 
 #ifdef IPSEC
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 * XXX: need scope argument.
 	 */
 	switch(ip6_ipsec_output(&m, inp, &error))
 	{
 	case 1:                 /* Bad packet */
 		goto freehdrs;
 	case -1:                /* IPSec done */
 		goto done;
 	case 0:                 /* No IPSec */
 	default:
 		break;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	optlen = 0;
 	if (exthdrs.ip6e_hbh)
 		optlen += exthdrs.ip6e_hbh->m_len;
 	if (exthdrs.ip6e_dest1)
 		optlen += exthdrs.ip6e_dest1->m_len;
 	if (exthdrs.ip6e_rthdr)
 		optlen += exthdrs.ip6e_rthdr->m_len;
 	unfragpartlen = optlen + sizeof(struct ip6_hdr);
 
 	/* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */
 	if (exthdrs.ip6e_dest2)
 		optlen += exthdrs.ip6e_dest2->m_len;
 
 	/*
 	 * If there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	if (optlen && !hdrsplit) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			/* m is cleared so that the "bad" path does not free it. */
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		hdrsplit++;
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* adjust mbuf packet header length */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			hdrsplit++;
 		}
 		/* adjust pointer */
 		ip6 = mtod(m, struct ip6_hdr *);
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		/* A zero payload length marks the packet as a jumbogram. */
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 
 	/*
 	 * Concatenate headers and fill in next header fields.
 	 * Here we have, on "m"
 	 *	IPv6 payload
 	 * and we insert headers accordingly.  Finally, we should be getting:
 	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
 	 *
 	 * during the header composing process, "m" points to IPv6 header.
 	 * "mprev" points to an extension header prior to esp.
 	 */
 	u_char *nexthdrp = &ip6->ip6_nxt;
 	mprev = m;
 
 	/*
 	 * we treat dest2 specially.  this makes IPsec processing
 	 * much easier.  the goal here is to make mprev point the
 	 * mbuf prior to dest2.
 	 *
 	 * result: IPv6 dest2 payload
 	 * m and mprev will point to IPv6 header.
 	 */
 	if (exthdrs.ip6e_dest2) {
 		if (!hdrsplit)
 			panic("assumption failed: hdr not split");
 		exthdrs.ip6e_dest2->m_next = m->m_next;
 		m->m_next = exthdrs.ip6e_dest2;
 		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 		ip6->ip6_nxt = IPPROTO_DSTOPTS;
 	}
 
 	/*
 	 * result: IPv6 hbh dest1 rthdr dest2 payload
 	 * m will point to IPv6 header.  mprev will point to the
 	 * extension header prior to dest2 (rthdr in the above case).
 	 */
 	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 		   IPPROTO_DSTOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 		   IPPROTO_ROUTING);
 
 	/*
 	 * If there is a routing header, discard the packet.
 	 * (At this point the header mbufs are already linked into the chain
 	 * rooted at m, so freeing m via "bad" releases them as well.)
 	 */
 	if (exthdrs.ip6e_rthdr) {
 		 error = EINVAL;
 		 goto bad;
 	}
 
 	/* Source address validation */
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_localout);
 
 	/*
 	 * Route packet.
 	 * Without a caller-supplied route, fall back to the on-stack
 	 * ip6route; it is released at "done".
 	 */
 	if (ro == 0) {
 		ro = &ip6route;
 		bzero((caddr_t)ro, sizeof(*ro));
 	}
 	/* ro_pmtu keeps the route used for path MTU; ro may be switched below. */
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	dst = (struct sockaddr_in6 *)&ro->ro_dst;
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL)
 		(void )flowtable_lookup(AF_INET6, m, (struct route *)ro);
 #endif
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	/*
 	 * "again" is re-entered from the packet filter section below when
 	 * the destination address, the FIB, or the next hop was changed.
 	 */
 again:
 	/*
 	 * if specified, try to fill in the traffic class field.
 	 * do not override if a non-zero value is already set.
 	 * we check the diffserv field and the ecn field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
 			mask |= 0xfc;
 		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * Use the cached route when there is one and no forward tag was
 	 * found; otherwise select a route.  With a forward tag, dst_sa was
 	 * already filled in from the tag and must not be overwritten.
 	 */
 	if (ro->ro_rt && fwd_tag == NULL) {
 		rt = ro->ro_rt;
 		ifp = ro->ro_rt->rt_ifp;
 	} else {
 		if (fwd_tag == NULL) {
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 		}
 		error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp,
 		    &rt, fibnum);
 		if (error != 0) {
 			/*
 			 * NOTE(review): ifp has no initializer; this test
 			 * presumably relies on in6_selectroute_fib() always
 			 * storing through its ifp argument - confirm.
 			 */
 			if (ifp != NULL)
 				in6_ifstat_inc(ifp, ifs6_out_discard);
 			goto bad;
 		}
 	}
 	if (rt == NULL) {
 		/*
 		 * If in6_selectroute() does not return a route entry,
 		 * dst may not have been updated.
 		 */
 		*dst = dst_sa;	/* XXX */
 	}
 
 	/*
 	 * then rt (for unicast) and ifp must be non-NULL valid values.
 	 */
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 	if (rt != NULL) {
 		ia = (struct in6_ifaddr *)(rt->rt_ifa);
 		counter_u64_add(rt->rt_pksent, 1);
 	}
 
 
 	/*
 	 * The outgoing interface must be in the zone of source and
 	 * destination addresses.
 	 */
 	origifp = ifp;
 
 	/*
 	 * Source: the zone derived for origifp must match the scope id
 	 * recovered from the address itself.
 	 */
 	src0 = ip6->ip6_src;
 	if (in6_setscope(&src0, origifp, &zone))
 		goto badscope;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
 		goto badscope;
 
 	/* Destination: same check as for the source address. */
 	dst0 = ip6->ip6_dst;
 	if (in6_setscope(&dst0, origifp, &zone))
 		goto badscope;
 	/* re-initialize to be sure */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 	if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
 		goto badscope;
 	}
 
 	/* We should use ia_ifp to support the case of
 	 * sending packets to an address of our own.
 	 */
 	if (ia != NULL && ia->ia_ifp)
 		ifp = ia->ia_ifp;
 
 	/* scope check is done. */
 	goto routefound;
 
   badscope:
 	IP6STAT_INC(ip6s_badscope);
 	in6_ifstat_inc(origifp, ifs6_out_discard);
 	if (error == 0)
 		error = EHOSTUNREACH; /* XXX */
 	goto bad;
 
   routefound:
 	/* For unicast, pick the link-level next hop to hand to nd6_output(). */
 	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_rt) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((rt->rt_flags & RTF_GATEWAY))
 			dst = (struct sockaddr_in6 *)rt->rt_gateway;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
 	} else {
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			IP6STAT_INC(ip6s_noroute);
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		if ((im6o == NULL && in6_mcast_loop) ||
 		    (im6o && im6o->im6o_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we have not joined
 			 * the address; protocols will filter it later,
 			 * thus deferring a hash lookup and lock acquisition
 			 * at the expense of an m_copym().
 			 */
 			ip6_mloopback(ifp, m, dst);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it may not always be the case.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing interface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
 	    &alwaysfrag, fibnum)) != 0)
 		goto bad;
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		/* NOTE: this local plen shadows the function-level plen. */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not contiguous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       contiguous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
 		goto passout;
 
 	/* Save the destination so a rewrite by a hook can be detected. */
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
 	if (error != 0 || m == NULL)
 		goto done;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	needfiblookup = 0;
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			/*
 			 * The packet never reaches a device, so mark any
 			 * pending checksum as already valid.
 			 */
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else
 			needfiblookup = 1; /* Redo the routing table lookup. */
 	}
 	/* See if fib was changed by packet filter. */
 	if (fibnum != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		fibnum = M_GETFIB(m);
 		RO_RTFREE(ro);
 		needfiblookup = 1;
 	}
 	if (needfiblookup)
 		goto again;
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		/*
 		 * The new next hop is stored right behind the tag header.
 		 * NOTE: fwd_tag stays non-NULL after m_tag_delete(); it is
 		 * only used as a flag once we are back at "again".
 		 */
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
 		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		goto again;
 	}
 
 passout:
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * the logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request
 	 */
 	sw_csum = m->m_pkthdr.csum_flags;
 	if (!hdrsplit) {
 		tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
 		sw_csum &= ~ifp->if_hwassist;
 	} else
 		tso = 0;
 	/*
 	 * If we added extension headers, we will not do TSO and calculate the
 	 * checksums ourselves for now.
 	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
 	 * with ext. hdrs.
 	 */
 	if (sw_csum & CSUM_DELAY_DATA_IPV6) {
 		sw_csum &= ~CSUM_DELAY_DATA_IPV6;
 		in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
 	}
 #ifdef SCTP
 	if (sw_csum & CSUM_SCTP_IPV6) {
 		sw_csum &= ~CSUM_SCTP_IPV6;
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 	}
 #endif
 	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
 	tlen = m->m_pkthdr.len;
 
 	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* case 4 */
 		/* conflicting request - can't transmit */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* case 2-b */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data as
 		 * well as returning an error code (the latter is not described
 		 * in the API spec.)
 		 */
 		u_int32_t mtu32;
 		struct ip6ctlparam ip6cp;
 
 		mtu32 = (u_int32_t)mtu;
 		bzero(&ip6cp, sizeof(ip6cp));
 		ip6cp.ip6c_cmdarg = (void *)&mtu32;
 		pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst,
 		    (void *)&ip6cp);
 
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/*
 	 * transmit packet without fragmentation
 	 */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 			ifa_free(&ia6->ia_ifa);
 		}
 		error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
 		goto done;
 	}
 
 	/*
 	 * try to fragment the packet.  case 1-b and 3
 	 */
 	if (mtu < IPV6_MMTU) {
 		/* path MTU cannot be less than IPV6_MMTU */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* jumbo payload cannot be fragmented */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		struct mbuf **mnext, *m_frgpart;
 		struct ip6_frag *ip6f;
 		u_int32_t id = htonl(ip6_randomid());
 		u_char nextproto;
 
 		/* Free slots remaining in the interface send queue. */
 		int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		hlen = unfragpartlen;
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		/* Payload bytes per fragment, rounded down to a multiple of 8. */
 		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * Verify that we have any chance at all of being able to queue
 		 *      the packet or packet fragments
 		 */
 		if (qslots <= 0 || ((u_int)qslots * (mtu - hlen)
 		    < tlen  /* - hlen */)) {
 			error = ENOBUFS;
 			IP6STAT_INC(ip6s_odropped);
 			goto bad;
 		}
 
 
 		/*
 		 * If the interface will not calculate checksums on
 		 * fragmented packets, then do it here.
 		 * XXX-BZ handle the hw offloading case.  Need flags.
 		 */
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			in6_delayed_cksum(m, plen, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 			sctp_delayed_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 		}
 #endif
 		/* Fragments are linked through m_nextpkt, after the original. */
 		mnext = &m->m_nextpkt;
 
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 * From here on m0 is the original packet and m the fragment
 		 * currently being built.
 		 */
 		m0 = m;
 		for (off = hlen; off < tlen; off += len) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (!m) {
 				error = ENOBUFS;
 				IP6STAT_INC(ip6s_odropped);
 				goto sendorfree;
 			}
 			m->m_flags = m0->m_flags & M_COPYFLAGS;
 			*mnext = m;
 			mnext = &m->m_nextpkt;
 			m->m_data += max_linkhdr;
 			mhip6 = mtod(m, struct ip6_hdr *);
 			*mhip6 = *ip6;
 			m->m_len = sizeof(*mhip6);
 			error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 			if (error) {
 				IP6STAT_INC(ip6s_odropped);
 				goto sendorfree;
 			}
 			ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 			/* The last fragment is shortened and gets no "more" flag. */
 			if (off + len >= tlen)
 				len = tlen - off;
 			else
 				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 			mhip6->ip6_plen = htons((u_short)(len + hlen +
 			    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 			if ((m_frgpart = m_copy(m0, off, len)) == 0) {
 				error = ENOBUFS;
 				IP6STAT_INC(ip6s_odropped);
 				goto sendorfree;
 			}
 			m_cat(m, m_frgpart);
 			m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
 			m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum;
 			m->m_pkthdr.rcvif = NULL;
 			ip6f->ip6f_reserved = 0;
 			ip6f->ip6f_ident = id;
 			ip6f->ip6f_nxt = nextproto;
 			IP6STAT_INC(ip6s_ofragments);
 			in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 		}
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/*
 	 * Remove the leading garbage: the original packet (m0) is freed,
 	 * then every fragment queued on its m_nextpkt list is either sent
 	 * or, once an error has occurred, freed.
 	 */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (m0 = m; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IP6STAT_INC(ip6s_fragmented);
 
 	/*
 	 * Common exit.  Only the on-stack route is released here; a route
 	 * supplied by the caller is left untouched.
 	 */
 done:
 	if (ro == &ip6route)
 		RO_RTFREE(ro);
 	if (ro_pmtu == &ip6route)
 		RO_RTFREE(ro_pmtu);
 	return (error);
 
 	/*
 	 * Error exit used while the extension header mbufs are not yet
 	 * linked into the packet chain: free them individually.
 	 */
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *m;
 
 	if (hlen > MCLBYTES)
 		return (ENOBUFS); /* XXX */
 
 	if (hlen > MLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_len = hlen;
 	if (hdr)
 		bcopy(hdr, mtod(m, caddr_t), hlen);
 
 	*mp = m;
 	return (0);
 }
 
 /*
  * Insert jumbo payload option.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t v;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	/*
 	 * If there is no hop-by-hop options header, allocate new one.
 	 * If there is one but it doesn't have enough space to store the
 	 * jumbo payload option, allocate a cluster to store the whole options.
 	 * Otherwise, use it to store the options.
 	 */
 	if (exthdrs->ip6e_hbh == 0) {
 		mopt = m_get(M_NOWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
 			/*
 			 * XXX assumption:
 			 * - exthdrs->ip6e_hbh is not referenced from places
 			 *   other than exthdrs.
 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: give up if the whole (new) hbh header does
 			 * not fit even in an mbuf cluster.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/*
 			 * As a consequence, we must always prepare a cluster
 			 * at this point.
 			 */
 			n = m_getcl(M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				return (ENOBUFS);
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, caddr_t) + oldoptlen;
 			m_freem(mopt);
 			mopt = exthdrs->ip6e_hbh = n;
 		} else {
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		}
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 1;
 
 		/*
 		 * Adjust the header length according to the pad and
 		 * the jumbo payload option.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/* fill in the option. */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
 
 	/* finally, adjust the packet header length */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Insert fragment header and copy unfragmentable header portions.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *n, *mlast;
 
 	if (hlen > sizeof(struct ip6_hdr)) {
 		n = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
 		if (n == 0)
 			return (ENOBUFS);
 		m->m_next = n;
 	} else
 		n = m;
 
 	/* Search for the last mbuf of unfragmentable part. */
 	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
 		;
 
 	if (M_WRITABLE(mlast) &&
 	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
 		/* use the trailing space of the last mbuf for the fragment hdr */
 		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
 		    mlast->m_len);
 		mlast->m_len += sizeof(struct ip6_frag);
 		m->m_pkthdr.len += sizeof(struct ip6_frag);
 	} else {
 		/* allocate a new mbuf for the fragment header */
 		struct mbuf *mfrg;
 
 		mfrg = m_get(M_NOWAIT, MT_DATA);
 		if (mfrg == NULL)
 			return (ENOBUFS);
 		mfrg->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(mfrg, struct ip6_frag *);
 		mlast->m_next = mfrg;
 	}
 
 	return (0);
 }
 
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
     struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp, u_int fibnum)
 {
 	u_int32_t mtu = 0;
 	int alwaysfrag = 0;
 	int error = 0;
 
 	if (ro_pmtu != ro) {
 		/* The first hop and the final destination may differ. */
 		struct sockaddr_in6 *sa6_dst =
 		    (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 		if (ro_pmtu->ro_rt &&
 		    ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
 		     !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
 			RTFREE(ro_pmtu->ro_rt);
 			ro_pmtu->ro_rt = (struct rtentry *)NULL;
 		}
 		if (ro_pmtu->ro_rt == NULL) {
 			bzero(sa6_dst, sizeof(*sa6_dst));
 			sa6_dst->sin6_family = AF_INET6;
 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
 			sa6_dst->sin6_addr = *dst;
 
 			in6_rtalloc(ro_pmtu, fibnum);
 		}
 	}
 	if (ro_pmtu->ro_rt) {
 		u_int32_t ifmtu;
 		struct in_conninfo inc;
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		if (ifp == NULL)
 			ifp = ro_pmtu->ro_rt->rt_ifp;
 		ifmtu = IN6_LINKMTU(ifp);
 		mtu = tcp_hc_getmtu(&inc);
 		if (mtu)
 			mtu = min(mtu, ro_pmtu->ro_rt->rt_mtu);
 		else
 			mtu = ro_pmtu->ro_rt->rt_mtu;
 		if (mtu == 0)
 			mtu = ifmtu;
 		else if (mtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * if we record ICMPv6 too big message with
 			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
 			 * or smaller, with framgent header attached.
 			 * (fragment header is needed regardless from the
 			 * packet size, for translators to identify packets)
 			 */
 			alwaysfrag = 1;
 			mtu = IPV6_MMTU;
 		}
 	} else if (ifp) {
 		mtu = IN6_LINKMTU(ifp);
 	} else
 		error = EHOSTUNREACH; /* XXX */
 
 	*mtup = mtu;
 	if (alwaysfragp)
 		*alwaysfragp = alwaysfrag;
 	return (error);
 }
 
 /*
  * IP6 socket option processing.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *in6p = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level != IPPROTO_IPV6) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					in6p->inp_flags2 |= INP_REUSEADDR;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					in6p->inp_flags2 |= INP_REUSEPORT;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(in6p);
 				in6p->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			default:
 				break;
 			}
 		}
 	} else {		/* level == IPPROTO_IPV6 */
 		switch (op) {
 
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				error = ip6_pcbopts(&in6p->in6p_outputopts,
 						    m, so, sopt);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_BINDMULTI:
 #ifdef	RSS
 			case IPV6_RSS_LISTEN_BUCKET:
 #endif
 				if (optname == IPV6_BINDANY && td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_BINDANY);
 					if (error)
 						break;
 				}
 
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						in6p->in6p_hops = optval;
 						if ((in6p->inp_vflag &
 						     INP_IPV4) != 0)
 							in6p->inp_ip_ttl = optval;
 					}
 					break;
 #define OPTSET(bit) \
 do { \
 	INP_WLOCK(in6p); \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 	INP_WUNLOCK(in6p); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	INP_WLOCK(in6p); \
 	in6p->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 	INP_WUNLOCK(in6p); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
 
 #define OPTSET2(bit, val) do {						\
 	INP_WLOCK(in6p);						\
 	if (val)							\
 		in6p->inp_flags2 |= bit;				\
 	else								\
 		in6p->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(in6p);						\
 } while (0)
 #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0)
 
 				case IPV6_RECVPKTINFO:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					/*
 					 * make setsockopt(IPV6_V6ONLY)
 					 * available only prior to bind(2).
 					 * see ipng mailing list, Jun 22 2001.
 					 */
 					if (in6p->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_IPV6_V6ONLY);
 					if (optval)
 						in6p->inp_vflag &= ~INP_IPV4;
 					else
 						in6p->inp_vflag |= INP_IPV4;
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_BINDANY:
 					OPTSET(INP_BINDANY);
 					break;
 
 				case IPV6_BINDMULTI:
 					OPTSET2(INP_BINDMULTI, optval);
 					break;
 #ifdef	RSS
 				case IPV6_RSS_LISTEN_BUCKET:
 					if ((optval >= 0) &&
 					    (optval < rss_getnumbuckets())) {
 						in6p->inp_rss_listen_bucket = optval;
 						OPTSET2(INP_RSS_BUCKET_SET, 1);
 					} else {
 						error = EINVAL;
 					}
 					break;
 #endif
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				optp = &in6p->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			case IPV6_MSFILTER:
 			case MCAST_BLOCK_SOURCE:
 			case MCAST_UNBLOCK_SOURCE:
 			case MCAST_JOIN_GROUP:
 			case MCAST_LEAVE_GROUP:
 			case MCAST_JOIN_SOURCE_GROUP:
 			case MCAST_LEAVE_SOURCE_GROUP:
 				error = ip6_setmoptions(in6p, sopt);
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				INP_WLOCK(in6p);
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					in6p->inp_flags &= ~(INP_LOWPORT);
 					in6p->inp_flags &= ~(INP_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					in6p->inp_flags &= ~(INP_LOWPORT);
 					in6p->inp_flags |= INP_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					in6p->inp_flags &= ~(INP_HIGHPORT);
 					in6p->inp_flags |= INP_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				INP_WUNLOCK(in6p);
 				break;
 
 #ifdef IPSEC
 			case IPV6_IPSEC_POLICY:
 			{
 				caddr_t req;
 				struct mbuf *m;
 
 				if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 					break;
 				if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 					break;
 				req = mtod(m, caddr_t);
 				error = ipsec_set_policy(in6p, optname, req,
 				    m->m_len, (sopt->sopt_td != NULL) ?
 				    sopt->sopt_td->td_ucred : NULL);
 				m_freem(m);
 				break;
 			}
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_FLOWID:
 			case IPV6_FLOWTYPE:
 #ifdef	RSS
 			case IPV6_RSSBUCKETID:
 #endif
 				switch (optname) {
 
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = in6p->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = in6p->inp_flags;
 					if (flags & INP_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & INP_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_BINDANY:
 					optval = OPTBIT(INP_BINDANY);
 					break;
 
 				case IPV6_FLOWID:
 					optval = in6p->inp_flowid;
 					break;
 
 				case IPV6_FLOWTYPE:
 					optval = in6p->inp_flowtype;
 					break;
 #ifdef	RSS
 				case IPV6_RSSBUCKETID:
 					retval =
 					    rss_hash2bucket(in6p->inp_flowid,
 					    in6p->inp_flowtype,
 					    &rss_bucket);
 					if (retval == 0)
 						optval = rss_bucket;
 					else
 						error = EINVAL;
 					break;
 #endif
 
 				case IPV6_BINDMULTI:
 					optval = OPTBIT2(INP_BINDMULTI);
 					break;
 
 				}
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 				struct route_in6 sro;
 
 				bzero(&sro, sizeof(sro));
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we dot not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 */
 				error = ip6_getpmtu(&sro, NULL, NULL,
 				    &in6p->in6p_faddr, &pmtu, NULL,
 				    so->so_fibnum);
 				if (sro.ro_rt)
 					RTFREE(sro.ro_rt);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(in6p->in6p_outputopts,
 				    optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_MSFILTER:
 				error = ip6_getmoptions(in6p, sopt);
 				break;
 
 #ifdef IPSEC
 			case IPV6_IPSEC_POLICY:
 			  {
 				caddr_t req = NULL;
 				size_t len = 0;
 				struct mbuf *m = NULL;
 				struct mbuf **mp = &m;
 				size_t ovalsize = sopt->sopt_valsize;
 				caddr_t oval = (caddr_t)sopt->sopt_val;
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				sopt->sopt_valsize = ovalsize;
 				sopt->sopt_val = oval;
 				if (m) {
 					req = mtod(m, caddr_t);
 					len = m->m_len;
 				}
 				error = ipsec_get_policy(in6p, req, len, mp);
 				if (error == 0)
 					error = soopt_mcopyout(sopt, m); /* XXX */
 				if (error == 0 && m)
 					m_freem(m);
 				break;
 			  }
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	}
 	return (error);
 }
 
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval, optlen;
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *in6p = sotoinpcb(so);
 	int level, op, optname;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 
 	if (level != IPPROTO_IPV6) {
 		return (EINVAL);
 	}
 
 	switch (optname) {
 	case IPV6_CHECKSUM:
 		/*
 		 * For ICMPv6 sockets, no modification allowed for checksum
 		 * offset, permit "no change" values to help existing apps.
 		 *
 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 		 * for an ICMPv6 socket will fail."
 		 * The current behavior does not meet RFC3542.
 		 */
 		switch (op) {
 		case SOPT_SET:
 			if (optlen != sizeof(int)) {
 				error = EINVAL;
 				break;
 			}
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 			if ((optval % 2) != 0) {
 				/* the API assumes even offset values */
 				error = EINVAL;
 			} else if (so->so_proto->pr_protocol ==
 			    IPPROTO_ICMPV6) {
 				if (optval != icmp6off)
 					error = EINVAL;
 			} else
 				in6p->in6p_cksum = optval;
 			break;
 
 		case SOPT_GET:
 			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
 				optval = icmp6off;
 			else
 				optval = in6p->in6p_cksum;
 
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 
 	default:
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Set up IP6 options in pcb for insertion in output packets or
  * specifying behavior of outgoing packets.
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct ip6_pktopts *opt = *pktopt;
 	int error = 0;
 	struct thread *td = sopt->sopt_td;
 
 	/* turn off any old options. */
 	if (opt) {
 #ifdef DIAGNOSTIC
 		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
 		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
 		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(opt, -1);
 	} else
 		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
 	*pktopt = NULL;
 
 	if (!m || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options, regardless of
 		 * whether the opt is just created or given.
 		 */
 		free(opt, M_IP6OPT);
 		return (0);
 	}
 
 	/*  set options specified by user. */
 	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
 	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
 		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
 		free(opt, M_IP6OPT);
 		return (error);
 	}
 	*pktopt = opt;
 	return (0);
 }
 
 /*
  * initialize ip6_pktopts.  beware that there are non-zero default values in
  * the struct.
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(*opt));
 	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
 	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 }
 
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct ip6_pktopts *opt;
 
 	if (*pktopt == NULL) {
 		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
 		    M_WAITOK);
 		ip6_initpktopts(*pktopt);
 	}
 	opt = *pktopt;
 
 	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
 }
 
 static int
 ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	int optdatalen = 0;
 	struct ip6_ext *ip6e;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		if (pktopt && pktopt->ip6po_pktinfo)
 			optdata = (void *)pktopt->ip6po_pktinfo;
 		else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 			optdata = (void *)&null_pktinfo;
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			optdata = (void *)&pktopt->ip6po_tclass;
 		else
 			optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_HOPOPTS:
 		if (pktopt && pktopt->ip6po_hbh) {
 			optdata = (void *)pktopt->ip6po_hbh;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_RTHDR:
 		if (pktopt && pktopt->ip6po_rthdr) {
 			optdata = (void *)pktopt->ip6po_rthdr;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		if (pktopt && pktopt->ip6po_dest1) {
 			optdata = (void *)pktopt->ip6po_dest1;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_DSTOPTS:
 		if (pktopt && pktopt->ip6po_dest2) {
 			optdata = (void *)pktopt->ip6po_dest2;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_NEXTHOP:
 		if (pktopt && pktopt->ip6po_nexthop) {
 			optdata = (void *)pktopt->ip6po_nexthop;
 			optdatalen = pktopt->ip6po_nexthop->sa_len;
 		}
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			optdata = (void *)&pktopt->ip6po_minmtu;
 		else
 			optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
 		else
 			optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		return (ENOPROTOOPT);
 	}
 
 	error = sooptcopyout(sopt, optdata, optdatalen);
 
 	return (error);
 }
 
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	if (pktopt == NULL)
 		return;
 
 	if (optname == -1 || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPLIMIT)
 		pktopt->ip6po_hlim = -1;
 	if (optname == -1 || optname == IPV6_TCLASS)
 		pktopt->ip6po_tclass = -1;
 	if (optname == -1 || optname == IPV6_NEXTHOP) {
 		if (pktopt->ip6po_nextroute.ro_rt) {
 			RTFREE(pktopt->ip6po_nextroute.ro_rt);
 			pktopt->ip6po_nextroute.ro_rt = NULL;
 		}
 		if (pktopt->ip6po_nexthop)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		if (pktopt->ip6po_route.ro_rt) {
 			RTFREE(pktopt->ip6po_route.ro_rt);
 			pktopt->ip6po_route.ro_rt = NULL;
 		}
 	}
 	if (optname == -1 || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
 	}
 }
 
 /*
  * Duplicate one extension header of "src" into freshly allocated storage
  * hung off "dst".  The byte length of the header is derived from its own
  * length field: (ip6e_len + 1) << 3.
  *
  * The expansion site must provide "src", "dst", "canwait" and a "bad"
  * label.  Any allocation failure jumps to "bad", regardless of the flag
  * value in "canwait", so the copy never writes through a NULL pointer.
  */
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 /*
  * Deep-copy the packet options in "src" into "dst".
  *
  * Scalar settings (hop limit, traffic class, flags, min-MTU policy,
  * temporary-address preference) are copied by value.  The pktinfo,
  * next-hop and extension-header objects are duplicated into newly
  * allocated M_IP6OPT memory using the malloc flag "canwait".
  * The cached route that accompanies the routing header is not copied.
  *
  * Returns 0 on success, EINVAL if either pointer is NULL, or ENOBUFS if
  * an allocation fails; in the ENOBUFS case everything already attached to
  * "dst" is released through ip6_clearpktopts(dst, -1).
  */
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		printf("%s: invalid argument\n", __func__);
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		/* The sockaddr carries its own size in sa_len. */
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	/* Undo the partial copy so the caller never sees half-built state. */
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 /*
  * Allocate a new option block and fill it with a deep copy of "src".
  * "canwait" is passed through as the malloc flag for every allocation.
  * Returns the new block, or NULL when the block itself cannot be
  * allocated or copypktopts() reports an error.
  */
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	struct ip6_pktopts *copy;
 
 	copy = malloc(sizeof(*copy), M_IP6OPT, canwait);
 	if (copy != NULL) {
 		ip6_initpktopts(copy);
 		if (copypktopts(copy, src, canwait) != 0) {
 			/* Only the container is left to release here. */
 			free(copy, M_IP6OPT);
 			copy = NULL;
 		}
 	}
 
 	return (copy);
 }
 
 /*
  * Release an option block: clear every option it holds and then free the
  * block itself.  A NULL argument is accepted and ignored.
  */
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt != NULL) {
 		ip6_clearpktopts(pktopt, -1);
 		free(pktopt, M_IP6OPT);
 	}
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = 0;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  *
  * Arguments:
  *   optname  option identifier (IPV6_xxx or IPV6_2292xxx).
  *   buf/len  option payload and its length in bytes.
  *   opt      option block that is updated in place.
  *   cred     when non-NULL, PRIV_NETINET_SETHDROPTS is checked before the
  *            next-hop, hop-by-hop and destination options are accepted.
  *   uproto   upper-layer protocol; IPPROTO_TCP restricts sticky
  *            IPV6_PKTINFO and makes IPV6_DONTFRAG a no-op.
  *
  * Returns 0 on success.  Failures are reported as EINVAL, ENOPROTOOPT,
  * ENXIO, ENETDOWN, EADDRNOTAVAIL, EAFNOSUPPORT, ENOBUFS, or whatever
  * priv_check_cred() / sa6_embedscope() return.
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	int minmtupolicy, preftemp;
 	int error;
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		/* A sticky source address is refused on TCP sockets. */
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
 			return (EINVAL);
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex > V_if_index)
 			 return (ENXIO);
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 		if (ifp != NULL && (
 		    ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))
 			return (ENETDOWN);
 
 		/*
 		 * With both an interface and an address given, the address
 		 * must be configured on that interface.
 		 */
 		if (ifp != NULL &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			struct in6_ifaddr *ia;
 
 			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
 			if (ia == NULL)
 				return (EADDRNOTAVAIL);
 			ifa_free(&ia->ia_ifa);
 		}
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		/* Accepted range is -1 through 255. */
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		/* Accepted range is -1 through 255. */
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		/* (*buf is the first byte of the sockaddr, i.e. its sa_len) */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 			int error;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			/* Note: this rewrites the caller's buffer in place. */
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		/* The header length field counts 8-byte units minus one. */
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advanced API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, optname);
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		/* Unlike the options above, no privilege check is made here. */
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			/* every listed address must still be pending */
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Called from ip6_output() to feed a copy of an outgoing IPv6 multicast
  * packet back to the input side of interface "ifp".  The copy is handed
  * to if_simloop() with the address family taken from "dst"; the
  * interface passed in need not be the loopback interface itself.
  * If the packet cannot be copied or pulled up, nothing is looped back.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
 {
 	struct ip6_hdr *hdr;
 	struct mbuf *clone;
 
 	clone = m_copy(m, 0, M_COPYALL);
 	if (clone == NULL)
 		return;
 
 	/*
 	 * The header is rewritten below, so it has to live in storage that
 	 * is private to the copy and contiguous in the first mbuf.
 	 */
 	if (!M_WRITABLE(clone) ||
 	    clone->m_len < sizeof(struct ip6_hdr)) {
 		clone = m_pullup(clone, sizeof(struct ip6_hdr));
 		if (clone == NULL)
 			return;
 	}
 
 #ifdef DIAGNOSTIC
 	if (clone->m_len < sizeof(*hdr)) {
 		m_freem(clone);
 		return;
 	}
 #endif
 
 	/*
 	 * Strip embedded scope identifiers from both addresses;
 	 * in6_clearscope only modifies an address when it needs to.
 	 */
 	hdr = mtod(clone, struct ip6_hdr *);
 	in6_clearscope(&hdr->ip6_src);
 	in6_clearscope(&hdr->ip6_dst);
 
 	(void)if_simloop(ifp, clone, dst->sin6_family, 0);
 }
 
 /*
  * Chop IPv6 header off from the payload.
  *
  * When the first mbuf of "m" holds more than the bare IPv6 header, a new
  * packet-header mbuf is allocated, the packet header is moved to it, the
  * IPv6 header bytes are copied into it, and the original mbuf (advanced
  * past the header) is chained behind it.  In either case
  * exthdrs->ip6e_ip6 ends up pointing at the mbuf that holds the header.
  *
  * Returns 0 on success.  If the new mbuf cannot be allocated, the whole
  * chain "m" is freed and ENOBUFS is returned.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct mbuf *mh;
 	struct ip6_hdr *ip6;
 
 	/* Remember where the header is before m_data is advanced. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (m->m_len > sizeof(*ip6)) {
 		mh = m_gethdr(M_NOWAIT, MT_DATA);
 		if (mh == NULL) {
 			m_freem(m);
 			return ENOBUFS;
 		}
 		m_move_pkthdr(mh, m);
-		MH_ALIGN(mh, sizeof(*ip6));
+		M_ALIGN(mh, sizeof(*ip6));
 		/* Trim the header off the front of the original mbuf. */
 		m->m_len -= sizeof(*ip6);
 		m->m_data += sizeof(*ip6);
 		mh->m_next = m;
 		m = mh;
 		m->m_len = sizeof(*ip6);
 		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
 	}
 	exthdrs->ip6e_ip6 = m;
 	return 0;
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *in6p)
 {
 	int len;
 
 	if (!in6p->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(in6p->in6p_outputopts->ip6po_hbh);
 	if (in6p->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(in6p->in6p_outputopts->ip6po_dest1);
 	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
 	len += elen(in6p->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }
Index: head/sys/netinet6/mld6.c
===================================================================
--- head/sys/netinet6/mld6.c	(revision 276691)
+++ head/sys/netinet6/mld6.c	(revision 276692)
@@ -1,3313 +1,3313 @@
 /*-
  * Copyright (c) 2009 Bruce Simpson.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/ktr.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/icmp6.h>
 #include <netinet6/mld6.h>
 #include <netinet6/mld6_var.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 /*
  * Forward declarations of the routines private to this file.
  * mld_rec_type_to_str() exists only when KTR tracing is compiled in.
  */
 static struct mld_ifinfo *
 		mli_alloc_locked(struct ifnet *);
 static void	mli_delete_locked(const struct ifnet *);
 static void	mld_dispatch_packet(struct mbuf *);
 static void	mld_dispatch_queue(struct ifqueue *, int);
 static void	mld_final_leave(struct in6_multi *, struct mld_ifinfo *);
 static void	mld_fasttimo_vnet(void);
 static int	mld_handle_state_change(struct in6_multi *,
 		    struct mld_ifinfo *);
 static int	mld_initial_join(struct in6_multi *, struct mld_ifinfo *,
 		    const int);
 #ifdef KTR
 static char *	mld_rec_type_to_str(const int);
 #endif
 static void	mld_set_version(struct mld_ifinfo *, const int);
 static void	mld_slowtimo_vnet(void);
 static int	mld_v1_input_query(struct ifnet *, const struct ip6_hdr *,
 		    /*const*/ struct mld_hdr *);
 static int	mld_v1_input_report(struct ifnet *, const struct ip6_hdr *,
 		    /*const*/ struct mld_hdr *);
 static void	mld_v1_process_group_timer(struct mld_ifinfo *,
 		    struct in6_multi *);
 static void	mld_v1_process_querier_timers(struct mld_ifinfo *);
 static int	mld_v1_transmit_report(struct in6_multi *, const int);
 static void	mld_v1_update_group(struct in6_multi *, const int);
 static void	mld_v2_cancel_link_timers(struct mld_ifinfo *);
 static void	mld_v2_dispatch_general_query(struct mld_ifinfo *);
 static struct mbuf *
 		mld_v2_encap_report(struct ifnet *, struct mbuf *);
 static int	mld_v2_enqueue_filter_change(struct ifqueue *,
 		    struct in6_multi *);
 static int	mld_v2_enqueue_group_record(struct ifqueue *,
 		    struct in6_multi *, const int, const int, const int,
 		    const int);
 static int	mld_v2_input_query(struct ifnet *, const struct ip6_hdr *,
 		    struct mbuf *, const int, const int);
 static int	mld_v2_merge_state_changes(struct in6_multi *,
 		    struct ifqueue *);
 static void	mld_v2_process_group_timers(struct mld_ifinfo *,
 		    struct ifqueue *, struct ifqueue *,
 		    struct in6_multi *, const int);
 static int	mld_v2_process_group_query(struct in6_multi *,
 		    struct mld_ifinfo *mli, int, struct mbuf *, const int);
 static int	sysctl_mld_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS);
 
 /*
  * Normative references: RFC 2710, RFC 3590, RFC 3810.
  *
  * Locking:
  *  * The MLD subsystem lock ends up being system-wide for the moment,
  *    but could be per-VIMAGE later on.
  *  * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * IN6_MULTI_LOCK covers in_multi.
  *  * MLD_LOCK covers per-link state and any global variables in this file.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *
  *  XXX LOR PREVENTION
  *  A special case for IPv6 is the in6_setscope() routine. ip6_output()
  *  will not accept an ifp; it wants an embedded scope ID, unlike
  *  ip_output(), which happily takes the ifp given to it. The embedded
  *  scope ID is only used by MLD to select the outgoing interface.
  *
  *  During interface attach and detach, MLD will take MLD_LOCK *after*
  *  the IF_AFDATA_LOCK.
  *  As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call
  *  it with MLD_LOCK held without triggering an LOR. A netisr with indirect
  *  dispatch could work around this, but we'd rather not do that, as it
  *  can introduce other races.
  *
  *  As such, we exploit the fact that the scope ID is just the interface
  *  index, and embed it in the IPv6 destination address accordingly.
  *  This is potentially NOT VALID for MLDv1 reports, as they
  *  are always sent to the multicast group itself; as MLDv2
  *  reports are always sent to ff02::16, this is not an issue
  *  when MLDv2 is in use.
  *
  *  This does not however eliminate the LOR when ip6_output() itself
  *  calls in6_setscope() internally whilst MLD_LOCK is held. This will
  *  trigger a LOR warning in WITNESS when the ifnet is detached.
  *
  *  The right answer is probably to make IF_AFDATA_LOCK an rwlock, given
  *  how it's used across the network stack. Here we're simply exploiting
  *  the fact that MLD runs at a similar layer in the stack to scope6.c.
  *
  * VIMAGE:
  *  * Each in6_multi corresponds to an ifp, and each ifp corresponds
  *    to a vnet in ifp->if_vnet.
  */
 static struct mtx		 mld_mtx;
 static MALLOC_DEFINE(M_MLD, "mld", "mld state");
 
 /*
  * Store the low 16 bits of "zoneid", in network byte order, into the
  * second 16-bit word of the address "pin6" when the address satisfies
  * IN6_IS_SCOPE_LINKLOCAL() or IN6_IS_ADDR_MC_INTFACELOCAL(); any other
  * address is left untouched.
  *
  * The body is wrapped in do/while(0) so that the expansion is exactly one
  * statement and cannot capture an "else" that follows the call site.
  * Note that "pin6" is evaluated more than once.
  */
 #define	MLD_EMBEDSCOPE(pin6, zoneid)					\
 	do {								\
 		if (IN6_IS_SCOPE_LINKLOCAL(pin6) ||			\
 		    IN6_IS_ADDR_MC_INTFACELOCAL(pin6))			\
 			(pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF);	\
 	} while (0)
 
 /*
  * VIMAGE-wide globals.
  *
  * Each of these is instantiated per virtual network stack and reached
  * through the matching V_ accessor macro defined below.
  */
 /* Initial value is 10 seconds, 0 microseconds. */
 static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0};
 /* List of per-interface MLD state (struct mld_ifinfo). */
 static VNET_DEFINE(LIST_HEAD(, mld_ifinfo), mli_head);
 static VNET_DEFINE(int, interface_timers_running6);
 static VNET_DEFINE(int, state_change_timers_running6);
 static VNET_DEFINE(int, current_state_timers_running6);
 
 #define	V_mld_gsrdelay			VNET(mld_gsrdelay)
 #define	V_mli_head			VNET(mli_head)
 #define	V_interface_timers_running6	VNET(interface_timers_running6)
 #define	V_state_change_timers_running6	VNET(state_change_timers_running6)
 #define	V_current_state_timers_running6	VNET(current_state_timers_running6)
 
 SYSCTL_DECL(_net_inet6);	/* Note: Not in any common header. */
 
 SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0,
     "IPv6 Multicast Listener Discovery");
 
 /*
  * Virtualized sysctls.
  */
 /* Reads and writes go through sysctl_mld_gsr(), which validates the value. */
 SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I",
     "Rate limit for MLDv2 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 /* Read-only node served by sysctl_mld_ifinfo(). */
 static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
     "Per-interface MLDv2 state");
 
 static int	mld_v1enable = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
     &mld_v1enable, 0, "Enable fallback to MLDv1");
 
 static int	mld_use_allow = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
     &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
 
 /*
  * Packed Router Alert option structure declaration.
  *
  * Lays out, back to back with no padding between members, a hop-by-hop
  * header, one padding option header, and a Router Alert option.
  */
 struct mld_raopt {
 	struct ip6_hbh		hbh;	/* hop-by-hop extension header */
 	struct ip6_opt		pad;	/* padding option (type/length) */
 	struct ip6_opt_router	ra;	/* Router Alert option */
 } __packed;
 
 /*
  * Router Alert hop-by-hop option header.
  *
  * Static template: a zeroed hop-by-hop header, a PadN option with a zero
  * length byte, and a Router Alert option carrying IP6OPT_RTALERT_MLD.
  */
 static struct mld_raopt mld_ra = {
 	/* NOTE(review): both header fields start at 0; presumably the
 	 * next-header value is filled in at transmit time -- confirm. */
 	.hbh = { 0, 0 },
 	.pad = { .ip6o_type = IP6OPT_PADN, 0 },
 	.ra = {
 	    .ip6or_type = IP6OPT_ROUTER_ALERT,
 	    .ip6or_len = IP6OPT_RTALERT_LEN - 2,
 	    /* 16-bit alert value stored high byte first. */
 	    .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF),
 	    .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF)
 	}
 };
 static struct ip6_pktopts mld_po;
 
 /*
  * Stash the transmit context for a queued packet in its packet header:
  * the interface index goes into the flowid field and, on VIMAGE kernels,
  * the interface's vnet goes into the PH_loc pointer.
  */
 static __inline void
 mld_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 	m->m_pkthdr.flowid = ifp->if_index;
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 }
 
 /*
  * Wipe the context stored by mld_save_context(): reset the flowid to 0
  * and the PH_loc pointer to NULL.
  */
 static __inline void
 mld_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.flowid = 0;
 	m->m_pkthdr.PH_loc.ptr = NULL;
 }
 
 /*
  * Fetch the interface index that was saved in a queued output chain.
  *
  * VIMAGE: with INVARIANTS enabled, assert that the caller has already
  * switched curvnet to the vnet recorded in the mbuf.
  */
 static __inline uint32_t
 mld_restore_context(struct mbuf *m)
 {
 	uint32_t ifindex;
 
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr,
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 	ifindex = m->m_pkthdr.flowid;
 	return (ifindex);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by MLD lock.
  */
 static int
 sysctl_mld_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	MLD_LOCK();
 
 	i = V_mld_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d",
 	     V_mld_gsrdelay.tv_sec, i);
 	V_mld_gsrdelay.tv_sec = i;
 
 out_locked:
 	MLD_UNLOCK();
 	return (error);
 }
 
 /*
  * Copy out the struct mld_ifinfo of the interface whose index is the
  * single trailing name component.  For use by ifmcstat(8).
  *
  * Writes are rejected with EPERM, a name of the wrong length with
  * EINVAL, and an index that is out of range, unused, or has no MLD state
  * with ENOENT.
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	struct mld_ifinfo	*mli;
 	struct ifnet		*ifp;
 	u_int			 namelen;
 	int			*name;
 	int			 error;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo));
 	if (error)
 		return (error);
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	/* Everything short of a successful copy-out reports ENOENT. */
 	error = ENOENT;
 	ifp = NULL;
 	if (name[0] > 0 && name[0] <= V_if_index)
 		ifp = ifnet_byindex(name[0]);
 
 	if (ifp != NULL) {
 		LIST_FOREACH(mli, &V_mli_head, mli_link) {
 			if (mli->mli_ifp != ifp)
 				continue;
 			error = SYSCTL_OUT(req, mli,
 			    sizeof(struct mld_ifinfo));
 			break;
 		}
 	}
 
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains.
  *
  * Packets are dequeued from "ifq" and handed to mld_dispatch_packet()
  * until the queue is empty or "limit" packets have been sent.  The
  * counter is only compared against zero after being decremented, so a
  * limit of 0 does not cap the number of packets dispatched.
  *
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 mld_dispatch_queue(struct ifqueue *ifq, int limit)
 {
 	struct mbuf *m;
 
 	for (;;) {
 		_IF_DEQUEUE(ifq, m);
 		if (m == NULL)
 			break;
 		/* Trace arguments follow the message: packet, then queue. */
 		CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, m, ifq);
 		mld_dispatch_packet(m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Filter outgoing MLD report state by group.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1)
  * and node-local addresses. However, kernel and socket consumers
  * always embed the KAME scope ID in the address provided, so strip it
  * when performing comparison.
  * Note: This is not the same as the *multicast* scope.
  *
  * Return zero if the given group is one for which MLD reports
  * should be suppressed, or non-zero if reports should be issued.
  */
 static __inline int
 mld_is_addr_reported(const struct in6_addr *addr)
 {
 
 	KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__));
 
 	if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL)
 		return (0);
 
 	if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) {
 		struct in6_addr tmp = *addr;
 		in6_clearscope(&tmp);
 		if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes))
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Attach MLD when PF_INET6 is attached to an interface.
  *
  * SMPng: Normally called with IF_AFDATA_LOCK held.
  */
 struct mld_ifinfo *
 mld_domifattach(struct ifnet *ifp)
 {
 	struct mld_ifinfo *mli;
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK();
 
 	mli = mli_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		mli->mli_flags |= MLIF_SILENT;
 	if (mld_use_allow)
 		mli->mli_flags |= MLIF_USEALLOW;
 
 	MLD_UNLOCK();
 
 	return (mli);
 }
 
 /*
  * Allocate and initialize the MLD state for "ifp" and link it at the
  * head of the per-vnet list.  The MLD lock must be held.
  * Returns the new state, or NULL when the M_NOWAIT allocation fails.
  *
  * VIMAGE: assume curvnet set by caller.
  */
 static struct mld_ifinfo *
 mli_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct mld_ifinfo *mli;
 
 	MLD_LOCK_ASSERT();
 
 	mli = malloc(sizeof(struct mld_ifinfo), M_MLD, M_NOWAIT|M_ZERO);
 	if (mli == NULL)
 		return (NULL);
 
 	/* Start out as MLDv2 with the protocol's initial timer values. */
 	mli->mli_ifp = ifp;
 	mli->mli_version = MLD_VERSION_2;
 	mli->mli_flags = 0;
 	mli->mli_rv = MLD_RV_INIT;
 	mli->mli_qi = MLD_QI_INIT;
 	mli->mli_qri = MLD_QRI_INIT;
 	mli->mli_uri = MLD_URI_INIT;
 
 	SLIST_INIT(&mli->mli_relinmhead);
 
 	/*
 	 * Cap the number of queued responses to general queries.
 	 */
 	IFQ_SET_MAXLEN(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_mli_head, mli, mli_link);
 
 	CTR2(KTR_MLD, "allocate mld_ifinfo for ifp %p(%s)",
 	     ifp, if_name(ifp));
 
 	return (mli);
 }
 
 /*
  * Hook for ifdetach.
  *
  * Runs before the link layer does its own cleanup: for an interface in
  * MLDv2 mode, every IPv6 group on the interface has its recorded sources
  * cleared, and groups in the leaving state are released.  The MLD state
  * itself is not freed here.
  *
  * SMPng: Caller must hold IN6_MULTI_LOCK().
  * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator.
  * XXX This routine is also bitten by unlocked ifma_protospec access.
  */
 void
 mld_ifdetach(struct ifnet *ifp)
 {
 	struct in6_multi	*grp, *next;
 	struct ifmultiaddr	*mc;
 	struct mld_ifinfo	*mli;
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp,
 	    if_name(ifp));
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	if (mli->mli_version == MLD_VERSION_2) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(mc, &ifp->if_multiaddrs, ifma_link) {
 			if (mc->ifma_addr->sa_family == AF_INET6 &&
 			    mc->ifma_protospec != NULL) {
 				grp = (struct in6_multi *)mc->ifma_protospec;
 				/* Defer the release until the lock is dropped. */
 				if (grp->in6m_state == MLD_LEAVING_MEMBER) {
 					SLIST_INSERT_HEAD(
 					    &mli->mli_relinmhead,
 					    grp, in6m_nrele);
 				}
 				in6m_clear_recorded(grp);
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		SLIST_FOREACH_SAFE(grp, &mli->mli_relinmhead, in6m_nrele,
 		    next) {
 			SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
 			in6m_release_locked(grp);
 		}
 	}
 
 	MLD_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  * Runs after link-layer cleanup; free MLD state.
  *
  * Takes the MLD lock and hands the interface to mli_delete_locked(),
  * which unlinks and frees the matching mld_ifinfo.
  *
  * SMPng: Normally called with IF_AFDATA_LOCK held.
  */
 void
 mld_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK();
 	mli_delete_locked(ifp);
 	MLD_UNLOCK();
 }
 
 static void
 mli_delete_locked(const struct ifnet *ifp)
 {
 	struct mld_ifinfo *mli, *tmli;
 
 	CTR3(KTR_MLD, "%s: freeing mld_ifinfo for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) {
 		if (mli->mli_ifp == ifp) {
 			/*
 			 * Free deferred General Query responses.
 			 */
 			_IF_DRAIN(&mli->mli_gq);
 
 			LIST_REMOVE(mli, mli_link);
 
 			KASSERT(SLIST_EMPTY(&mli->mli_relinmhead),
 			    ("%s: there are dangling in_multi references",
 			    __func__));
 
 			free(mli, M_MLD);
 			return;
 		}
 	}
 #ifdef INVARIANTS
 	panic("%s: mld_ifinfo not found for ifp %p\n", __func__,  ifp);
 #endif
 }
 
 /*
  * Process a received MLDv1 general or address-specific query.
  * Assumes that the query header has been pulled up to sizeof(mld_hdr).
  *
  * NOTE: Can't be fully const correct as we temporarily embed scope ID in
  * mld_addr. This is OK as we own the mbuf chain.
  */
 static int
 mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
     /*const*/ struct mld_hdr *mld)
 {
 	struct ifmultiaddr	*ifma;
 	struct mld_ifinfo	*mli;
 	struct in6_multi	*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	is_general_query = 0;
 
 	if (!mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * RFC3810 Section 6.2: MLD queries must originate from
 	 * a router's link-local address.
 	 */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * Do address field validation upfront before we accept
 	 * the query.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
 		/*
 		 * MLDv1 General Query.
 		 * If this was not sent to the all-nodes group, ignore it.
 		 */
 		struct in6_addr		 dst;
 
 		dst = ip6->ip6_dst;
 		in6_clearscope(&dst);
 		if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes))
 			return (EINVAL);
 		is_general_query = 1;
 	} else {
 		/*
 		 * Embed scope ID of receiving interface in MLD query for
 		 * lookup whilst we don't hold other locks.
 		 */
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 	}
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * Switch to MLDv1 host compatibility mode.
 	 */
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp));
 	mld_set_version(mli, MLD_VERSION_1);
 
 	timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	IF_ADDR_RLOCK(ifp);
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)",
 		    ifp, if_name(ifp));
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET6 ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in6_multi *)ifma->ifma_protospec;
 			mld_v1_update_group(inm, timer);
 		}
 	} else {
 		/*
 		 * MLDv1 Group-Specific Query.
 		 * If this is a group-specific MLDv1 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 		if (inm != NULL) {
 			CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)",
 			    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 			    ifp, if_name(ifp));
 			mld_v1_update_group(inm, timer);
 		}
 		/* XXX Clear embedded scope ID as userland won't expect it. */
 		in6_clearscope(&mld->mld_addr);
 	}
 
 	IF_ADDR_RUNLOCK(ifp);
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Adjust a group's report timer after an MLDv1 query was received.
  *
  * A group that is already REPORTING with a running timer that expires
  * no later than the requested delay is left alone.  Otherwise the group
  * becomes (or stays) REPORTING with a freshly randomised timer.
  * A SLEEPING group merely moves to AWAKENING.
  *
  * This may be the first update since the link left MLDv2 mode, which is
  * why the MLDv2 query-pending states are folded into REPORTING here; the
  * group timer is shared between group and group-source query responses.
  *
  * Unlike MLDv2, the per-group delay is jittered to avoid bursts of
  * MLDv1 reports.
  */
 static void
 mld_v1_update_group(struct in6_multi *inm, const int timer)
 {
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__,
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    if_name(inm->in6m_ifp), timer);
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	switch (inm->in6m_state) {
 	case MLD_REPORTING_MEMBER:
 		/* An earlier, still-running timer wins. */
 		if (inm->in6m_timer != 0 &&
 		    inm->in6m_timer <= timer) {
 			CTR1(KTR_MLD, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case MLD_SG_QUERY_PENDING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		CTR1(KTR_MLD, "%s: ->REPORTING", __func__);
 		inm->in6m_state = MLD_REPORTING_MEMBER;
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		break;
 	case MLD_SLEEPING_MEMBER:
 		CTR1(KTR_MLD, "%s: ->AWAKENING", __func__);
 		inm->in6m_state = MLD_AWAKENING_MEMBER;
 		break;
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		/* These states are left unchanged. */
 		break;
 	}
 }
 
 /*
  * Process a received MLDv2 general, group-specific or
  * group-and-source-specific query.
  *
  * Assumes that the query header has been pulled up to sizeof(mldv2_query).
  *
  * ifp is the receiving interface, ip6 the IPv6 header, m the mbuf chain
  * holding the message, off the offset of the MLD header within m, and
  * icmp6len the length of the ICMPv6 message.
  *
  * Return 0 if successful, otherwise an appropriate error code is returned:
  * EMSGSIZE when the source count exceeds MLD_MAX_GS_SOURCES or does not
  * fit in icmp6len, EINVAL for a general query carrying a source list.
  * Queries that are merely ignored (non link-local source, compatibility
  * mode, unknown group, throttled) also return 0.
  *
  * For non-general queries the receiving interface's scope ID is embedded
  * in mld_addr for the lookup; it is cleared again on every exit path
  * taken after that point, so the embedded ID is not left in the mbuf.
  */
 static int
 mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
     struct mbuf *m, const int off, const int icmp6len)
 {
 	struct mld_ifinfo	*mli;
 	struct mldv2_query	*mld;
 	struct in6_multi	*inm;
 	uint32_t		 maxdelay, nsrc, qqi;
 	int			 is_general_query;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	is_general_query = 0;
 
 	/*
 	 * RFC3810 Section 6.2: MLD queries must originate from
 	 * a router's link-local address.
 	 */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp));
 
 	mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off);
 
 	/*
 	 * Decode the Maximum Response Code; values of 32768 and above use
 	 * the exponent/mantissa encoding.
 	 */
 	maxdelay = ntohs(mld->mld_maxdelay);	/* in 1/10ths of a second */
 	if (maxdelay >= 32768) {
 		maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) <<
 			   (MLD_MRC_EXP(maxdelay) + 3);
 	}
 	timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	qrv = MLD_QRV(mld->mld_misc);
 	if (qrv < 2) {
 		CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__,
 		    qrv, MLD_RV_INIT);
 		qrv = MLD_RV_INIT;
 	}
 
 	/*
 	 * NOTE(review): RFC 3810 Section 5.1.9 defines the encoded QQI as
 	 * (mant | 0x10) << (exp + 3); confirm whether MLD_QQIC_MANT()
 	 * already supplies the implicit 0x10 bit.
 	 */
 	qqi = mld->mld_qqi;
 	if (qqi >= 128) {
 		qqi = MLD_QQIC_MANT(mld->mld_qqi) <<
 		     (MLD_QQIC_EXP(mld->mld_qqi) + 3);
 	}
 
 	nsrc = ntohs(mld->mld_numsrc);
 	if (nsrc > MLD_MAX_GS_SOURCES)
 		return (EMSGSIZE);
 	if (icmp6len < sizeof(struct mldv2_query) +
 	    (nsrc * sizeof(struct in6_addr)))
 		return (EMSGSIZE);
 
 	/*
 	 * Do further input validation upfront to avoid resetting timers
 	 * should we need to discard this query.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
 		/*
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		if (nsrc > 0)
 			return (EINVAL);
 		is_general_query = 1;
 	} else {
 		/*
 		 * Embed scope ID of receiving interface in MLD query for
 		 * lookup whilst we don't hold other locks (due to KAME
 		 * locking lameness). We own this mbuf chain just now.
 		 */
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 	}
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp));
 
 	/*
 	 * Discard the v2 query if we're in Compatibility Mode.
 	 * The RFC is pretty clear that hosts need to stay in MLDv1 mode
 	 * until the Old Version Querier Present timer expires.
 	 */
 	if (mli->mli_version != MLD_VERSION_2)
 		goto out_locked;
 
 	mld_set_version(mli, MLD_VERSION_2);
 	mli->mli_rv = qrv;
 	mli->mli_qi = qqi;
 	mli->mli_qri = maxdelay;
 
 	CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi,
 	    maxdelay);
 
 	if (is_general_query) {
 		/*
 		 * MLDv2 General Query.
 		 *
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 *
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)",
 		    ifp, if_name(ifp));
 		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
 			mli->mli_v2_timer = MLD_RANDOM_DELAY(timer);
 			V_interface_timers_running6 = 1;
 		}
 	} else {
 		/*
 		 * MLDv2 Group-specific or Group-and-source-specific Query.
 		 *
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 		if (inm == NULL) {
 			IF_ADDR_RUNLOCK(ifp);
 			goto out_locked;
 		}
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->in6m_lastgsrtv,
 			    &V_mld_gsrdelay)) {
 				CTR1(KTR_MLD, "%s: GS query throttled.",
 				    __func__);
 				IF_ADDR_RUNLOCK(ifp);
 				goto out_locked;
 			}
 		}
 		CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)",
 		     ifp, if_name(ifp));
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer)
 			mld_v2_process_group_query(inm, mli, timer, m, off);
 
 		IF_ADDR_RUNLOCK(ifp);
 	}
 
 out_locked:
 	/*
 	 * XXX Clear embedded scope ID as userland won't expect it.
 	 * This is done on the common exit path so that the early exits
 	 * above (compatibility mode, unknown group, throttled query) do
 	 * not leave the scope ID embedded in the mbuf.
 	 */
 	if (!is_general_query)
 		in6_clearscope(&mld->mld_addr);
 
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received MLDv2 group-specific or group-and-source-specific
  * query.
  *
  * inm is the group the query names, mli the per-interface MLD state,
  * timer the response delay in fast-timeout ticks, m0 the mbuf chain
  * holding the query and off the offset of the MLDv2 header within m0.
  *
  * Return <0 if any error occurred. Currently this is ignored.
  * NOTE(review): on the group-and-source path the value returned is the
  * result of the last in6m_record_source() call, which may be positive.
  */
 static int
 mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifinfo *mli,
     int timer, struct mbuf *m0, const int off)
 {
 	struct mldv2_query	*mld;
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	retval = 0;
 	mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off);
 
 	/*
 	 * Only groups that are reporting, or already have a query
 	 * response pending, are processed further.
 	 */
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case MLD_REPORTING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(mld->mld_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
 		    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
 			in6m_clear_recorded(inm);
 			/* Keep whichever deadline is sooner. */
 			timer = min(inm->in6m_timer, timer);
 		}
 		inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER;
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->in6m_timer, timer);
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * to schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 */
 	if (inm->in6m_nsrc > 0) {
 		struct mbuf		*m;
 		uint8_t			*sp;
 		int			 i, nrecorded;
 		int			 soff;
 
 		m = m0;
 		/* The source list starts right after the fixed header. */
 		soff = off + sizeof(struct mldv2_query);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++) {
 			sp = mtod(m, uint8_t *) + soff;
 			retval = in6m_record_source(inm,
 			    (const struct in6_addr *)sp);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 			soff += sizeof(struct in6_addr);
 			/* Step to the next mbuf once this one is used up. */
 			if (soff >= m->m_len) {
 				soff = soff - m->m_len;
 				m = m->m_next;
 				if (m == NULL)
 					break;
 			}
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_MLD,
 			    "%s: schedule response to SG query", __func__);
 			inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER;
 			inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 			V_current_state_timers_running6 = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Handle an incoming MLDv1 listener report.
  * mld must point at the mld_hdr inside a pulled-up mbuf chain.
  *
  * Reports heard from other hosts suppress our own pending report for
  * the same group, unless the link is running in MLDv2 mode.
  *
  * Returns EINVAL if the source or destination address fails validation,
  * and 0 in every other case, including reports that are ignored.
  *
  * NOTE: Can't be fully const correct as we temporarily embed scope ID in
  * mld_addr. This is OK as we own the mbuf chain.
  */
 static int
 mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
     /*const*/ struct mld_hdr *mld)
 {
 	struct in6_addr		 src, dst;
 	struct in6_ifaddr	*ia;
 	struct in6_multi	*inm;
 	struct mld_ifinfo	*mli;
 	int			 own_report;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	if (!mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/* Reports received on a loopback interface are not processed. */
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	/*
 	 * The sender of an MLDv1 report must be a link-local address,
 	 * or the unspecified address (used while booting).
 	 */
 	src = ip6->ip6_src;
 	in6_clearscope(&src);
 	if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC2710 Section 4: the reported address must be a multicast
 	 * group, and the report must be sent to that very group.
 	 */
 	dst = ip6->ip6_dst;
 	in6_clearscope(&dst);
 	if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) ||
 	    !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) {
 		CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_dst),
 		    ifp, if_name(ifp));
 		return (EINVAL);
 	}
 
 	/*
 	 * Do not react to our own membership report: fast leave relies
 	 * on knowing that we are the only member of a group.  Assume we
 	 * sent from the link-local address if one is available, and from
 	 * :: otherwise.
 	 *
 	 * XXX Note that scope ID comparison is needed for the address
 	 * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be
 	 * performed for the on-wire address.
 	 */
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	if (ia != NULL) {
 		own_report = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia));
 		ifa_free(&ia->ia_ifa);
 	} else {
 		own_report = IN6_IS_ADDR_UNSPECIFIED(&src);
 	}
 	if (own_report)
 		return (0);
 
 	CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)",
 	    ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp));
 
 	/*
 	 * Embed the receiving interface's scope ID in the reported address
 	 * for the lookup, while no other locks are held (due to KAME
 	 * locking lameness).
 	 */
 	if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr))
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 	IF_ADDR_RLOCK(ifp);
 
 	/*
 	 * MLDv1 report suppression.
 	 * If we are a member of this group, our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop the timer by moving the group to the 'lazy' state.
 	 */
 	inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 	if (inm == NULL)
 		goto out_locked;
 
 	mli = inm->in6m_mli;
 	KASSERT(mli != NULL,
 	    ("%s: no mli for ifp %p", __func__, ifp));
 
 	/*
 	 * In MLDv2 host mode another host's MLDv1 report must not
 	 * suppress our own reports.
 	 */
 	if (mli->mli_version == MLD_VERSION_2)
 		goto out_locked;
 
 	inm->in6m_timer = 0;
 
 	switch (inm->in6m_state) {
 	case MLD_REPORTING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		CTR3(KTR_MLD,
 		    "report suppressed for %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		/* FALLTHROUGH */
 	case MLD_LAZY_MEMBER:
 		inm->in6m_state = MLD_LAZY_MEMBER;
 		break;
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		/* State is left as it is. */
 		break;
 	}
 
 out_locked:
 	IF_ADDR_RUNLOCK(ifp);
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	/* XXX Clear embedded scope ID as userland won't expect it. */
 	in6_clearscope(&mld->mld_addr);
 
 	return (0);
 }
 
 /*
  * MLD input path.
  *
  * Query messages that fit in a single ICMPv6 message header are assumed
  * to have been pulled up already.
  * The message is never freed here, even when it fails kernel input
  * validation, on the assumption that userland will want to see it.
  * The pullup itself may however free the mbuf chain m if it fails.
  *
  * Return IPPROTO_DONE if we freed m. Otherwise, return 0.
  */
 int
 mld_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet	*rcvif;
 	struct ip6_hdr	*ip6;
 	struct mld_hdr	*mld;
 	int		 pullup_len;
 
 	CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off);
 
 	rcvif = m->m_pkthdr.rcvif;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * Pull up the basic header, or the larger MLDv2 query header
 	 * when the message is a query that is long enough to be one.
 	 */
 	mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
 	pullup_len = sizeof(struct mld_hdr);
 	if (mld->mld_type == MLD_LISTENER_QUERY &&
 	    icmp6len >= sizeof(struct mldv2_query))
 		pullup_len = sizeof(struct mldv2_query);
 	IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, pullup_len);
 	if (mld == NULL) {
 		ICMP6STAT_INC(icp6s_badlen);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Userland needs to see all of this traffic for implementing
 	 * the endpoint discovery portion of multicast routing, so the
 	 * result of the per-type handlers does not change the return
 	 * value: every path below returns 0.
 	 */
 	switch (mld->mld_type) {
 	case MLD_LISTENER_QUERY:
 		icmp6_ifstat_inc(rcvif, ifs6_in_mldquery);
 		if (icmp6len == sizeof(struct mld_hdr))
 			(void)mld_v1_input_query(rcvif, ip6, mld);
 		else if (icmp6len >= sizeof(struct mldv2_query))
 			(void)mld_v2_input_query(rcvif, ip6, m, off,
 			    icmp6len);
 		break;
 	case MLD_LISTENER_REPORT:
 		icmp6_ifstat_inc(rcvif, ifs6_in_mldreport);
 		(void)mld_v1_input_report(rcvif, ip6, mld);
 		break;
 	case MLDV2_LISTENER_REPORT:
 		icmp6_ifstat_inc(rcvif, ifs6_in_mldreport);
 		break;
 	case MLD_LISTENER_DONE:
 		icmp6_ifstat_inc(rcvif, ifs6_in_mlddone);
 		break;
 	default:
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Fast timeout handler (global).
  * Walks every vnet and runs the per-vnet handler mld_fasttimo_vnet()
  * with that vnet set as the current one.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 mld_fasttimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		mld_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Fast timeout handler (per-vnet).
  *
  * Runs in two phases under IN6_MULTI_LOCK and MLD_LOCK:
  *  1. count down the MLDv2 General Query response timer of every link
  *     and dispatch the response when it expires;
  *  2. for every link, run the per-group timers of each IPv6 group,
  *     then transmit the resulting reports and release any group
  *     references that were queued on mli_relinmhead.
  *
  * The three global "timers running" flags are cleared before each phase
  * and set again by whichever timer is still pending.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  */
 static void
 mld_fasttimo_vnet(void)
 {
 	struct ifqueue		 scq;	/* State-change packets */
 	struct ifqueue		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct mld_ifinfo	*mli;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm, *tinm;
 	int			 uri_fasthz;
 
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running6 &&
 	    !V_interface_timers_running6 &&
 	    !V_state_change_timers_running6)
 		return;
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * MLDv2 General Query response timer processing.
 	 */
 	if (V_interface_timers_running6) {
 		CTR1(KTR_MLD, "%s: interface timers running", __func__);
 
 		V_interface_timers_running6 = 0;
 		LIST_FOREACH(mli, &V_mli_head, mli_link) {
 			if (mli->mli_v2_timer == 0) {
 				/* Do nothing. */
 			} else if (--mli->mli_v2_timer == 0) {
 				mld_v2_dispatch_general_query(mli);
 			} else {
 				/* Still counting down; keep the flag set. */
 				V_interface_timers_running6 = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running6 &&
 	    !V_state_change_timers_running6)
 		goto out_locked;
 
 	V_current_state_timers_running6 = 0;
 	V_state_change_timers_running6 = 0;
 
 	CTR1(KTR_MLD, "%s: state change timers running", __func__);
 
 	/*
 	 * MLD host report and state-change timer processing.
 	 * Note: Processing a v2 group timer may remove a node.
 	 */
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		ifp = mli->mli_ifp;
 
 		/*
 		 * The two packet queues and uri_fasthz are only set up,
 		 * and only used below, when the link is in MLDv2 mode.
 		 */
 		if (mli->mli_version == MLD_VERSION_2) {
 			uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri *
 			    PR_FASTHZ);
 
 			memset(&qrq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&qrq, MLD_MAX_G_GS_PACKETS);
 
 			memset(&scq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&scq, MLD_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET6 ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in6_multi *)ifma->ifma_protospec;
 			switch (mli->mli_version) {
 			case MLD_VERSION_1:
 				mld_v1_process_group_timer(mli, inm);
 				break;
 			case MLD_VERSION_2:
 				mld_v2_process_group_timers(mli, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		switch (mli->mli_version) {
 		case MLD_VERSION_1:
 			/*
 			 * Transmit reports for this lifecycle.  This
 			 * is done while not holding IF_ADDR_LOCK
 			 * since this can call
 			 * in6ifa_ifpforlinklocal() which locks
 			 * IF_ADDR_LOCK internally as well as
 			 * ip6_output() to transmit a packet.
 			 */
 			SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
 			    in6m_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
 				    in6m_nrele);
 				(void)mld_v1_transmit_report(inm,
 				    MLD_LISTENER_REPORT);
 			}
 			break;
 		case MLD_VERSION_2:
 			mld_dispatch_queue(&qrq, 0);
 			mld_dispatch_queue(&scq, 0);
 
 			/*
 			 * Free the in_multi reference(s) for
 			 * this lifecycle.
 			 */
 			SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
 			    in6m_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
 				    in6m_nrele);
 				in6m_release_locked(inm);
 			}
 			break;
 		}
 	}
 
 out_locked:
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 }
 
 /*
  * Tick the MLDv1 report timer of one group.
  *
  * A timer that is still counting down sets the global pending-timer
  * flag.  When the timer of a REPORTING group reaches zero, the group
  * becomes IDLE and is queued on mli_relinmhead.
  * Nothing happens for an idle timer or for groups in any other state.
  */
 static void
 mld_v1_process_group_timer(struct mld_ifinfo *mli, struct in6_multi *inm)
 {
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/* Timer not armed: nothing can expire. */
 	if (inm->in6m_timer == 0)
 		return;
 
 	/* Timer armed but not yet due: remember that work is pending. */
 	if (--inm->in6m_timer != 0) {
 		V_current_state_timers_running6 = 1;
 		return;
 	}
 
 	/* The timer just expired; only REPORTING groups act on it. */
 	if (inm->in6m_state == MLD_REPORTING_MEMBER) {
 		inm->in6m_state = MLD_IDLE_MEMBER;
 		SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
 		    in6m_nrele);
 	}
 }
 
 /*
  * Update a group's timers for MLDv2.
  * Will update the global pending timer flags.
  * Note: Unlocked read from mli.
  *
  * mli is the per-interface state, qrq the queue that receives query
  * response records, scq the queue that receives state-change records,
  * inm the group being serviced, and uri_fasthz the value used to re-arm
  * the state-change retransmission timer.
  *
  * Two timers are counted down per group: in6m_timer (query response)
  * and in6m_sctimer (state-change retransmission).  Work is only done
  * when at least one of them expires on this tick.
  */
 static void
 mld_v2_process_group_timers(struct mld_ifinfo *mli,
     struct ifqueue *qrq, struct ifqueue *scq,
     struct in6_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from compatibility mode back to MLDv2,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->in6m_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->in6m_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running6 = 1;
 	}
 
 	if (inm->in6m_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->in6m_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running6 = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 	case MLD_IDLE_MEMBER:
 		break;
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval;
 
 			retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER),
 			    0);
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			in6m_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case MLD_REPORTING_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->in6m_scrv > 0) {
 				inm->in6m_sctimer = uri_fasthz;
 				V_state_change_timers_running6 = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)mld_v2_merge_state_changes(inm, scq);
 
 			in6m_commit(inm);
 			CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 			    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 			    if_name(inm->in6m_ifp));
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release MLD's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->in6m_state == MLD_LEAVING_MEMBER &&
 			    inm->in6m_scrv == 0) {
 				inm->in6m_state = MLD_NOT_MEMBER;
 				SLIST_INSERT_HEAD(&mli->mli_relinmhead,
 				    inm, in6m_nrele);
 			}
 		}
 		break;
 	}
 }
 
 /*
  * Change the MLD version in use on a link, as per Section 9.12.
  *
  * A request for MLDv1 (re)arms the "Older Version Querier Present"
  * timer.  Whenever that timer is running and the link is not yet in
  * MLDv1 mode, the link is switched to MLDv1 and all pending MLDv2
  * timers on it are cancelled.
  */
 static void
 mld_set_version(struct mld_ifinfo *mli, const int version)
 {
 
 	MLD_LOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, mli->mli_ifp, if_name(mli->mli_ifp));
 
 	if (version == MLD_VERSION_1) {
 		int ovqp_ticks;
 
 		/*
 		 * "Older Version Querier Present" timeout from
 		 * Section 9.12, converted to slow-timeout ticks.
 		 */
 		ovqp_ticks = (mli->mli_rv * mli->mli_qi) + mli->mli_qri;
 		ovqp_ticks *= PR_SLOWHZ;
 		mli->mli_v1_timer = ovqp_ticks;
 	}
 
 	/* Enter compatibility mode if the timer runs and we are not in it. */
 	if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) {
 		mli->mli_version = MLD_VERSION_1;
 		mld_v2_cancel_link_timers(mli);
 	}
 }
 
 /*
  * Cancel pending MLDv2 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Groups that are REPORTING, LEAVING or have a query response pending
  * get both timers zeroed, are put into REPORTING state and have their
  * queued state-change records drained.  LEAVING groups are additionally
  * queued on mli_relinmhead and released once IF_ADDR_LOCK is dropped.
  *
  * Returns early, without touching anything, when none of the global
  * "timers running" flags is set.
  *
  * Caller must hold IN6_MULTI_LOCK and MLD_LOCK.
  */
 static void
 mld_v2_cancel_link_timers(struct mld_ifinfo *mli)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in6_multi	*inm, *tinm;
 
 	CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__,
 	    mli->mli_ifp, if_name(mli->mli_ifp));
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/*
 	 * Fast-track this potentially expensive operation
 	 * by checking all the global 'timer pending' flags.
 	 */
 	if (!V_interface_timers_running6 &&
 	    !V_state_change_timers_running6 &&
 	    !V_current_state_timers_running6)
 		return;
 
 	mli->mli_v2_timer = 0;
 
 	ifp = mli->mli_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/*
 		 * Skip entries without MLD state attached, exactly as the
 		 * other if_multiaddrs walks in this file do; ifma_protospec
 		 * may be NULL and must not be dereferenced.
 		 */
 		if (ifma->ifma_addr->sa_family != AF_INET6 ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_LAZY_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			break;
 		case MLD_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching
 			 * version, we need to release the final
 			 * reference held for issuing the INCLUDE {}.
 			 */
 			SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
 			    in6m_nrele);
 			/* FALLTHROUGH */
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 			in6m_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case MLD_REPORTING_MEMBER:
 			inm->in6m_sctimer = 0;
 			inm->in6m_timer = 0;
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			/*
 			 * Free any pending MLDv2 state-change records.
 			 */
 			_IF_DRAIN(&inm->in6m_scq);
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/* Release the references deferred while IF_ADDR_LOCK was held. */
 	SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
 		in6m_release_locked(inm);
 	}
 }
 
 /*
  * Global slowtimo handler.
  * Walks every vnet and runs the per-vnet handler mld_slowtimo_vnet()
  * with that vnet set as the current one.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 mld_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		mld_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 mld_slowtimo_vnet(void)
 {
 	struct mld_ifinfo *mli;
 
 	MLD_LOCK();
 
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		mld_v1_process_querier_timers(mli);
 	}
 
 	MLD_UNLOCK();
 }
 
 /*
  * Update the Older Version Querier Present timers for a link.
  * See Section 9.12 of RFC 3810.
  */
 static void
 mld_v1_process_querier_timers(struct mld_ifinfo *mli)
 {
 
 	MLD_LOCK_ASSERT();
 
 	/* A link already running MLDv2 has no v1 timer to age. */
 	if (mli->mli_version == MLD_VERSION_2)
 		return;
 
 	/* Decrement the timer; nothing more to do until it hits zero. */
 	mli->mli_v1_timer--;
 	if (mli->mli_v1_timer != 0)
 		return;
 
 	/*
 	 * MLDv1 Querier Present timer expired; revert to MLDv2.
 	 * The trace is emitted before the version field is rewritten
 	 * so that it still shows the old version.
 	 */
 	CTR5(KTR_MLD,
 	    "%s: transition from v%d -> v%d on %p(%s)",
 	    __func__, mli->mli_version, MLD_VERSION_2,
 	    mli->mli_ifp, if_name(mli->mli_ifp));
 	mli->mli_version = MLD_VERSION_2;
 }
 
 /*
  * Transmit an MLDv1 report immediately.
  */
 /*
  * Build a two-mbuf packet (IPv6 header + MLD header) of the given MLD
  * message type for the group in6m and hand it to mld_dispatch_packet().
  *
  * Returns ENOMEM if either mbuf cannot be allocated, otherwise 0.
  */
 static int
 mld_v1_transmit_report(struct in6_multi *in6m, const int type)
 {
 	struct ifnet		*ifp;
 	struct in6_ifaddr	*ia;
 	struct ip6_hdr		*ip6;
 	struct mbuf		*mh, *md;
 	struct mld_hdr		*mld;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	ifp = in6m->in6m_ifp;
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	/* ia may be NULL if link-local address is tentative. */
 
 	/*
 	 * Every exit path below drops the reference on ia (when one
 	 * was obtained) via ifa_free().
 	 */
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (ENOMEM);
 	}
 	md = m_get(M_NOWAIT, MT_DATA);
 	if (md == NULL) {
 		m_free(mh);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (ENOMEM);
 	}
 	/* mh carries the IPv6 header, md the MLD header. */
 	mh->m_next = md;
 
 	/*
 	 * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so
 	 * that ether_output() does not need to allocate another mbuf
 	 * for the header in the most common case.
 	 */
-	MH_ALIGN(mh, sizeof(struct ip6_hdr));
+	M_ALIGN(mh, sizeof(struct ip6_hdr));
 	mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
 	mh->m_len = sizeof(struct ip6_hdr);
 
 	/*
 	 * Fill in the IPv6 header.  The source address is the
 	 * unspecified address when no link-local address was found.
 	 */
 	ip6 = mtod(mh, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
 	ip6->ip6_dst = in6m->in6m_addr;
 
 	/*
 	 * Fill in the MLD header.  The checksum field is zeroed first
 	 * and then overwritten with the value computed by in6_cksum().
 	 */
 	md->m_len = sizeof(struct mld_hdr);
 	mld = mtod(md, struct mld_hdr *);
 	mld->mld_type = type;
 	mld->mld_code = 0;
 	mld->mld_cksum = 0;
 	mld->mld_maxdelay = 0;
 	mld->mld_reserved = 0;
 	mld->mld_addr = in6m->in6m_addr;
 	in6_clearscope(&mld->mld_addr);
 	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
 
 	/* Tag the packet as MLDv1 and record its interface context. */
 	mld_save_context(mh, ifp);
 	mh->m_flags |= M_MLDV1;
 
 	mld_dispatch_packet(mh);
 
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv6 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to the group
  * state, it is now up to MLD to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the MLDv2 state machine at group level. The MLD module
  * however makes the decision as to which MLD protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * If delay is non-zero, and the state change is an initial multicast
  * join, the state change report will be delayed by 'delay' ticks
  * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise
  * the initial MLDv2 state change report will be delayed by whichever
  * is sooner, a pending state-change timer or delay itself.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  */
 int
 mld_change_state(struct in6_multi *inm, const int delay)
 {
 	struct mld_ifinfo *mli;
 	struct ifnet *ifp;
 	int error;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	error = 0;
 
 	/*
 	 * Try to detect if the upper layer just asked us to change state
 	 * for an interface which has now gone away.  Without an ifnet
 	 * there is no per-link MLD state to look up (MLD_IFINFO() would
 	 * dereference a NULL pointer), so bail out before taking the
 	 * MLD lock.
 	 */
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->in6m_ifma->ifma_ifp;
 	if (ifp == NULL) {
 		CTR1(KTR_MLD, "%s: ifp is gone, nothing to do", __func__);
 		return (0);
 	}
 
 	/*
 	 * Sanity check that netinet6's notion of ifp is the
 	 * same as net's.
 	 */
 	KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
 
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp));
 
 	/*
 	 * If we detect a state transition to or from MCAST_UNDEFINED
 	 * for this group, then we are starting or finishing an MLD
 	 * life cycle for this group.
 	 */
 	if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) {
 		CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__,
 		    inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode);
 		if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_MLD, "%s: initial join", __func__);
 			error = mld_initial_join(inm, mli, delay);
 			goto out_locked;
 		} else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_MLD, "%s: final leave", __func__);
 			mld_final_leave(inm, mli);
 			goto out_locked;
 		}
 	} else {
 		CTR1(KTR_MLD, "%s: filter set change", __func__);
 	}
 
 	/*
 	 * Any other change (filter list change, or a mode change that
 	 * does not involve MCAST_UNDEFINED) is an intermediate state
 	 * change.
 	 */
 	error = mld_handle_state_change(inm, mli);
 
 out_locked:
 	MLD_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an MLD group.
  *
  * When joining a group:
  *  If the group should have its MLD traffic suppressed, do nothing.
  *  MLDv1 starts sending MLDv1 host membership reports.
  *  MLDv2 will schedule an MLDv2 state-change report containing the
  *  initial state of the membership.
  *
  * If the delay argument is non-zero, then we must delay sending the
  * initial state change for delay ticks (in units of PR_FASTHZ).
  */
 static int
 mld_initial_join(struct in6_multi *inm, struct mld_ifinfo *mli,
     const int delay)
 {
 	struct ifnet		*ifp;
 	struct ifqueue		*ifq;
 	int			 error, retval, syncstates;
 	int			 odelay;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	/*
 	 * error is the value returned to the caller; syncstates selects
 	 * whether T1 is committed to T0 before returning.
 	 */
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->in6m_ifp;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * enter the MLD_SILENT_MEMBER state and
 	 * are never reported in any protocol exchanges.
 	 * All other groups enter the appropriate state machine
 	 * for the version in use on this link.
 	 * A link marked as MLIF_SILENT causes MLD to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (mli->mli_flags & MLIF_SILENT) ||
 	    !mld_is_addr_reported(&inm->in6m_addr)) {
 		CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		inm->in6m_state = MLD_SILENT_MEMBER;
 		inm->in6m_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (mli->mli_version == MLD_VERSION_2 &&
 		    inm->in6m_state == MLD_LEAVING_MEMBER)
 			in6m_release_locked(inm);
 
 		inm->in6m_state = MLD_REPORTING_MEMBER;
 
 		switch (mli->mli_version) {
 		case MLD_VERSION_1:
 			/*
 			 * If a delay was provided, only use it if
 			 * it is greater than the delay normally
 			 * used for an MLDv1 state change report,
 			 * and delay sending the initial MLDv1 report
 			 * by not transitioning to the IDLE state.
 			 */
 			odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ);
 			if (delay) {
 				inm->in6m_timer = max(delay, odelay);
 				V_current_state_timers_running6 = 1;
 			} else {
 				/*
 				 * No delay requested: send the report now
 				 * and arm the group timer only if the
 				 * transmit succeeded.
 				 */
 				inm->in6m_state = MLD_IDLE_MEMBER;
 				error = mld_v1_transmit_report(inm,
 				     MLD_LISTENER_REPORT);
 				if (error == 0) {
 					inm->in6m_timer = odelay;
 					V_current_state_timers_running6 = 1;
 				}
 			}
 			break;
 
 		case MLD_VERSION_2:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			ifq = &inm->in6m_scq;
 			_IF_DRAIN(ifq);
 			retval = mld_v2_enqueue_group_record(ifq, inm, 1,
 			    0, 0, (mli->mli_flags & MLIF_USEALLOW));
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				/*
 				 * retval is zero or a negated errno;
 				 * flip the sign for our caller.
 				 */
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next mld_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 *
 			 * If a delay was provided to this function, only
 			 * use this delay if sooner than the existing one.
 			 */
 			KASSERT(mli->mli_rv > 1,
 			   ("%s: invalid robustness %d", __func__,
 			    mli->mli_rv));
 			inm->in6m_scrv = mli->mli_rv;
 			if (delay) {
 				if (inm->in6m_sctimer > 1) {
 					inm->in6m_sctimer =
 					    min(inm->in6m_sctimer, delay);
 				} else
 					inm->in6m_sctimer = delay;
 			} else
 				inm->in6m_sctimer = 1;
 			V_state_change_timers_running6 = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the life-cycle.
  */
 static int
 mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli)
 {
 	struct ifnet		*ifp;
 	int			 retval;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	ifp = inm->in6m_ifp;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli && mli->mli_ifp == ifp,
 	    ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * No report is generated for loopback interfaces, silent links,
 	 * unreported addresses, or links not running MLDv2: the T1 state
 	 * is simply committed to T0 and the call succeeds.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (mli->mli_flags & MLIF_SILENT) ||
 	    !mld_is_addr_reported(&inm->in6m_addr) ||
 	    (mli->mli_version != MLD_VERSION_2)) {
 		if (!mld_is_addr_reported(&inm->in6m_addr)) {
 			CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_MLD, "%s: nothing to do", __func__);
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		return (0);
 	}
 
 	/* Discard any state-change records still queued for this group. */
 	_IF_DRAIN(&inm->in6m_scq);
 
 	/*
 	 * Enqueue a fresh state-change record.  A result of zero or
 	 * less means nothing was queued; it is returned with the sign
 	 * flipped, so zero stays zero and a negated errno becomes an
 	 * errno.
 	 */
 	retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0,
 	    (mli->mli_flags & MLIF_USEALLOW));
 	CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval);
 	if (retval <= 0)
 		return (-retval);
 
 	/*
 	 * If record(s) were enqueued, start the state-change
 	 * report timer for this group.
 	 */
 	inm->in6m_scrv = mli->mli_rv;
 	inm->in6m_sctimer = 1;
 	V_state_change_timers_running6 = 1;
 
 	return (0);
 }
 
 /*
  * Perform the final leave for a multicast address.
  *
  * When leaving a group:
  *  MLDv1 sends a DONE message, if and only if we are the reporter.
  *  MLDv2 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 mld_final_leave(struct in6_multi *inm, struct mld_ifinfo *mli)
 {
 	int syncstates;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	/*
 	 * syncstates stays set unless an MLDv2 leave record is queued
 	 * below; when set, T1 is committed to T0 before returning.
 	 */
 	syncstates = 1;
 
 	CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case MLD_REPORTING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		if (mli->mli_version == MLD_VERSION_1) {
 #ifdef INVARIANTS
 			if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
 			    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER)
 			panic("%s: MLDv2 state reached, not MLDv2 mode",
 			     __func__);
 #endif
 			/*
 			 * The return value of the DONE transmit is not
 			 * checked; the group becomes NOT_MEMBER either way.
 			 */
 			mld_v1_transmit_report(inm, MLD_LISTENER_DONE);
 			inm->in6m_state = MLD_NOT_MEMBER;
 			V_current_state_timers_running6 = 1;
 		} else if (mli->mli_version == MLD_VERSION_2) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			_IF_DRAIN(&inm->in6m_scq);
 			inm->in6m_timer = 0;
 			inm->in6m_scrv = mli->mli_rv;
 			CTR4(KTR_MLD, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 			    if_name(inm->in6m_ifp), inm->in6m_scrv);
 			if (inm->in6m_scrv == 0) {
 				/* No retransmissions: leave immediately. */
 				inm->in6m_state = MLD_NOT_MEMBER;
 				inm->in6m_sctimer = 0;
 			} else {
 				int retval;
 
 				/*
 				 * Take an extra reference on the group
 				 * before entering the LEAVING state.
 				 */
 				in6m_acquire_locked(inm);
 
 				retval = mld_v2_enqueue_group_record(
 				    &inm->in6m_scq, inm, 1, 0, 0,
 				    (mli->mli_flags & MLIF_USEALLOW));
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->in6m_state = MLD_LEAVING_MEMBER;
 				inm->in6m_sctimer = 1;
 				V_state_change_timers_running6 = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case MLD_LAZY_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	/*
 	 * Commit T1 to T0, then mark the T1 filter mode as undefined.
 	 */
 	if (syncstates) {
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s",
 		    __func__, &inm->in6m_addr, if_name(inm->in6m_ifp));
 	}
 }
 
 /*
  * Enqueue an MLDv2 group record to the given output queue.
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * If use_block_allow is non-zero, state change reports for initial join
  * and final leave, on an inclusive mode group with a source list, will be
  * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IPv6+ICMP headers to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 mld_v2_enqueue_group_record(struct ifqueue *ifq, struct in6_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query, const int use_block_allow)
 {
 	struct mldv2_record	 mr;
 	struct mldv2_record	*pmr;
 	struct ifnet		*ifp;
 	struct ip6_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	uint8_t			 mode;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	/*
 	 * nbytes accumulates the return value: the number of bytes
 	 * (record headers plus source addresses) appended to the queue.
 	 * Record headers are counted as they are appended; source
 	 * addresses are counted once per packet, after the packet's
 	 * source loop has finished.
 	 */
 	ifp = inm->in6m_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;
 	nims = NULL;
 	record_has_sources = 1;
 	pmr = NULL;
 	type = MLD_DO_NOTHING;
 	mode = inm->in6m_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 &&
 	    inm->in6m_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 *
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources. A transition to/from this state is
 		 * considered inclusive with some special treatment.
 		 *
 		 * If we are rewriting initial joins/leaves to use
 		 * ALLOW/BLOCK, and the group's membership is inclusive,
 		 * we need to send sources in all cases.
 		 */
 		if (mode != inm->in6m_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_MLD, "%s: change to EXCLUDE",
 				    __func__);
 				type = MLD_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_MLD, "%s: change to INCLUDE",
 				    __func__);
 				if (use_block_allow) {
 					/*
 					 * XXX
 					 * Here we're interested in state
 					 * edges either direction between
 					 * MCAST_UNDEFINED and MCAST_INCLUDE.
 					 * Perhaps we should just check
 					 * the group state, rather than
 					 * the filter mode.
 					 */
 					if (mode == MCAST_UNDEFINED) {
 						type = MLD_BLOCK_OLD_SOURCES;
 					} else {
 						type = MLD_ALLOW_NEW_SOURCES;
 					}
 				} else {
 					type = MLD_CHANGE_TO_INCLUDE_MODE;
 					if (mode == MCAST_UNDEFINED)
 						record_has_sources = 0;
 				}
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = MLD_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = MLD_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = MLD_MODE_IS_INCLUDE;
 			KASSERT(inm->in6m_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->in6m_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (mld_v2_enqueue_filter_change(ifq, inm));
 
 	if (type == MLD_DO_NOTHING) {
 		CTR3(KTR_MLD, "%s: nothing to do for %s/%s",
 		    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct mldv2_record);
 	if (record_has_sources)
 		minrec0len += sizeof(struct in6_addr);
 
 	CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__,
 	    mld_rec_type_to_str(type),
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    if_name(inm->in6m_ifp));
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP6+RA+ICMPV6+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = ifq->ifq_tail;
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - MLD_MTUSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct mldv2_record)) /
 			    sizeof(struct in6_addr);
 		m = m0;
 		CTR1(KTR_MLD, "%s: use existing packet", __func__);
 	} else {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_MLD, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 		    sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
 		if (!is_state_change && !is_group_query)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (-ENOMEM);
 
 		mld_save_context(m, ifp);
 
 		CTR1(KTR_MLD, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	mr.mr_type = type;
 	mr.mr_datalen = 0;
 	mr.mr_numsrc = 0;
 	mr.mr_addr = inm->in6m_addr;
 	in6_clearscope(&mr.mr_addr);
 	if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct mldv2_record);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 *
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, and
 	 * use_block_allow is zero, do not include source entries.
 	 * Otherwise, we need to include this source in the report.
 	 *
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		/*
 		 * At this point nbytes is exactly the size of the record
 		 * header just appended, which is what the tail-relative
 		 * offset below relies on.
 		 */
 		if (m == m0) {
 			md = m_last(m);
 			pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs,
 		    nims) {
 			CTR2(KTR_MLD, "%s: visit node %s", __func__,
 			    ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 			now = im6s_get_mode(inm, ims, 1);
 			CTR2(KTR_MLD, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (!use_block_allow && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_MLD, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->im6s_stp == 0) {
 				CTR1(KTR_MLD, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_MLD, "%s: append node", __func__);
 			if (!m_append(m, sizeof(struct in6_addr),
 			    (void *)&ims->im6s_addr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			/*
 			 * The source bytes are accounted for once, after
 			 * the loop; counting them here as well would
 			 * report each first-packet source twice.
 			 */
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pmr->mr_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(struct in6_addr));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_MLD, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_MLD, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		_IF_ENQUEUE(ifq, m);
 	} else
 		m->m_pkthdr.PH_vt.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_MLD, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (-ENOMEM);
 		mld_save_context(m, ifp);
 		md = m_getptr(m, 0, &off);
 		pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_MLD, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		nbytes += sizeof(struct mldv2_record);
 
 		m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 		    sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
 			CTR2(KTR_MLD, "%s: visit node %s",
 			    __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 			now = im6s_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (!use_block_allow && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_MLD, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->im6s_stp == 0) {
 				CTR1(KTR_MLD, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_MLD, "%s: append node", __func__);
 			if (!m_append(m, sizeof(struct in6_addr),
 			    (void *)&ims->im6s_addr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pmr->mr_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(struct in6_addr));
 
 		CTR1(KTR_MLD, "%s: enqueueing next packet", __func__);
 		_IF_ENQUEUE(ifq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip_msource node.
  */
 typedef enum {
 	REC_NONE = 0,				/* MCAST_UNDEFINED */
 	REC_ALLOW = 1 << 0,			/* MCAST_INCLUDE */
 	REC_BLOCK = 1 << 1,			/* MCAST_EXCLUDE */
 	REC_FULL = REC_ALLOW | REC_BLOCK	/* both record types */
 } rectype_t;
 
 /*
  * Enqueue an MLDv2 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 mld_v2_enqueue_filter_change(struct ifqueue *ifq, struct in6_multi *inm)
 {
 	static const int MINRECLEN =
 	    sizeof(struct mldv2_record) + sizeof(struct in6_addr);
 	struct ifnet		*ifp;
 	struct mldv2_record	 mr;
 	struct mldv2_record	*pmr;
 	struct ip6_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	int			 nallow, nblock;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	/*
 	 * Nothing to report if the group has no sources, or if it has
 	 * ASM listeners at both t0 and t1.
 	 */
 	if (inm->in6m_nsrc == 0 ||
 	    (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->in6m_ifp;			/* interface */
 	mode = inm->in6m_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	npbytes = 0;	/* # of bytes appended this packet */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		do {
 			/*
 			 * Reuse the packet at the tail of the queue if it
 			 * has room for another record with at least one
 			 * source; otherwise allocate a fresh packet.
 			 */
 			m0 = ifq->ifq_tail;
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
 			     MLD_V2_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - MLD_MTUSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct mldv2_record)) /
 					    sizeof(struct in6_addr);
 				CTR1(KTR_MLD,
 				    "%s: use previous packet", __func__);
 			} else {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m == NULL)
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 				if (m == NULL) {
 					CTR1(KTR_MLD,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.PH_vt.vt_nrecs = 0;
 				mld_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 				    sizeof(struct mldv2_record)) /
 				    sizeof(struct in6_addr);
 				/*
 				 * NOTE(review): npbytes is reset only on
 				 * this path.  When the tail packet is
 				 * reused it keeps the count from the
 				 * previous iteration, which is then added
 				 * to nbytes again below -- confirm callers
 				 * only use the sign of the return value.
 				 */
 				npbytes = 0;
 				CTR1(KTR_MLD,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the MLD group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&mr, 0, sizeof(mr));
 			mr.mr_addr = inm->in6m_addr;
 			in6_clearscope(&mr.mr_addr);
 			if (!m_append(m, sizeof(mr), (void *)&mr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct mldv2_record);
 			if (m != m0) {
 				/* new packet; offset in chain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct mldv2_record), &off);
 				pmr = (struct mldv2_record *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pmr = (struct mldv2_record *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct mldv2_record));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL) {
 				nims = RB_MIN(ip6_msource_tree,
 				    &inm->in6m_srcs);
 			}
 			RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
 				CTR2(KTR_MLD, "%s: visit node %s", __func__,
 				    ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 				now = im6s_get_mode(inm, ims, 1);
 				then = im6s_get_mode(inm, ims, 0);
 				CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_MLD,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_MLD,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				/*
 				 * Map the node's t1 mode onto a record
 				 * type; an undefined node takes the
 				 * inverse of the group's filter mode.
 				 */
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				/*
 				 * The very first changed node picks the
 				 * record type for this pass; nodes of the
 				 * other type are left for the next pass.
 				 */
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				if (!m_append(m, sizeof(struct in6_addr),
 				    (void *)&ims->im6s_addr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_MLD,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct mldv2_record);
 				if (m != m0) {
 					CTR1(KTR_MLD,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					CTR1(KTR_MLD,
 					    "%s: m_adj(m, -mr)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct mldv2_record)));
 				}
 				/*
 				 * In a do/while, continue jumps to the
 				 * loop condition below.
 				 */
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(struct in6_addr));
 			if (crt == REC_ALLOW)
 				pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
 			pmr->mr_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.PH_vt.vt_nrecs++;
 			if (m != m0)
 				_IF_ENQUEUE(ifq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		/*
 		 * Mark the current record type as completed and switch
 		 * to the other type for the next pass.
 		 */
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 /*
  * Merge the state-change reports pending on a group into the
  * state-change queue that is about to be transmitted.
  *
  * inm:   group whose in6m_scq holds the pending state-change packets.
  * ifscq: destination queue; each pending packet is either concatenated
  *        onto the packet at the tail of this queue, or appended to it
  *        as a new packet.
  *
  * While inm->in6m_scrv is non-zero the pending packets stay on the
  * group queue and duplicates are merged instead; otherwise the pending
  * packets themselves are moved off the group queue.
  *
  * Returns 0 on success, or ENOMEM if a packet could not be duplicated.
  * Packets handled before such a failure remain merged.
  */
 static int
 mld_v2_merge_state_changes(struct in6_multi *inm, struct ifqueue *ifscq)
 {
 	struct ifqueue	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mn;		/* packet following m on gq */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->in6m_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->in6m_scq;
 #ifdef KTR
 	if (gq->ifq_head == NULL) {
 		CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = gq->ifq_head;
 	while (m != NULL) {
 		/*
 		 * Latch the successor before m is unlinked or freed:
 		 * _IF_DEQUEUE() clears m_nextpkt on the packet it removes,
 		 * so reading the link afterwards would always yield NULL
 		 * and end the walk after the first packet.
 		 */
 		mn = m->m_nextpkt;
 
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an MLDv2 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatenation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = ifscq->ifq_tail;
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
 			    m->m_pkthdr.PH_vt.vt_nrecs <=
 			    MLD_V2_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
 				domerge = 1;
 		}
 
 		/*
 		 * NOTE(review): the fullness test is applied to the group
 		 * queue although the trace text speaks of the outbound
 		 * queue; left as is, confirm which queue was intended.
 		 */
 		if (!domerge && _IF_QFULL(gq)) {
 			CTR2(KTR_MLD,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			if (!docopy) {
 				/*
 				 * When not copying, m is always the head of
 				 * gq.  Unlink it before freeing so that the
 				 * queue never references freed memory.
 				 */
 				_IF_DEQUEUE(gq, m0);
 				m_freem(m0);
 			}
 			m = mn;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m);
 			_IF_DEQUEUE(gq, m0);
 		} else {
 			CTR2(KTR_MLD, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 		}
 		m = mn;
 
 		if (!domerge) {
 			CTR3(KTR_MLD, "%s: queueing %p to ifscq %p)",
 			    __func__, m0, ifscq);
 			_IF_ENQUEUE(ifscq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)",
 			    __func__, m0, mt);
 
 			/*
 			 * Splice m0 behind the tail packet; the tail's
 			 * packet header absorbs m0's length and record
 			 * count, and m0 stops being a packet head.
 			 */
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.PH_vt.vt_nrecs +=
 			    m0->m_pkthdr.PH_vt.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Answer a pending MLDv2 General Query on the link described by mli.
  *
  * Every IPv6 group joined on the interface that is in an active
  * membership state is moved to MLD_REPORTING_MEMBER and has a group
  * record queued on mli->mli_gq.  One burst of the queue is then
  * dispatched; if packets remain, the v2 interface timer is re-armed so
  * the rest go out later.
  */
 static void
 mld_v2_dispatch_general_query(struct mld_ifinfo *mli)
 {
 	struct ifnet		*ifp;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 retval;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli->mli_version == MLD_VERSION_2,
 	    ("%s: called when version %d", __func__, mli->mli_version));
 
 	ifp = mli->mli_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/* Only IPv6 memberships carrying MLD state are of interest. */
 		if (ifma->ifma_addr->sa_family != AF_INET6)
 			continue;
 		if (ifma->ifma_protospec == NULL)
 			continue;
 
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 		KASSERT(ifp == inm->in6m_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->in6m_state) {
 		case MLD_REPORTING_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_LAZY_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			/* Active member: report current state. */
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			retval = mld_v2_enqueue_group_record(&mli->mli_gq,
 			    inm, 0, 0, 0, 0);
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 		case MLD_LEAVING_MEMBER:
 			/* Nothing is queued for these states. */
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST);
 
 	/* Queue fully drained: no follow-up burst to schedule. */
 	if (mli->mli_gq.ifq_head == NULL)
 		return;
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY(
 	    MLD_RESPONSE_BURST_INTERVAL);
 	V_interface_timers_running6 = 1;
 }
 
 /*
  * Transmit one pending MLD message.
  *
  * The mbuf carries the originating interface index (and VNET context),
  * which is recovered first.  MLDv1 messages (flagged M_MLDV1) are sent
  * as they are; anything else is wrapped by mld_v2_encap_report() before
  * being passed to ip6_output().  Output statistics are bumped only when
  * ip6_output() reports success.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as MLD traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 mld_dispatch_packet(struct mbuf *m)
 {
 	struct ip6_moptions	 im6o;
 	struct ifnet		*ifp;
 	struct ifnet		*oifp;
 	struct mbuf		*m0;
 	struct mbuf		*md;
 	struct ip6_hdr		*ip6;
 	struct mld_hdr		*mld;
 	int			 error;
 	int			 off;
 	int			 type;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_MLD, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	ifindex = mld_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IP6STAT_INC(ip6s_noroute);
 		return;
 	}
 
 	im6o.im6o_multicast_hlim  = 1;
 	im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL);
 	im6o.im6o_multicast_ifp = ifp;
 
 	/* MLDv1 goes out untouched; everything else needs encapsulation. */
 	m0 = m;
 	if ((m->m_flags & M_MLDV1) == 0) {
 		m0 = mld_v2_encap_report(ifp, m);
 		if (m0 == NULL) {
 			CTR2(KTR_MLD, "%s: dropped %p", __func__, m);
 			IP6STAT_INC(ip6s_odropped);
 			return;
 		}
 	}
 
 	mld_scrub_context(m0);
 	m_clrprotoflags(m);
 	m0->m_pkthdr.rcvif = V_loif;
 
 	ip6 = mtod(m0, struct ip6_hdr *);
 #if 0
 	(void)in6_setscope(&ip6->ip6_dst, ifp, NULL);	/* XXX LOR */
 #else
 	/*
 	 * XXX XXX Break some KPI rules to prevent an LOR which would
 	 * occur if we called in6_setscope() at transmission.
 	 * See comments at top of file.
 	 */
 	MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index);
 #endif
 
 	/*
 	 * Retrieve the ICMPv6 type before handoff to ip6_output(),
 	 * so we can bump the stats.
 	 */
 	md = m_getptr(m0, sizeof(struct ip6_hdr), &off);
 	mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off);
 	type = mld->mld_type;
 
 	error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o,
 	    &oifp, NULL);
 	if (error) {
 		CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error);
 		return;
 	}
 
 	ICMP6STAT_INC(icp6s_outhist[type]);
 	if (oifp == NULL)
 		return;
 
 	/* Per-interface counters, keyed on the message type just sent. */
 	icmp6_ifstat_inc(oifp, ifs6_out_msg);
 	if (type == MLD_LISTENER_REPORT || type == MLDV2_LISTENER_REPORT)
 		icmp6_ifstat_inc(oifp, ifs6_out_mldreport);
 	else if (type == MLD_LISTENER_DONE)
 		icmp6_ifstat_inc(oifp, ifs6_out_mlddone);
 }
 
 /*
  * Encapsulate an MLDv2 report.
  *
  * KAME IPv6 requires that hop-by-hop options be passed separately,
  * and that the IPv6 header be prepended in a separate mbuf.
  *
  * ifp: interface whose link-local address (if any) becomes the IPv6
  *      source; the unspecified address is used when none is found.
  * m:   packet-header mbuf chain holding the group records; its
  *      vt_nrecs count is copied into the report header and then zeroed.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.  On failure m has been freed; on success m is
  * linked behind the newly allocated header mbuf.
  */
 static struct mbuf *
 mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct mbuf		*mh;
 	struct mldv2_report	*mld;
 	struct ip6_hdr		*ip6;
 	struct in6_ifaddr	*ia;
 	int			 mldreclen;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	/*
 	 * RFC3590: OK to send as :: or tentative during DAD.
 	 */
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	if (ia == NULL)
 		CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__);
 
 	/* Separate mbuf for the IPv6 header plus the fixed report header. */
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		/* Drop the address reference and the caller's records. */
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		m_freem(m);
 		return (NULL);
 	}
-	MH_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report));
+	M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report));
 
 	/* Total length of the group records already present in m. */
 	mldreclen = m_length(m, NULL);
 	CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen);
 
 	mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report);
 	mh->m_pkthdr.len = sizeof(struct ip6_hdr) +
 	    sizeof(struct mldv2_report) + mldreclen;
 
 	ip6 = mtod(mh, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
 	/* The source address has been copied; the reference can go. */
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	ip6->ip6_dst = in6addr_linklocal_allv2routers;
 	/* scope ID will be set in netisr */
 
 	/* The report header sits immediately after the IPv6 header. */
 	mld = (struct mldv2_report *)(ip6 + 1);
 	mld->mld_type = MLDV2_LISTENER_REPORT;
 	mld->mld_code = 0;
 	mld->mld_cksum = 0;
 	mld->mld_v2_reserved = 0;
 	mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs);
 	m->m_pkthdr.PH_vt.vt_nrecs = 0;
 
 	/*
 	 * Chain the records behind the header before checksumming, so the
 	 * checksum covers the report header and every group record.
 	 */
 	mh->m_next = m;
 	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen);
 	return (mh);
 }
 
 #ifdef KTR
 /*
  * Translate an MLDv2 group record type into a short constant name for
  * trace output.  Values that are not recognized map to "unknown".
  */
 static char *
 mld_rec_type_to_str(const int type)
 {
 	char *name;
 
 	name = "unknown";
 	switch (type) {
 	case MLD_MODE_IS_INCLUDE:
 		name = "MODE_IN";
 		break;
 	case MLD_MODE_IS_EXCLUDE:
 		name = "MODE_EX";
 		break;
 	case MLD_CHANGE_TO_INCLUDE_MODE:
 		name = "TO_IN";
 		break;
 	case MLD_CHANGE_TO_EXCLUDE_MODE:
 		name = "TO_EX";
 		break;
 	case MLD_ALLOW_NEW_SOURCES:
 		name = "ALLOW_NEW";
 		break;
 	case MLD_BLOCK_OLD_SOURCES:
 		name = "BLOCK_OLD";
 		break;
 	default:
 		break;
 	}
 	return (name);
 }
 #endif
 
 /*
  * Global MLD setup: create the MLD lock and prepare the packet options
  * template (mld_po) that is handed to ip6_output() for MLD messages.
  */
 static void
 mld_init(void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: initializing", __func__);
 	MLD_LOCK_INIT();
 
 	/* Start from default options, then apply the MLD overrides. */
 	ip6_initpktopts(&mld_po);
 	mld_po.ip6po_flags = IP6PO_DONTFRAG;
 	mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
 	mld_po.ip6po_hbh = &mld_ra.hbh;
 	mld_po.ip6po_hlim = 1;
 }
 SYSINIT(mld_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_init, NULL);
 
 /*
  * Global MLD teardown: destroys the MLD lock.  Registered with
  * SYSUNINIT at the same subsystem/order as mld_init().
  */
 static void
 mld_uninit(void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: tearing down", __func__);
 	MLD_LOCK_DESTROY();
 }
 SYSUNINIT(mld_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_uninit, NULL);
 
 /*
  * Per-VNET MLD setup: initializes the list head V_mli_head to empty.
  */
 static void
 vnet_mld_init(const void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: initializing", __func__);
 
 	LIST_INIT(&V_mli_head);
 }
 VNET_SYSINIT(vnet_mld_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_init,
     NULL);
 
 /*
  * Per-VNET MLD teardown.  Nothing is released here; the function only
  * asserts that V_mli_head is already empty.
  */
 static void
 vnet_mld_uninit(const void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: tearing down", __func__);
 
 	KASSERT(LIST_EMPTY(&V_mli_head),
 	    ("%s: mli list not empty; ifnets not detached?", __func__));
 }
 VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_uninit,
     NULL);
 
 /*
  * Module event handler.  Load and unload need no work and succeed;
  * every other event type is rejected with EOPNOTSUPP.
  */
 static int
 mld_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module descriptor for "mld": names the module and routes its events
  * to mld_modevent(); the third member is left as 0.
  */
 static moduledata_t mld_mod = {
     "mld",
     mld_modevent,
     0
 };
 DECLARE_MODULE(mld, mld_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet6/nd6_nbr.c
===================================================================
--- head/sys/netinet6/nd6_nbr.c	(revision 276691)
+++ head/sys/netinet6/nd6_nbr.c	(revision 276692)
@@ -1,1566 +1,1566 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
 #include <sys/callout.h>
 #include <sys/refcount.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #define	L3_ADDR_SIN6(le)	((struct sockaddr_in6 *) L3_ADDR(le))
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_carp.h>
 #include <netinet6/send.h>
 
 /* Reinterpret a pointer as a pointer to struct sockaddr_dl. */
 #define SDL(s) ((struct sockaddr_dl *)s)
 
 /*
  * Forward declarations for the duplicate address detection (DAD)
  * helpers and the FIB-aware neighbor advertisement sender; struct dadq
  * is only named here, not defined.
  */
 struct dadq;
 static struct dadq *nd6_dad_find(struct ifaddr *);
 static void nd6_dad_add(struct dadq *dp);
 static void nd6_dad_del(struct dadq *dp);
 static void nd6_dad_rele(struct dadq *);
 static void nd6_dad_starttimer(struct dadq *, int);
 static void nd6_dad_stoptimer(struct dadq *);
 static void nd6_dad_timer(struct dadq *);
 static void nd6_dad_duplicated(struct ifaddr *, struct dadq *);
 static void nd6_dad_ns_output(struct dadq *, struct ifaddr *);
 static void nd6_dad_ns_input(struct ifaddr *);
 static void nd6_dad_na_input(struct ifaddr *);
 static void nd6_na_output_fib(struct ifnet *, const struct in6_addr *,
     const struct in6_addr *, u_long, int, struct sockaddr *, u_int);
 
 /* Per-VNET DAD tunables and their V_ accessor macros. */
 static VNET_DEFINE(int, dad_ignore_ns) = 0;	/* ignore NS in DAD
 						   - specwise incorrect */
 static VNET_DEFINE(int, dad_maxtry) = 15;	/* max # of *tries* to
 						   transmit DAD packet */
 #define	V_dad_ignore_ns			VNET(dad_ignore_ns)
 #define	V_dad_maxtry			VNET(dad_maxtry)
 
 /*
  * Input a Neighbor Solicitation Message.
  *
  * Based on RFC 2461
  * Based on RFC 2462 (duplicate address detection)
  *
  * m:        received packet; m_pkthdr.rcvif is the receiving interface.
  * off:      offset of the neighbor solicitation within the packet.
  * icmp6len: length of the ICMPv6 message starting at off.
  *
  * The packet is validated, the target is matched against local, CARP
  * or proxied addresses, and a Neighbor Advertisement is sent when
  * appropriate.  m is freed on both the "freeit" and "bad" exits; the
  * early exits taken inside the IP6_EXTHDR_* macros are not visible
  * here.
  */
 void
 nd6_ns_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct nd_neighbor_solicit *nd_ns;
 	struct in6_addr saddr6 = ip6->ip6_src;
 	struct in6_addr daddr6 = ip6->ip6_dst;
 	struct in6_addr taddr6;
 	struct in6_addr myaddr6;
 	char *lladdr = NULL;
 	struct ifaddr *ifa = NULL;
 	int lladdrlen = 0;
 	int anycast = 0, proxy = 0, tentative = 0;
 	int tlladdr;
 	int rflag;
 	union nd_opts ndopts;
 	struct sockaddr_dl proxydl;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 	/*
 	 * Router flag for any NA we send: set when forwarding is enabled,
 	 * but cleared again on an RA-accepting interface when
 	 * V_ip6_norbit_raif is set.
 	 */
 	rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0;
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif)
 		rflag = 0;
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, icmp6len,);
 	nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len);
 	if (nd_ns == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return;
 	}
 #endif
 	ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */
 	taddr6 = nd_ns->nd_ns_target;
 	if (in6_setscope(&taddr6, ifp, NULL) != 0)
 		goto bad;
 
 	/* A hop limit other than 255 is rejected. */
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n",
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
 		/* dst has to be a solicited node multicast address. */
 		if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL &&
 		    /* don't check ifindex portion */
 		    daddr6.s6_addr32[1] == 0 &&
 		    daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE &&
 		    daddr6.s6_addr8[12] == 0xff) {
 			; /* good */
 		} else {
 			nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
 			    "(wrong ip6 dst)\n"));
 			goto bad;
 		}
 	} else if (!V_nd6_onlink_ns_rfc4861) {
 		struct sockaddr_in6 src_sa6;
 
 		/*
 		 * According to recent IETF discussions, it is not a good idea
 		 * to accept a NS from an address which would not be deemed
 		 * to be a neighbor otherwise.  This point is expected to be
 		 * clarified in future revisions of the specification.
 		 */
 		bzero(&src_sa6, sizeof(src_sa6));
 		src_sa6.sin6_family = AF_INET6;
 		src_sa6.sin6_len = sizeof(src_sa6);
 		src_sa6.sin6_addr = saddr6;
 		if (nd6_is_addr_neighbor(&src_sa6, ifp) == 0) {
 			nd6log((LOG_INFO, "nd6_ns_input: "
 				"NS packet from non-neighbor\n"));
 			goto bad;
 		}
 	}
 
 	if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
 		nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n"));
 		goto bad;
 	}
 
 	/* Whatever follows the fixed NS header is parsed as ND options. */
 	icmp6len -= sizeof(*nd_ns);
 	nd6_option_init(nd_ns + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "nd6_ns_input: invalid ND option, ignored\n"));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	if (ndopts.nd_opts_src_lladdr) {
 		/* Option payload follows its header; length is in 8-byte units. */
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) {
 		nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
 		    "(link-layer address option)\n"));
 		goto bad;
 	}
 
 	/*
 	 * Attaching target link-layer address to the NA?
 	 * (RFC 2461 7.2.4)
 	 *
 	 * NS IP dst is unicast/anycast			MUST NOT add
 	 * NS IP dst is solicited-node multicast	MUST add
 	 *
 	 * In implementation, we add target link-layer address by default.
 	 * We do not add one in MUST NOT cases.
 	 */
 	if (!IN6_IS_ADDR_MULTICAST(&daddr6))
 		tlladdr = 0;
 	else
 		tlladdr = 1;
 
 	/*
 	 * Target address (taddr6) must be either:
 	 * (1) Valid unicast/anycast address for my receiving interface,
 	 * (2) Unicast address for which I'm offering proxy service, or
 	 * (3) "tentative" address on which DAD is being performed.
 	 */
 	/* (1) and (3) check. */
 	if (ifp->if_carp)
 		ifa = (*carp_iamatch6_p)(ifp, &taddr6);
 	else
 		ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
 
 	/* (2) check. */
 	if (ifa == NULL) {
 		struct route_in6 ro;
 		int need_proxy;
 
 		bzero(&ro, sizeof(ro));
 		ro.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
 		ro.ro_dst.sin6_family = AF_INET6;
 		ro.ro_dst.sin6_addr = taddr6;
 
 		/* Always use the default FIB. */
 #ifdef RADIX_MPATH
 		rtalloc_mpath_fib((struct route *)&ro, ntohl(taddr6.s6_addr32[3]),
 		    RT_DEFAULT_FIB);
 #else
 		in6_rtalloc(&ro, RT_DEFAULT_FIB);
 #endif
 		/*
 		 * Proxy only for a route flagged RTF_ANNOUNCE whose gateway
 		 * is a link-layer address.
 		 */
 		need_proxy = (ro.ro_rt &&
 		    (ro.ro_rt->rt_flags & RTF_ANNOUNCE) != 0 &&
 		    ro.ro_rt->rt_gateway->sa_family == AF_LINK);
 		if (ro.ro_rt != NULL) {
 			/* Copy the gateway out before the route is released. */
 			if (need_proxy)
 				proxydl = *SDL(ro.ro_rt->rt_gateway);
 			RTFREE(ro.ro_rt);
 		}
 		if (need_proxy) {
 			/*
 			 * proxy NDP for single entry
 			 */
 			ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
 				IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 			if (ifa)
 				proxy = 1;
 		}
 	}
 	if (ifa == NULL) {
 		/*
 		 * We've got an NS packet, and we don't have that address
 		 * assigned for us.  We MUST silently ignore it.
 		 * See RFC2461 7.2.3.
 		 */
 		goto freeit;
 	}
 	myaddr6 = *IFA_IN6(ifa);
 	anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST;
 	tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE;
 	if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED)
 		goto freeit;
 
 	/*
 	 * The option length must equal the interface address length plus
 	 * the 2-byte option header, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s "
 		    "(if %d, NS packet %d)\n",
 		    ip6_sprintf(ip6bufs, &taddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) {
 		nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n",
 		    ip6_sprintf(ip6bufs, &saddr6)));
 		goto freeit;
 	}
 
 	/*
 	 * We have neighbor solicitation packet, with target address equals to
 	 * one of my tentative address.
 	 *
 	 * src addr	how to process?
 	 * ---		---
 	 * multicast	of course, invalid (rejected in ip6_input)
 	 * unicast	somebody is doing address resolution -> ignore
 	 * unspec	dup address detection
 	 *
 	 * The processing is defined in RFC 2462.
 	 */
 	if (tentative) {
 		/*
 		 * If source address is unspecified address, it is for
 		 * duplicate address detection.
 		 *
 		 * If not, the packet is for address resolution;
 		 * silently ignore it.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
 			nd6_dad_ns_input(ifa);
 
 		goto freeit;
 	}
 
 	/*
 	 * If the source address is unspecified address, entries must not
 	 * be created or updated.
 	 * It looks that sender is performing DAD.  Output NA toward
 	 * all-node multicast address, to tell the sender that I'm using
 	 * the address.
 	 * S bit ("solicited") must be zero.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
 		struct in6_addr in6_all;
 
 		in6_all = in6addr_linklocal_allnodes;
 		if (in6_setscope(&in6_all, ifp, NULL) != 0)
 			goto bad;
 		nd6_na_output_fib(ifp, &in6_all, &taddr6,
 		    ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
 		    rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL,
 		    M_GETFIB(m));
 		goto freeit;
 	}
 
 	/* Ordinary solicitation: record the sender, then answer it directly. */
 	nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
 	    ND_NEIGHBOR_SOLICIT, 0);
 
 	nd6_na_output_fib(ifp, &saddr6, &taddr6,
 	    ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
 	    rflag | ND_NA_FLAG_SOLICITED, tlladdr,
 	    proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m));
 	/* Quiet exit: drop the address reference (if any) and the packet. */
  freeit:
 	if (ifa != NULL)
 		ifa_free(ifa);
 	m_freem(m);
 	return;
 
 	/* Error exit: log the addresses, count a bad NS, then clean up. */
  bad:
 	nd6log((LOG_ERR, "nd6_ns_input: src=%s\n",
 		ip6_sprintf(ip6bufs, &saddr6)));
 	nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n",
 		ip6_sprintf(ip6bufs, &daddr6)));
 	nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n",
 		ip6_sprintf(ip6bufs, &taddr6)));
 	ICMP6STAT_INC(icp6s_badns);
 	if (ifa != NULL)
 		ifa_free(ifa);
 	m_freem(m);
 }
 
 /*
  * Output a Neighbor Solicitation Message. Caller specifies:
  *	- ICMP6 header source IP6 address
  *	- ND6 header target IP6 address
  *	- ND6 header source datalink address
  *
  * Based on RFC 2461
  * Based on RFC 2462 (duplicate address detection)
  *
  *    ifp - interface the solicitation is sent on
  * daddr6 - IPv6 destination; when NULL the solicited-node multicast
  *          address is built from the low 32 bits of taddr6
  * taddr6 - target address; nothing is sent if it is multicast
  *   ln - for source address determination
  *  dad - duplicate address detection
  *
  * Failures (no mbuf, oversized message, scope or source selection
  * error, tag allocation failure) are silent: the function just returns.
  */
 void
 nd6_ns_output(struct ifnet *ifp, const struct in6_addr *daddr6, 
     const struct in6_addr *taddr6, struct llentry *ln, int dad)
 {
 	struct mbuf *m;
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;
 	struct nd_neighbor_solicit *nd_ns;
 	struct ip6_moptions im6o;
 	int icmp6len;
 	int maxlen;
 	caddr_t mac;
 	struct route_in6 ro;
 
 	if (IN6_IS_ADDR_MULTICAST(taddr6))
 		return;
 
 	/* estimate the size of message */
 	maxlen = sizeof(*ip6) + sizeof(*nd_ns);
 	maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
 	if (max_linkhdr + maxlen >= MCLBYTES) {
 #ifdef DIAGNOSTIC
 		printf("nd6_ns_output: max_linkhdr + maxlen >= MCLBYTES "
 		    "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES);
 #endif
 		return;
 	}
 
 	/* Use a cluster only when the message will not fit a header mbuf. */
 	if (max_linkhdr + maxlen > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return;
 
 	bzero(&ro, sizeof(ro));
 
 	/* im6o is filled in only for a multicast (or defaulted) destination. */
 	if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) {
 		m->m_flags |= M_MCAST;
 		im6o.im6o_multicast_ifp = ifp;
 		im6o.im6o_multicast_hlim = 255;
 		im6o.im6o_multicast_loop = 0;
 	}
 
 	icmp6len = sizeof(*nd_ns);
 	m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len;
-	m->m_data += max_linkhdr;	/* or MH_ALIGN() equivalent? */
+	m->m_data += max_linkhdr;	/* or M_ALIGN() equivalent? */
 
 	/* fill neighbor solicitation packet */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	/* ip6->ip6_plen will be set later */
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	if (daddr6)
 		ip6->ip6_dst = *daddr6;
 	else {
 		/* Build the solicited-node multicast address for taddr6. */
 		ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
 		ip6->ip6_dst.s6_addr16[1] = 0;
 		ip6->ip6_dst.s6_addr32[1] = 0;
 		ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE;
 		ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3];
 		ip6->ip6_dst.s6_addr8[12] = 0xff;
 		if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
 			goto bad;
 	}
 	if (!dad) {
 		struct ifaddr *ifa;
 
 		/*
 		 * RFC2461 7.2.2:
 		 * "If the source address of the packet prompting the
 		 * solicitation is the same as one of the addresses assigned
 		 * to the outgoing interface, that address SHOULD be placed
 		 * in the IP Source Address of the outgoing solicitation.
 		 * Otherwise, any one of the addresses assigned to the
 		 * interface should be used."
 		 *
 		 * We use the source address for the prompting packet
 		 * (saddr6), if:
 		 * - saddr6 is given from the caller (by giving "ln"), and
 		 * - saddr6 belongs to the outgoing interface.
 		 * Otherwise, we perform the source address selection as usual.
 		 */
 		struct in6_addr *hsrc;
 
 		hsrc = NULL;
 		if (ln != NULL) {
 			LLE_RLOCK(ln);
 			if (ln->la_hold != NULL) {
 				struct ip6_hdr *hip6;		/* hold ip6 */
 
 				/*
 				 * assuming every packet in la_hold has the same IP
 				 * header
 				 */
 				hip6 = mtod(ln->la_hold, struct ip6_hdr *);
 				/* XXX pullup? */
 				if (sizeof(*hip6) < ln->la_hold->m_len) {
 					ip6->ip6_src = hip6->ip6_src;
 					hsrc = &hip6->ip6_src;
 				}
 			}
 			LLE_RUNLOCK(ln);
 		}
 		/*
 		 * NOTE(review): hsrc still points into the held packet after
 		 * the entry lock has been dropped; confirm that packet
 		 * cannot be freed before the lookup below.
 		 */
 		if (hsrc && (ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp,
 		    hsrc)) != NULL) {
 			/* ip6_src set already. */
 			ifa_free(ifa);
 		} else {
 			int error;
 			struct sockaddr_in6 dst_sa;
 			struct in6_addr src_in;
 			struct ifnet *oifp;
 
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 
 			oifp = ifp;
 			error = in6_selectsrc(&dst_sa, NULL,
 			    NULL, &ro, NULL, &oifp, &src_in);
 			if (error) {
 				char ip6buf[INET6_ADDRSTRLEN];
 				nd6log((LOG_DEBUG,
 				    "nd6_ns_output: source can't be "
 				    "determined: dst=%s, error=%d\n",
 				    ip6_sprintf(ip6buf, &dst_sa.sin6_addr),
 				    error));
 				goto bad;
 			}
 			ip6->ip6_src = src_in;
 		}
 	} else {
 		/*
 		 * Source address for DAD packet must always be IPv6
 		 * unspecified address. (0::0)
 		 * We actually don't have to 0-clear the address (we did it
 		 * above), but we do so here explicitly to make the intention
 		 * clearer.
 		 *
 		 * NOTE(review): no earlier zeroing of ip6_src is visible in
 		 * this function, so this bzero() appears to be required;
 		 * confirm before removing it.
 		 */
 		bzero(&ip6->ip6_src, sizeof(ip6->ip6_src));
 	}
 	/* The NS header sits immediately after the IPv6 header. */
 	nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1);
 	nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
 	nd_ns->nd_ns_code = 0;
 	nd_ns->nd_ns_reserved = 0;
 	nd_ns->nd_ns_target = *taddr6;
 	in6_clearscope(&nd_ns->nd_ns_target); /* XXX */
 
 	/*
 	 * Add source link-layer address option.
 	 *
 	 *				spec		implementation
 	 *				---		---
 	 * DAD packet			MUST NOT	do not add the option
 	 * there's no link layer address:
 	 *				impossible	do not add the option
 	 * there's link layer address:
 	 *	Multicast NS		MUST add one	add the option
 	 *	Unicast NS		SHOULD add one	add the option
 	 */
 	if (!dad && (mac = nd6_ifptomac(ifp))) {
 		int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
 		struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
 		/* 8 byte alignments... */
 		optlen = (optlen + 7) & ~7;
 
 		/* Grow the packet by the option; padding bytes are zeroed. */
 		m->m_pkthdr.len += optlen;
 		m->m_len += optlen;
 		icmp6len += optlen;
 		bzero((caddr_t)nd_opt, optlen);
 		nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
 		nd_opt->nd_opt_len = optlen >> 3;
 		bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
 	}
 
 	/* Payload length and checksum are final only now. */
 	ip6->ip6_plen = htons((u_short)icmp6len);
 	nd_ns->nd_ns_cksum = 0;
 	nd_ns->nd_ns_cksum =
 	    in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len);
 
 	/* When the SEND hook is registered, tag the packet with its ND type. */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 			sizeof(unsigned short), M_NOWAIT);
 		if (mtag == NULL)
 			goto bad;
 		*(unsigned short *)(mtag + 1) = nd_ns->nd_ns_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	/* The result of ip6_output() is ignored; stats are bumped regardless. */
 	ip6_output(m, NULL, &ro, dad ? IPV6_UNSPECSRC : 0, &im6o, NULL, NULL);
 	icmp6_ifstat_inc(ifp, ifs6_out_msg);
 	icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit);
 	ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_SOLICIT]);
 
 	/* We don't cache this route. */
 	RO_RTFREE(&ro);
 
 	return;
 
 	/* Error exit: release any route picked up above and drop the packet. */
   bad:
 	if (ro.ro_rt) {
 		RTFREE(ro.ro_rt);
 	}
 	m_freem(m);
 	return;
 }
 
 /*
  * Neighbor advertisement input handling.
  *
  * Based on RFC 2461
  * Based on RFC 2462 (duplicate address detection)
  *
  * the following items are not implemented yet:
  * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
  * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
  *
  * m        - received packet; freed on every exit path visible here (the
  *            early header-length checks are presumed to free it themselves)
  * off      - offset of the NA header, counted from the start of the IPv6
  *            header
  * icmp6len - length of the ICMPv6 message; whatever follows the fixed NA
  *            header is parsed as ND options
  */
 void
 nd6_na_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct nd_neighbor_advert *nd_na;
 	struct in6_addr daddr6 = ip6->ip6_dst;
 	struct in6_addr taddr6;
 	int flags;
 	int is_router;
 	int is_solicited;
 	int is_override;
 	char *lladdr = NULL;
 	int lladdrlen = 0;
 	int checklink = 0;
 	struct ifaddr *ifa;
 	struct llentry *ln = NULL;
 	union nd_opts ndopts;
 	struct mbuf *chain = NULL;
 	struct m_tag *mtag;
 	struct sockaddr_in6 sin6;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 	/* A hop limit other than 255 is rejected as a bad NA. */
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n",
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, icmp6len,);
 	nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len);
 	if (nd_na == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return;
 	}
 #endif
 
 	flags = nd_na->nd_na_flags_reserved;
 	is_router = ((flags & ND_NA_FLAG_ROUTER) != 0);
 	is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0);
 	is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0);
 
 	taddr6 = nd_na->nd_na_target;
 	if (in6_setscope(&taddr6, ifp, NULL))
 		goto bad;	/* XXX: impossible */
 
 	if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
 		nd6log((LOG_ERR,
 		    "nd6_na_input: invalid target address %s\n",
 		    ip6_sprintf(ip6bufs, &taddr6)));
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&daddr6))
 		if (is_solicited) {
 			nd6log((LOG_ERR,
 			    "nd6_na_input: a solicited adv is multicasted\n"));
 			goto bad;
 		}
 
 	/* Everything past the fixed NA header is treated as ND options. */
 	icmp6len -= sizeof(*nd_na);
 	nd6_option_init(nd_na + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "nd6_na_input: invalid ND option, ignored\n"));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
 		/* nd_opt_len counts 8-byte units, option header included. */
 		lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 	}
 
 	/*
 	 * This effectively disables the DAD check on a non-master CARP
 	 * address.
 	 */
 	if (ifp->if_carp)
 		ifa = (*carp_iamatch6_p)(ifp, &taddr6);
 	else
 		ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
 
 	/*
 	 * Target address matches one of my interface address.
 	 *
 	 * If my address is tentative, this means that there's somebody
 	 * already using the same address as mine.  This indicates DAD failure.
 	 * This is defined in RFC 2462.
 	 *
 	 * Otherwise, process as defined in RFC 2461.
 	 */
 	if (ifa
 	 && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) {
 		/*
 		 * Notify DAD while the reference handed to us by the lookup
 		 * above is still held; drop it only afterwards so that ifa
 		 * is never used after its reference has been released.
 		 */
 		nd6_dad_na_input(ifa);
 		ifa_free(ifa);
 		goto freeit;
 	}
 
 	/* Just for safety, maybe unnecessary. */
 	if (ifa) {
 		ifa_free(ifa);
 		log(LOG_ERR,
 		    "nd6_na_input: duplicate IP6 address %s\n",
 		    ip6_sprintf(ip6bufs, &taddr6));
 		goto freeit;
 	}
 
 	/*
 	 * The option must be exactly as long as an option carrying one
 	 * link-layer address of this interface, padded to 8 bytes.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s "
 		    "(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	/*
 	 * If no neighbor cache entry is found, NA SHOULD silently be
 	 * discarded.
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	ln = nd6_lookup(&taddr6, LLE_EXCLUSIVE, ifp);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (ln == NULL) {
 		goto freeit;
 	}
 
 	if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
 		/*
 		 * If the link-layer has address, and no lladdr option came,
 		 * discard the packet.
 		 */
 		if (ifp->if_addrlen && lladdr == NULL) {
 			goto freeit;
 		}
 
 		/*
 		 * Record link-layer address, and update the state.
 		 */
 		bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 		ln->la_flags |= LLE_VALID;
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 		if (is_solicited) {
 			ln->ln_state = ND6_LLINFO_REACHABLE;
 			ln->ln_byhint = 0;
 			if (!ND6_LLINFO_PERMANENT(ln)) {
 				nd6_llinfo_settimer_locked(ln,
 				    (long)ND_IFINFO(ln->lle_tbl->llt_ifp)->reachable * hz);
 			}
 		} else {
 			ln->ln_state = ND6_LLINFO_STALE;
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 		}
 		if ((ln->ln_router = is_router) != 0) {
 			/*
 			 * This means a router's state has changed from
 			 * non-reachable to probably reachable, and might
 			 * affect the status of associated prefixes..
 			 */
 			checklink = 1;
 		}
 	} else {
 		int llchange;
 
 		/*
 		 * Check if the link-layer address has changed or not.
 		 */
 		if (lladdr == NULL)
 			llchange = 0;
 		else {
 			if (ln->la_flags & LLE_VALID) {
 				if (bcmp(lladdr, &ln->ll_addr, ifp->if_addrlen))
 					llchange = 1;
 				else
 					llchange = 0;
 			} else
 				llchange = 1;
 		}
 
 		/*
 		 * This is VERY complex.  Look at it with care.
 		 *
 		 * override solicit lladdr llchange	action
 		 *					(L: record lladdr)
 		 *
 		 *	0	0	n	--	(2c)
 		 *	0	0	y	n	(2b) L
 		 *	0	0	y	y	(1)    REACHABLE->STALE
 		 *	0	1	n	--	(2c)   *->REACHABLE
 		 *	0	1	y	n	(2b) L *->REACHABLE
 		 *	0	1	y	y	(1)    REACHABLE->STALE
 		 *	1	0	n	--	(2a)
 		 *	1	0	y	n	(2a) L
 		 *	1	0	y	y	(2a) L *->STALE
 		 *	1	1	n	--	(2a)   *->REACHABLE
 		 *	1	1	y	n	(2a) L *->REACHABLE
 		 *	1	1	y	y	(2a) L *->REACHABLE
 		 */
 		if (!is_override && (lladdr != NULL && llchange)) {  /* (1) */
 			/*
 			 * If state is REACHABLE, make it STALE.
 			 * no other updates should be done.
 			 */
 			if (ln->ln_state == ND6_LLINFO_REACHABLE) {
 				ln->ln_state = ND6_LLINFO_STALE;
 				nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 			}
 			goto freeit;
 		} else if (is_override				   /* (2a) */
 			|| (!is_override && (lladdr != NULL && !llchange)) /* (2b) */
 			|| lladdr == NULL) {			   /* (2c) */
 			/*
 			 * Update link-local address, if any.
 			 */
 			if (lladdr != NULL) {
 				bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 				ln->la_flags |= LLE_VALID;
 				EVENTHANDLER_INVOKE(lle_event, ln,
 				    LLENTRY_RESOLVED);
 			}
 
 			/*
 			 * If solicited, make the state REACHABLE.
 			 * If not solicited and the link-layer address was
 			 * changed, make it STALE.
 			 */
 			if (is_solicited) {
 				ln->ln_state = ND6_LLINFO_REACHABLE;
 				ln->ln_byhint = 0;
 				if (!ND6_LLINFO_PERMANENT(ln)) {
 					nd6_llinfo_settimer_locked(ln,
 					    (long)ND_IFINFO(ifp)->reachable * hz);
 				}
 			} else {
 				if (lladdr != NULL && llchange) {
 					ln->ln_state = ND6_LLINFO_STALE;
 					nd6_llinfo_settimer_locked(ln,
 					    (long)V_nd6_gctimer * hz);
 				}
 			}
 		}
 
 		if (ln->ln_router && !is_router) {
 			/*
 			 * The peer dropped the router flag.
 			 * Remove the sender from the Default Router List and
 			 * update the Destination Cache entries.
 			 */
 			struct nd_defrouter *dr;
 			struct in6_addr *in6;
 
 			in6 = &L3_ADDR_SIN6(ln)->sin6_addr;
 
 			/*
 			 * Lock to protect the default router list.
 			 * XXX: this might be unnecessary, since this function
 			 * is only called under the network software interrupt
 			 * context.  However, we keep it just for safety.
 			 */
 			dr = defrouter_lookup(in6, ln->lle_tbl->llt_ifp);
 			if (dr)
 				defrtrlist_del(dr);
 			else if (ND_IFINFO(ln->lle_tbl->llt_ifp)->flags &
 			    ND6_IFF_ACCEPT_RTADV) {
 				/*
 				 * Even if the neighbor is not in the default
 				 * router list, the neighbor may be used
 				 * as a next hop for some destinations
 				 * (e.g. redirect case). So we must
 				 * call rt6_flush explicitly.
 				 */
 				rt6_flush(&ip6->ip6_src, ifp);
 			}
 		}
 		ln->ln_router = is_router;
 	}
 	/* XXX - QL
 	 *  Does this matter?
 	 *  rt->rt_flags &= ~RTF_REJECT;
 	 */
 	ln->la_asked = 0;
 	if (ln->la_hold) {
 		struct mbuf *m_hold, *m_hold_next;
 
 		/*
 		 * reset the la_hold in advance, to explicitly
 		 * prevent a la_hold lookup in nd6_output()
 		 * (wouldn't happen, though...)
 		 */
 		for (m_hold = ln->la_hold, ln->la_hold = NULL;
 		    m_hold; m_hold = m_hold_next) {
 			m_hold_next = m_hold->m_nextpkt;
 			m_hold->m_nextpkt = NULL;
 			/*
 			 * we assume ifp is not a loopback here, so just set
 			 * the 2nd argument as the 1st one.
 			 */
 
 			/*
 			 * NOTE(review): the tag is attached to the received
 			 * packet m, not to m_hold, and its payload is never
 			 * written.  Also, "goto bad" from inside this loop
 			 * appears to abandon the packets still linked from
 			 * m_hold and anything already queued on "chain".
 			 * Left as-is; confirm the intended SeND behaviour.
 			 */
 			if (send_sendso_input_hook != NULL) {
 				mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 				    sizeof(unsigned short), M_NOWAIT);
 				if (mtag == NULL)
 					goto bad;
 				m_tag_prepend(m, mtag);
 			}
 
 			nd6_output_lle(ifp, ifp, m_hold, L3_ADDR_SIN6(ln), NULL, ln, &chain);
 		}
 	}
  freeit:
 	if (ln != NULL) {
 		/*
 		 * Copy the destination out of the entry before unlocking it;
 		 * the queued chain is flushed only after the lock is dropped.
 		 */
 		if (chain)
 			memcpy(&sin6, L3_ADDR_SIN6(ln), sizeof(sin6));
 		LLE_WUNLOCK(ln);
 
 		if (chain)
 			nd6_output_flush(ifp, ifp, chain, &sin6);
 	}
 	if (checklink)
 		pfxlist_onlink_check();
 
 	m_freem(m);
 	return;
 
  bad:
 	if (ln != NULL)
 		LLE_WUNLOCK(ln);
 
 	ICMP6STAT_INC(icp6s_badna);
 	m_freem(m);
 }
 
 /*
  * Neighbor advertisement output handling.
  *
  * Based on RFC 2461
  *
  * the following items are not implemented yet:
  * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
  * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
  *
  * ifp - interface the advertisement is built for
  * daddr6_0 - destination; the unspecified address means "reply to DAD" and
  *	is rewritten below (with ND_NA_FLAG_SOLICITED cleared)
  * taddr6 - target address placed in the NA (scope cleared)
  * flags - value stored in nd_na_flags_reserved
  * tlladdr - 1 if include target link-layer address
  * sdl0 - sockaddr_dl (= proxy NA) or NULL
  * fibnum - FIB number set on the outgoing mbuf via M_SETFIB()
  *
  * Allocation failures and an oversized message return silently.
  */
 static void
 nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
     const struct in6_addr *taddr6, u_long flags, int tlladdr,
     struct sockaddr *sdl0, u_int fibnum)
 {
 	struct mbuf *m;
 	struct m_tag *mtag;
 	struct ifnet *oifp;
 	struct ip6_hdr *ip6;
 	struct nd_neighbor_advert *nd_na;
 	struct ip6_moptions im6o;
 	struct in6_addr src, daddr6;
 	struct sockaddr_in6 dst_sa;
 	int icmp6len, maxlen, error;
 	caddr_t mac = NULL;
 	struct route_in6 ro;
 
 	bzero(&ro, sizeof(ro));
 
 	daddr6 = *daddr6_0;	/* make a local copy for modification */
 
 	/* estimate the size of message */
 	maxlen = sizeof(*ip6) + sizeof(*nd_na);
 	maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
 	if (max_linkhdr + maxlen >= MCLBYTES) {
 #ifdef DIAGNOSTIC
 		printf("nd6_na_output: max_linkhdr + maxlen >= MCLBYTES "
 		    "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES);
 #endif
 		return;
 	}
 
 	/* Use a cluster only when the message does not fit a header mbuf. */
 	if (max_linkhdr + maxlen > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return;
 	M_SETFIB(m, fibnum);
 
 	/*
 	 * NOTE(review): im6o is filled in only when daddr6 is multicast at
 	 * this point, yet &im6o is always handed to ip6_output() below.  If
 	 * daddr6 is unspecified here it is rewritten further down, after
 	 * this check, so neither M_MCAST nor im6o gets set for that case.
 	 * Looks like im6o can reach ip6_output() uninitialized - confirm.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
 		m->m_flags |= M_MCAST;
 		im6o.im6o_multicast_ifp = ifp;
 		im6o.im6o_multicast_hlim = 255;
 		im6o.im6o_multicast_loop = 0;
 	}
 
 	icmp6len = sizeof(*nd_na);
 	m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len;
-	m->m_data += max_linkhdr;	/* or MH_ALIGN() equivalent? */
+	m->m_data += max_linkhdr;	/* or M_ALIGN() equivalent? */
 
 	/* fill neighbor advertisement packet */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) {
 		/* reply to DAD */
 		daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
 		daddr6.s6_addr16[1] = 0;
 		daddr6.s6_addr32[1] = 0;
 		daddr6.s6_addr32[2] = 0;
 		daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE;
 		if (in6_setscope(&daddr6, ifp, NULL))
 			goto bad;
 
 		flags &= ~ND_NA_FLAG_SOLICITED;
 	}
 	ip6->ip6_dst = daddr6;
 	bzero(&dst_sa, sizeof(struct sockaddr_in6));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(struct sockaddr_in6);
 	dst_sa.sin6_addr = daddr6;
 
 	/*
 	 * Select a source whose scope is the same as that of the dest.
 	 */
 	bcopy(&dst_sa, &ro.ro_dst, sizeof(dst_sa));
 	oifp = ifp;
 	error = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, &oifp, &src);
 	if (error) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
 		    "determined: dst=%s, error=%d\n",
 		    ip6_sprintf(ip6buf, &dst_sa.sin6_addr), error));
 		goto bad;
 	}
 	ip6->ip6_src = src;
 	nd_na = (struct nd_neighbor_advert *)(ip6 + 1);
 	nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
 	nd_na->nd_na_code = 0;
 	nd_na->nd_na_target = *taddr6;
 	in6_clearscope(&nd_na->nd_na_target); /* XXX */
 
 	/*
 	 * "tlladdr" indicates NS's condition for adding tlladdr or not.
 	 * see nd6_ns_input() for details.
 	 * Basically, if NS packet is sent to unicast/anycast addr,
 	 * target lladdr option SHOULD NOT be included.
 	 */
 	if (tlladdr) {
 		/*
 		 * sdl0 != NULL indicates proxy NA.  If we do proxy, use
 		 * lladdr in sdl0.  If we are not proxying (sending NA for
 		 * my address) use lladdr configured for the interface.
 		 */
 		if (sdl0 == NULL) {
 			if (ifp->if_carp)
 				mac = (*carp_macmatch6_p)(ifp, m, taddr6);
 			if (mac == NULL)
 				mac = nd6_ifptomac(ifp);
 		} else if (sdl0->sa_family == AF_LINK) {
 			struct sockaddr_dl *sdl;
 			sdl = (struct sockaddr_dl *)sdl0;
 			/* Proxy address is used only if its length matches. */
 			if (sdl->sdl_alen == ifp->if_addrlen)
 				mac = LLADDR(sdl);
 		}
 	}
 	if (tlladdr && mac) {
 		int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
 		struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1);
 
 		/* roundup to 8 bytes alignment! */
 		optlen = (optlen + 7) & ~7;
 
 		m->m_pkthdr.len += optlen;
 		m->m_len += optlen;
 		icmp6len += optlen;
 		bzero((caddr_t)nd_opt, optlen);
 		nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
 		nd_opt->nd_opt_len = optlen >> 3;
 		bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
 	} else
 		/* No link-layer address option goes out: drop OVERRIDE. */
 		flags &= ~ND_NA_FLAG_OVERRIDE;
 
 	ip6->ip6_plen = htons((u_short)icmp6len);
 	nd_na->nd_na_flags_reserved = flags;
 	/* The checksum field must be zero while the checksum is computed. */
 	nd_na->nd_na_cksum = 0;
 	nd_na->nd_na_cksum =
 	    in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len);
 
 	/* With a SeND hook installed, tag the packet with its ND type. */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 		    sizeof(unsigned short), M_NOWAIT);
 		if (mtag == NULL)
 			goto bad;
 		*(unsigned short *)(mtag + 1) = nd_na->nd_na_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	ip6_output(m, NULL, &ro, 0, &im6o, NULL, NULL);
 	icmp6_ifstat_inc(ifp, ifs6_out_msg);
 	icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert);
 	ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_ADVERT]);
 
 	/* We don't cache this route. */
 	RO_RTFREE(&ro);
 
 	return;
 
   bad:
 	/* Error exit: release any route picked up so far and the mbuf. */
 	if (ro.ro_rt) {
 		RTFREE(ro.ro_rt);
 	}
 	m_freem(m);
 	return;
 }
 
 #ifndef BURN_BRIDGES
 /*
  * Compatibility entry point: send a neighbor advertisement using the
  * default FIB.  All arguments are forwarded to nd6_na_output_fib().
  */
 void
 nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0,
     const struct in6_addr *taddr6, u_long flags, int tlladdr,
     struct sockaddr *sdl0)
 {
 	const u_int default_fib = RT_DEFAULT_FIB;
 
 	nd6_na_output_fib(ifp, daddr6_0, taddr6, flags, tlladdr, sdl0,
 	    default_fib);
 }
 #endif
 
 /*
  * Return the link-layer address of ifp for the interface types listed
  * below, or NULL for any other interface type.
  */
 caddr_t
 nd6_ifptomac(struct ifnet *ifp)
 {
 	caddr_t lladdr = NULL;
 
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_IEEE1394:
 #ifdef IFT_L2VLAN
 	case IFT_L2VLAN:
 #endif
 #ifdef IFT_IEEE80211
 	case IFT_IEEE80211:
 #endif
 	case IFT_INFINIBAND:
 	case IFT_BRIDGE:
 	case IFT_ISO88025:
 		lladdr = IF_LLADDR(ifp);
 		break;
 	default:
 		/* Any other type: no address is reported. */
 		break;
 	}
 
 	return (lladdr);
 }
 
 /*
  * One in-progress Duplicate Address Detection run, linked on the DAD queue.
  * Entries are reference counted; nd6_dad_rele() frees the entry and drops
  * its ifaddr reference when the count reaches zero.
  */
 struct dadq {
 	TAILQ_ENTRY(dadq) dad_list;	/* linkage on the DAD queue */
 	struct ifaddr *dad_ifa;		/* address under test (referenced) */
 	int dad_count;		/* max NS to send */
 	int dad_ns_tcount;	/* # of trials to send NS */
 	int dad_ns_ocount;	/* NS sent so far */
 	int dad_ns_icount;	/* bumped by nd6_dad_ns_input() */
 	int dad_na_icount;	/* bumped by nd6_dad_na_input() */
 	struct callout dad_timer_ch;	/* callout that runs nd6_dad_timer() */
 	struct vnet *dad_vnet;	/* curvnet at start (set under VIMAGE only) */
 	u_int dad_refcnt;	/* managed with refcount_*() */
 };
 
 /* The DAD queue and the rwlock guarding it, both defined via VNET_DEFINE. */
 static VNET_DEFINE(TAILQ_HEAD(, dadq), dadq);
 static VNET_DEFINE(struct rwlock, dad_rwlock);
 #define	V_dadq			VNET(dadq)
 #define	V_dad_rwlock		VNET(dad_rwlock)
 
 /* Read/write lock helpers for the DAD queue. */
 #define	DADQ_RLOCK()		rw_rlock(&V_dad_rwlock)	
 #define	DADQ_RUNLOCK()		rw_runlock(&V_dad_rwlock)	
 #define	DADQ_WLOCK()		rw_wlock(&V_dad_rwlock)	
 #define	DADQ_WUNLOCK()		rw_wunlock(&V_dad_rwlock)	
 
 /*
  * Append dp to the tail of the DAD queue under the queue write lock.
  * No reference is taken here.
  */
 static void
 nd6_dad_add(struct dadq *dp)
 {
 
 	DADQ_WLOCK();
 	TAILQ_INSERT_TAIL(&V_dadq, dp, dad_list);
 	DADQ_WUNLOCK();
 }
 
 /*
  * Unlink dp from the DAD queue under the queue write lock, then drop one
  * reference (presumably the one the entry was created with - confirm
  * against nd6_dad_start()).  dp may be freed by the time this returns.
  */
 static void
 nd6_dad_del(struct dadq *dp)
 {
 
 	DADQ_WLOCK();
 	TAILQ_REMOVE(&V_dadq, dp, dad_list);
 	DADQ_WUNLOCK();
 	/* Released only after the queue lock has been dropped. */
 	nd6_dad_rele(dp);
 }
 
 static struct dadq *
 nd6_dad_find(struct ifaddr *ifa)
 {
 	struct dadq *dp;
 
 	DADQ_RLOCK();
 	TAILQ_FOREACH(dp, &V_dadq, dad_list)
 		if (dp->dad_ifa == ifa) {
 			refcount_acquire(&dp->dad_refcnt);
 			break;
 		}
 	DADQ_RUNLOCK();
 
 	return (dp);
 }
 
 /*
  * (Re)arm the entry's callout so that nd6_dad_timer() is invoked with dp
  * after the given number of ticks.
  */
 static void
 nd6_dad_starttimer(struct dadq *dp, int ticks)
 {
 	void (*handler)(void *);
 
 	/* nd6_dad_timer() takes a struct dadq *; adapt it for callout_reset. */
 	handler = (void (*)(void *))nd6_dad_timer;
 	callout_reset(&dp->dad_timer_ch, ticks, handler, (void *)dp);
 }
 
 /*
  * Stop the entry's callout using callout_drain().
  */
 static void
 nd6_dad_stoptimer(struct dadq *dp)
 {
 
 	callout_drain(&dp->dad_timer_ch);
 }
 
 /*
  * Drop one reference on dp.  When the last reference goes away, the
  * ifaddr reference held by the entry is released and the entry is freed.
  */
 static void
 nd6_dad_rele(struct dadq *dp)
 {
 
 	/* Other references remain: nothing more to do. */
 	if (!refcount_release(&dp->dad_refcnt))
 		return;
 
 	ifa_free(dp->dad_ifa);
 	free(dp, M_IP6NDP);
 }
 
 /*
  * Set up the DAD queue head and the rwlock that guards it.
  */
 void
 nd6_dad_init(void)
 {
 
 	TAILQ_INIT(&V_dadq);
 	rw_init(&V_dad_rwlock, "nd6 DAD queue");
 }
 
 /*
  * Start Duplicate Address Detection (DAD) for specified interface address.
  *
  * ifa   - address to probe; must still be marked tentative
  * delay - ticks to wait before the first NS; 0 sends the first NS now
  *
  * Returns without starting anything when DAD is not needed, cannot run on
  * the interface, is already in progress, or the queue entry cannot be
  * allocated.
  */
 void
 nd6_dad_start(struct ifaddr *ifa, int delay)
 {
 	struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;
 	struct dadq *entry;
 	char addrbuf[INET6_ADDRSTRLEN];
 
 	/* Only a tentative address is a candidate for DAD. */
 	if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
 		log(LOG_DEBUG,
 		    "nd6_dad_start: called with non-tentative address "
 		    "%s(%s)\n",
 		    ip6_sprintf(addrbuf, &ia6->ia_addr.sin6_addr),
 		    ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		return;
 	}
 
 	/*
 	 * If we don't need DAD, don't do it: the address is anycast, or DAD
 	 * is disabled (ip6_dad_count == 0).  The address simply stops being
 	 * tentative.
 	 */
 	if ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0 || V_ip6_dad_count == 0) {
 		ia6->ia6_flags &= ~IN6_IFF_TENTATIVE;
 		return;
 	}
 
 	if (ifa->ifa_ifp == NULL)
 		panic("nd6_dad_start: ifa->ifa_ifp == NULL");
 
 	/* Interface is down or has IPv6 disabled: leave the address alone. */
 	if ((ifa->ifa_ifp->if_flags & IFF_UP) == 0 ||
 	    (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) != 0)
 		return;
 
 	entry = nd6_dad_find(ifa);
 	if (entry != NULL) {
 		/* DAD already in progress */
 		nd6_dad_rele(entry);
 		return;
 	}
 
 	entry = malloc(sizeof(*entry), M_IP6NDP, M_NOWAIT | M_ZERO);
 	if (entry == NULL) {
 		log(LOG_ERR, "nd6_dad_start: memory allocation failed for "
 		    "%s(%s)\n",
 		    ip6_sprintf(addrbuf, &ia6->ia_addr.sin6_addr),
 		    ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		return;
 	}
 	callout_init(&entry->dad_timer_ch, 0);
 #ifdef VIMAGE
 	entry->dad_vnet = curvnet;
 #endif
 	nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp),
 	    ip6_sprintf(addrbuf, &ia6->ia_addr.sin6_addr)));
 
 	/*
 	 * Fill in the entry (it holds its own reference on ifa) and put it
 	 * on the queue with an initial reference count of one.
 	 */
 	entry->dad_ifa = ifa;
 	ifa_ref(entry->dad_ifa);
 	entry->dad_count = V_ip6_dad_count;
 	entry->dad_ns_icount = 0;
 	entry->dad_na_icount = 0;
 	entry->dad_ns_ocount = 0;
 	entry->dad_ns_tcount = 0;
 	refcount_init(&entry->dad_refcnt, 1);
 	nd6_dad_add(entry);
 
 	/*
 	 * Send NS packet for DAD, ip6_dad_count times.
 	 * Note that we must delay the first transmission, if this is the
 	 * first packet to be sent from the interface after interface
 	 * (re)initialization.
 	 */
 	if (delay != 0) {
 		nd6_dad_starttimer(entry, delay);
 		return;
 	}
 	nd6_dad_ns_output(entry, ifa);
 	nd6_dad_starttimer(entry,
 	    (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
 }
 
 /*
  * terminate DAD unconditionally.  used for address removals.
  *
  * Looks up the queue entry for ifa, drains its timer, and removes the entry
  * from the DAD queue unless the timer handler already removed it.
  */
 void
 nd6_dad_stop(struct ifaddr *ifa)
 {
 	struct dadq *dp;
 
 	dp = nd6_dad_find(ifa);
 	if (!dp) {
 		/* DAD wasn't started yet */
 		return;
 	}
 
 	nd6_dad_stoptimer(dp);
 
 	/*
 	 * The DAD queue entry may have been removed by nd6_dad_timer() while
 	 * we were waiting for it to stop, so re-do the lookup.
 	 */
 	nd6_dad_rele(dp);
 	/*
 	 * NOTE(review): the result of this second lookup is discarded and dp
 	 * is reused below after its reference was just dropped.  This relies
 	 * on the lookup returning the same entry; confirm that no other entry
 	 * for ifa can appear in between.
 	 */
 	if (nd6_dad_find(ifa) == NULL)
 		return;
 
 	/* Drops the reference nd6_dad_del() releases on unlinking ... */
 	nd6_dad_del(dp);
 	/* ... and the one taken by the second nd6_dad_find() above. */
 	nd6_dad_rele(dp);
 }
 
 /*
  * Callout handler for a DAD queue entry.
  *
  * After sanity-checking the address state, either sends the next NS and
  * re-arms the timer, or - once dad_count NS have gone out - concludes DAD:
  * the address is declared duplicated if any NS/NA was counted, otherwise its
  * tentative flag is cleared.  Every path except "more NS to send" removes
  * the entry from the queue via nd6_dad_del().
  */
 static void
 nd6_dad_timer(struct dadq *dp)
 {
 	CURVNET_SET(dp->dad_vnet);
 	struct ifaddr *ifa = dp->dad_ifa;
 	/*
 	 * NOTE(review): dp->dad_ifa is dereferenced here, before the NULL
 	 * check on ia below, so that check cannot guard this access.
 	 */
 	struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
 	struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/* Sanity check */
 	if (ia == NULL) {
 		log(LOG_ERR, "nd6_dad_timer: called with null parameter\n");
 		goto err;
 	}
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
 		/* Do not need DAD for ifdisabled interface. */
 		log(LOG_ERR, "nd6_dad_timer: cancel DAD on %s because of "
 		    "ND6_IFF_IFDISABLED.\n", ifp->if_xname);
 		goto err;
 	}
 	if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
 		log(LOG_ERR, "nd6_dad_timer: called with duplicated address "
 			"%s(%s)\n",
 			ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 			ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		goto err;
 	}
 	if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
 		log(LOG_ERR, "nd6_dad_timer: called with non-tentative address "
 			"%s(%s)\n",
 			ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 			ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		goto err;
 	}
 
 	/* timeouted with IFF_{RUNNING,UP} check */
 	if (dp->dad_ns_tcount > V_dad_maxtry) {
 		nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n",
 		    if_name(ifa->ifa_ifp)));
 		goto err;
 	}
 
 	/* Need more checks? */
 	if (dp->dad_ns_ocount < dp->dad_count) {
 		/*
 		 * We have more NS to go.  Send NS packet for DAD.
 		 */
 		nd6_dad_ns_output(dp, ifa);
 		nd6_dad_starttimer(dp,
 		    (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
 		/* Entry stays queued; skip the removal at "err". */
 		goto done;
 	} else {
 		/*
 		 * We have transmitted sufficient number of DAD packets.
 		 * See what we've got.
 		 */
 		if (dp->dad_ns_icount > 0 || dp->dad_na_icount > 0)
 			/* We've seen NS or NA, means DAD has failed. */
 			nd6_dad_duplicated(ifa, dp);
 		else {
 			/*
 			 * We are done with DAD.  No NA came, no NS came.
 			 * No duplicate address found.  Check IFDISABLED flag
 			 * again in case that it is changed between the
 			 * beginning of this function and here.
 			 */
 			if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0)
 				ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
 
 			nd6log((LOG_DEBUG,
 			    "%s: DAD complete for %s - no duplicates found\n",
 			    if_name(ifa->ifa_ifp),
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 		}
 	}
 	/* Both the error paths and normal completion fall through to here. */
 err:
 	nd6_dad_del(dp);
 done:
 	CURVNET_RESTORE();
 }
 
 static void
 nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp)
 {
 	struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
 	struct ifnet *ifp;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: "
 	    "NS in/out=%d/%d, NA in=%d\n",
 	    if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 	    dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount);
 
 	ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
 	ia->ia6_flags |= IN6_IFF_DUPLICATED;
 
 	ifp = ifa->ifa_ifp;
 	log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n",
 	    if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr));
 	log(LOG_ERR, "%s: manual intervention required\n",
 	    if_name(ifp));
 
 	/*
 	 * If the address is a link-local address formed from an interface
 	 * identifier based on the hardware address which is supposed to be
 	 * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP
 	 * operation on the interface SHOULD be disabled.
 	 * [RFC 4862, Section 5.4.5]
 	 */
 	if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) {
 		struct in6_addr in6;
 
 		/*
 		 * To avoid over-reaction, we only apply this logic when we are
 		 * very sure that hardware addresses are supposed to be unique.
 		 */
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 		case IFT_FDDI:
 		case IFT_ATM:
 		case IFT_IEEE1394:
 #ifdef IFT_IEEE80211
 		case IFT_IEEE80211:
 #endif
 		case IFT_INFINIBAND:
 			in6 = ia->ia_addr.sin6_addr;
 			if (in6_get_hw_ifid(ifp, &in6) == 0 &&
 			    IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) {
 				ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
 				log(LOG_ERR, "%s: possible hardware address "
 				    "duplication detected, disable IPv6\n",
 				    if_name(ifp));
 			}
 			break;
 		}
 	}
 }
 
 /*
  * Make one attempt to send a DAD neighbor solicitation for ifa.
  *
  * The trial counter is bumped on every call; the "sent" counter and the
  * actual transmission happen only when the interface is both up and
  * running.
  */
 static void
 nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa)
 {
 	struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;
 	struct ifnet *ifp = ifa->ifa_ifp;
 
 	dp->dad_ns_tcount++;
 
 	if ((ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	dp->dad_ns_ocount++;
 	nd6_ns_output(ifp, NULL, &ia6->ia_addr.sin6_addr, NULL, 1);
 }
 
 /*
  * Account for a received DAD neighbor solicitation that targets ifa.
  *
  * If a DAD run exists for ifa, its incoming-NS counter is incremented,
  * unless V_dad_ignore_ns is set, in which case the packet is only logged.
  * The reference obtained from nd6_dad_find() is released on every path.
  *
  * Panics if ifa is NULL.
  */
 static void
 nd6_dad_ns_input(struct ifaddr *ifa)
 {
 	struct in6_ifaddr *ia;
 	const struct in6_addr *taddr6;
 	struct dadq *dp;
 
 	if (ifa == NULL)
 		panic("ifa == NULL in nd6_dad_ns_input");
 
 	ia = (struct in6_ifaddr *)ifa;
 	taddr6 = &ia->ia_addr.sin6_addr;
 	dp = nd6_dad_find(ifa);
 	if (dp == NULL)
 		return;
 
 	/* Quickhack - completely ignore DAD NS packets */
 	if (V_dad_ignore_ns) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		nd6log((LOG_INFO,
 		    "nd6_dad_ns_input: ignoring DAD NS packet for "
 		    "address %s(%s)\n", ip6_sprintf(ip6buf, taddr6),
 		    if_name(ifa->ifa_ifp)));
 		/* Do not leak the reference taken by nd6_dad_find(). */
 		nd6_dad_rele(dp);
 		return;
 	}
 
 	/* XXX more checks for loopback situation - see nd6_dad_timer too */
 
 	dp->dad_ns_icount++;
 	nd6_dad_rele(dp);
 }
 
 /*
  * Account for a received neighbor advertisement that targets ifa: if a DAD
  * run exists for the address, bump its incoming-NA counter.
  *
  * Panics if ifa is NULL.
  */
 static void
 nd6_dad_na_input(struct ifaddr *ifa)
 {
 	struct dadq *entry;
 
 	if (ifa == NULL)
 		panic("ifa == NULL in nd6_dad_na_input");
 
 	entry = nd6_dad_find(ifa);
 	if (entry == NULL)
 		return;
 
 	entry->dad_na_icount++;
 	/* Drop the reference taken by the lookup. */
 	nd6_dad_rele(entry);
 }
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h	(revision 276691)
+++ head/sys/sys/mbuf.h	(revision 276692)
@@ -1,1199 +1,1198 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mbuf.h	8.5 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MBUF_H_
 #define	_SYS_MBUF_H_
 
 /* XXX: These includes suck. Sorry! */
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <vm/uma.h>
 #ifdef WITNESS
 #include <sys/lock.h>
 #endif
 #endif
 
 /*
  * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead.
  * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in
  * sys/param.h), which has no additional overhead and is used instead of the
  * internal data area; this is done when at least MINCLSIZE of data must be
  * stored.  Additionally, it is possible to allocate a separate buffer
  * externally and attach it to the mbuf in a way similar to that of mbuf
  * clusters.
  *
  * NB: These calculations do not take actual compiler-induced alignment and
  * padding inside the complete struct mbuf into account.  Appropriate
  * attention is required when changing members of struct mbuf.
  *
  * MLEN is the data length in a normal mbuf.
  * MHLEN is the data length in an mbuf with a packet header.
  * MINCLSIZE is the smallest amount of data that should be put into a cluster.
  */
 #define	MLEN		((int)(MSIZE - sizeof(struct m_hdr)))
 #define	MHLEN		((int)(MLEN - sizeof(struct pkthdr)))
 #define	MINCLSIZE	(MHLEN + 1)
 
 #ifdef _KERNEL
 /*-
  * Macro for type conversion: convert mbuf pointer to data pointer of correct
  * type:
  *
  * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
  * mtodo(m, o) -- Same as above but with offset 'o' into data.
  */
 #define	mtod(m, t)	((t)((m)->m_data))
 #define	mtodo(m, o)	((void *)(((m)->m_data) + (o)))
 
 /*
  * Argument structure passed to UMA routines during mbuf and packet
  * allocations.
  */
 struct mb_args {
 	int	flags;	/* Flags for mbuf being allocated */
 	short	type;	/* Type of mbuf being allocated */
 };
 #endif /* _KERNEL */
 
 /*
  * Header present at the beginning of every mbuf.
  * Size ILP32: 24
  *	 LP64: 32
  */
 struct m_hdr {
 	struct mbuf	*mh_next;	/* next buffer in chain */
 	struct mbuf	*mh_nextpkt;	/* next chain in queue/record */
 	caddr_t		 mh_data;	/* location of data */
 	int32_t		 mh_len;	/* amount of data in this mbuf */
 	uint32_t	 mh_type:8,	/* type of data in this mbuf */
 			 mh_flags:24;	/* flags; see below */
 #if !defined(__LP64__)
 	uint32_t	 mh_pad;	/* pad for 64bit alignment */
 #endif
 };
 
 /*
  * Packet tag structure (see below for details).
  */
 struct m_tag {
 	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
 	u_int16_t		m_tag_id;	/* Tag ID */
 	u_int16_t		m_tag_len;	/* Length of data */
 	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
 	void			(*m_tag_free)(struct m_tag *);
 };
 
 /*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *	 LP64: 56
  */
 struct pkthdr {
 	struct ifnet	*rcvif;		/* rcv interface */
 	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
 	int32_t		 len;		/* total packet length */
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
 	uint64_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 cosqos;	/* class/quality of service */
 	uint8_t		 rsstype;	/* hash type */
 	uint8_t		 l2hlen;	/* layer 2 header length */
 	uint8_t		 l3hlen;	/* layer 3 header length */
 	uint8_t		 l4hlen;	/* layer 4 header length */
 	uint8_t		 l5hlen;	/* layer 5 header length */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void	*ptr;
 	} PH_per;
 
 	/* Layer specific non-persistent local storage for reassembly, etc. */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void 	*ptr;
 	} PH_loc;
 };
 #define	ether_vtag	PH_per.sixteen[0]
 #define	PH_vt		PH_per
 #define	vt_nrecs	sixteen[0]
 #define	tso_segsz	PH_per.sixteen[1]
 #define	csum_phsum	PH_per.sixteen[2]
 #define	csum_data	PH_per.thirtytwo[1]
 #define	pkt_tcphdr	PH_loc.ptr
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
  * set.
  * Size ILP32: 28
  *	 LP64: 48
  */
 struct m_ext {
 	volatile u_int	*ext_cnt;	/* pointer to ref count info */
 	caddr_t		 ext_buf;	/* start of buffer */
 	uint32_t	 ext_size;	/* size of buffer, for ext_free */
 	uint32_t	 ext_type:8,	/* type of external storage */
 			 ext_flags:24;	/* external storage mbuf flags */
 	void		(*ext_free)	/* free routine if not the usual */
 			    (struct mbuf *, void *, void *);
 	void		*ext_arg1;	/* optional argument pointer */
 	void		*ext_arg2;	/* optional argument pointer */
 };
 
 /*
  * The core of the mbuf object along with some shortcut defines for practical
  * purposes.
  *
  * The fixed m_hdr is followed by a union that overlays the possible layouts:
  * either MLEN bytes of inline data (M_databuf), or a packet header followed
  * by a second union holding either the external-storage descriptor (MH_ext)
  * or MHLEN bytes of inline data (MH_databuf).  Which member is meaningful
  * is indicated by the M_PKTHDR and M_EXT flags, as noted on each member.
  */
 struct mbuf {
 	struct m_hdr	m_hdr;
 	union {
 		struct {
 			struct pkthdr	MH_pkthdr;	/* M_PKTHDR set */
 			union {
 				struct m_ext	MH_ext;	/* M_EXT set */
 				char		MH_databuf[MHLEN];
 			} MH_dat;
 		} MH;
 		char	M_databuf[MLEN];		/* !M_PKTHDR, !M_EXT */
 	} M_dat;
 };
 #define	m_next		m_hdr.mh_next
 #define	m_len		m_hdr.mh_len
 #define	m_data		m_hdr.mh_data
 #define	m_type		m_hdr.mh_type
 #define	m_flags		m_hdr.mh_flags
 #define	m_nextpkt	m_hdr.mh_nextpkt
 #define	m_pkthdr	M_dat.MH.MH_pkthdr
 #define	m_ext		M_dat.MH.MH_dat.MH_ext
 #define	m_pktdat	M_dat.MH.MH_dat.MH_databuf
 #define	m_dat		M_dat.M_databuf
 
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
  * to M_PROTO[1-12] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 #define	M_EXT		0x00000001 /* has associated external storage */
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data is marked read-only */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
 #define	M_PROMISC	0x00000040 /* packet was not for us */
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_FLOWID	0x00000100 /* deprecated: flowid is valid */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 
 #define	M_PROTO1	0x00001000 /* protocol-specific */
 #define	M_PROTO2	0x00002000 /* protocol-specific */
 #define	M_PROTO3	0x00004000 /* protocol-specific */
 #define	M_PROTO4	0x00008000 /* protocol-specific */
 #define	M_PROTO5	0x00010000 /* protocol-specific */
 #define	M_PROTO6	0x00020000 /* protocol-specific */
 #define	M_PROTO7	0x00040000 /* protocol-specific */
 #define	M_PROTO8	0x00080000 /* protocol-specific */
 #define	M_PROTO9	0x00100000 /* protocol-specific */
 #define	M_PROTO10	0x00200000 /* protocol-specific */
 #define	M_PROTO11	0x00400000 /* protocol-specific */
 #define	M_PROTO12	0x00800000 /* protocol-specific */
 
 /*
  * Flags to purge when crossing layers.
  */
 #define	M_PROTOFLAGS \
     (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
      M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12)
 
 /*
  * Flags preserved when copying m_pkthdr.
  */
 #define M_COPYFLAGS \
     (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_FLOWID| \
      M_PROTOFLAGS)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  */
 #define	M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
     "\7M_PROMISC\10M_VLANTAG\11M_FLOWID"
 #define	M_FLAG_PROTOBITS \
     "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
     "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
     "\27M_PROTO11\30M_PROTO12"
 #define	M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
 
 /*
  * Network interface cards are able to hash protocol fields (such as IPv4
  * addresses and TCP port numbers) to classify packets into flows.  These
  * flows can then be used to maintain ordering while delivering packets to
  * the OS via parallel input queues, as well as to provide a stateless
  * affinity model.  NIC drivers can pass up the hash via m->m_pkthdr.flowid,
  * and record how the hash should be interpreted by the network stack in
  * m->m_pkthdr.rsstype using the M_HASHTYPE_*() macros below.
  *
  * Most NICs support RSS, which provides ordering and explicit affinity, and
  * use the hash type to indicate what header fields were covered by the
  * hash.  M_HASHTYPE_OPAQUE can be set by non-RSS cards or configurations
  * that provide an opaque flow identifier, allowing for ordering and
  * distribution without explicit affinity.
  */
 /* Microsoft RSS standard hash types */
 #define	M_HASHTYPE_NONE			0
 #define	M_HASHTYPE_RSS_IPV4		1	/* IPv4 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV4		2	/* TCPv4 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6		3	/* IPv6 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV6		4	/* TCPv6 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6_EX		5	/* IPv6 2-tuple + ext hdrs */
 #define	M_HASHTYPE_RSS_TCP_IPV6_EX	6	/* TCPv6 4-tuple + ext hdrs */
 /* Non-standard RSS hash types */
 #define	M_HASHTYPE_RSS_UDP_IPV4		7	/* IPv4 UDP 4-tuple */
 #define	M_HASHTYPE_RSS_UDP_IPV4_EX	8	/* IPv4 UDP 4-tuple + ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV6		9	/* IPv6 UDP 4-tuple */
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	10	/* IPv6 UDP 4-tuple + ext hdrs */
 
 #define	M_HASHTYPE_OPAQUE		255	/* ordering, not affinity */
 
 /* Accessors for the hash type stored in the packet header (rsstype). */
 #define	M_HASHTYPE_CLEAR(m)	((m)->m_pkthdr.rsstype = 0)
 #define	M_HASHTYPE_GET(m)	((m)->m_pkthdr.rsstype)
 #define	M_HASHTYPE_SET(m, v)	((m)->m_pkthdr.rsstype = (v))
 #define	M_HASHTYPE_TEST(m, v)	(M_HASHTYPE_GET(m) == (v))
 
 /*
  * COS/QOS class and quality of service tags.
  * It uses DSCP code points as base.
  */
 #define	QOS_DSCP_CS0		0x00
 #define	QOS_DSCP_DEF		QOS_DSCP_CS0
 #define	QOS_DSCP_CS1		0x20
 #define	QOS_DSCP_AF11		0x28
 #define	QOS_DSCP_AF12		0x30
 #define	QOS_DSCP_AF13		0x38
 #define	QOS_DSCP_CS2		0x40
 #define	QOS_DSCP_AF21		0x48
 #define	QOS_DSCP_AF22		0x50
 #define	QOS_DSCP_AF23		0x58
 #define	QOS_DSCP_CS3		0x60
 #define	QOS_DSCP_AF31		0x68
 #define	QOS_DSCP_AF32		0x70
 #define	QOS_DSCP_AF33		0x78
 #define	QOS_DSCP_CS4		0x80
 #define	QOS_DSCP_AF41		0x88
 #define	QOS_DSCP_AF42		0x90
 #define	QOS_DSCP_AF43		0x98
 #define	QOS_DSCP_CS5		0xa0
 #define	QOS_DSCP_EF		0xb8
 #define	QOS_DSCP_CS6		0xc0
 #define	QOS_DSCP_CS7		0xe0
 
 /*
  * External mbuf storage buffer types.
  */
 #define	EXT_CLUSTER	1	/* mbuf cluster */
 #define	EXT_SFBUF	2	/* sendfile(2)'s sf_bufs */
 #define	EXT_JUMBOP	3	/* jumbo cluster 4096 bytes */
 #define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
 #define	EXT_JUMBO16	5	/* jumbo cluster 16384 bytes */
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference (M_IOVEC) */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */
 #define	EXT_VENDOR3	226	/* for vendor-internal use */
 #define	EXT_VENDOR4	227	/* for vendor-internal use */
 
 #define	EXT_EXP1	244	/* for experimental use */
 #define	EXT_EXP2	245	/* for experimental use */
 #define	EXT_EXP3	246	/* for experimental use */
 #define	EXT_EXP4	247	/* for experimental use */
 
 #define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
 #define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
 #define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
 #define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */
 
 /*
  * Flags for external mbuf buffer types.
  * NB: limited to the lower 24 bits.
  */
 #define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_cnt, notyet */
 #define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */
 #define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */
 
 #define	EXT_FLAG_VENDOR1	0x010000	/* for vendor-internal use */
 #define	EXT_FLAG_VENDOR2	0x020000	/* for vendor-internal use */
 #define	EXT_FLAG_VENDOR3	0x040000	/* for vendor-internal use */
 #define	EXT_FLAG_VENDOR4	0x080000	/* for vendor-internal use */
 
 #define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
 #define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
 #define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
 #define	EXT_FLAG_EXP4		0x800000	/* for experimental use */
 
 /*
  * EXT flag description for use with printf(9) %b identifier.
  */
 #define	EXT_FLAG_BITS \
     "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \
     "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \
     "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
     "\30EXT_FLAG_EXP4"
 
 /*
  * External reference/free functions.
  */
 void sf_ext_ref(void *, void *);
 void sf_ext_free(void *, void *);
 
 /*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.
  *
  * Outbound flags that are set by upper protocol layers requesting lower
  * layers, or ideally the hardware, to perform these offloading tasks.
  * For outbound packets this field and its flags can be directly tested
  * against ifnet if_hwassist.
  */
 #define	CSUM_IP			0x00000001	/* IP header checksum offload */
 #define	CSUM_IP_UDP		0x00000002	/* UDP checksum offload */
 #define	CSUM_IP_TCP		0x00000004	/* TCP checksum offload */
 #define	CSUM_IP_SCTP		0x00000008	/* SCTP checksum offload */
 #define	CSUM_IP_TSO		0x00000010	/* TCP segmentation offload */
 #define	CSUM_IP_ISCSI		0x00000020	/* iSCSI checksum offload */
 
 #define	CSUM_IP6_UDP		0x00000200	/* UDP checksum offload */
 #define	CSUM_IP6_TCP		0x00000400	/* TCP checksum offload */
 #define	CSUM_IP6_SCTP		0x00000800	/* SCTP checksum offload */
 #define	CSUM_IP6_TSO		0x00001000	/* TCP segmentation offload */
 #define	CSUM_IP6_ISCSI		0x00002000	/* iSCSI checksum offload */
 
 /* Inbound checksum support where the checksum was verified by hardware. */
 #define	CSUM_L3_CALC		0x01000000	/* calculated layer 3 csum */
 #define	CSUM_L3_VALID		0x02000000	/* checksum is correct */
 #define	CSUM_L4_CALC		0x04000000	/* calculated layer 4 csum */
 #define	CSUM_L4_VALID		0x08000000	/* checksum is correct */
 #define	CSUM_L5_CALC		0x10000000	/* calculated layer 5 csum */
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESED		0x40000000	/* contains merged segments */
 
 /*
  * CSUM flag description for use with printf(9) %b identifier.
  */
 #define	CSUM_BITS \
     "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
     "\6CSUM_IP_ISCSI" \
     "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
     "\16CSUM_IP6_ISCSI" \
     "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
     "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESED"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
 #define	CSUM_IP_VALID		CSUM_L3_VALID
 #define	CSUM_DATA_VALID		CSUM_L4_VALID
 #define	CSUM_PSEUDO_HDR		CSUM_L4_CALC
 #define	CSUM_SCTP_VALID		CSUM_L4_VALID
 #define	CSUM_DELAY_DATA		(CSUM_TCP|CSUM_UDP)
 #define	CSUM_DELAY_IP		CSUM_IP		/* Only v4, no v6 IP hdr csum */
 #define	CSUM_DELAY_DATA_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6)
 #define	CSUM_DATA_VALID_IPV6	CSUM_DATA_VALID
 #define	CSUM_TCP		CSUM_IP_TCP
 #define	CSUM_UDP		CSUM_IP_UDP
 #define	CSUM_SCTP		CSUM_IP_SCTP
 #define	CSUM_TSO		(CSUM_IP_TSO|CSUM_IP6_TSO)
 #define	CSUM_UDP_IPV6		CSUM_IP6_UDP
 #define	CSUM_TCP_IPV6		CSUM_IP6_TCP
 #define	CSUM_SCTP_IPV6		CSUM_IP6_SCTP
 
 /*
  * mbuf types describing the content of the mbuf (including external storage).
  */
 #define	MT_NOTMBUF	0	/* USED INTERNALLY ONLY! Object is not mbuf */
 #define	MT_DATA		1	/* dynamic (data) allocation */
 #define	MT_HEADER	MT_DATA	/* packet header, use M_PKTHDR instead */
 
 #define	MT_VENDOR1	4	/* for vendor-internal use */
 #define	MT_VENDOR2	5	/* for vendor-internal use */
 #define	MT_VENDOR3	6	/* for vendor-internal use */
 #define	MT_VENDOR4	7	/* for vendor-internal use */
 
 #define	MT_SONAME	8	/* socket name */
 
 #define	MT_EXP1		9	/* for experimental use */
 #define	MT_EXP2		10	/* for experimental use */
 #define	MT_EXP3		11	/* for experimental use */
 #define	MT_EXP4		12	/* for experimental use */
 
 #define	MT_CONTROL	14	/* extra-data protocol message */
 #define	MT_OOBDATA	15	/* expedited data  */
 #define	MT_NTYPES	16	/* number of mbuf types for mbtypes[] */
 
 #define	MT_NOINIT	255	/* Not a type but a flag to allocate
 				   a non-initialized mbuf */
 
 /*
  * String names of mbuf-related UMA(9) and malloc(9) types.  Exposed to
  * !_KERNEL so that monitoring tools can look up the zones with
  * libmemstat(3).
  */
 #define	MBUF_MEM_NAME		"mbuf"
 #define	MBUF_CLUSTER_MEM_NAME	"mbuf_cluster"
 #define	MBUF_PACKET_MEM_NAME	"mbuf_packet"
 #define	MBUF_JUMBOP_MEM_NAME	"mbuf_jumbo_page"
 #define	MBUF_JUMBO9_MEM_NAME	"mbuf_jumbo_9k"
 #define	MBUF_JUMBO16_MEM_NAME	"mbuf_jumbo_16k"
 #define	MBUF_TAG_MEM_NAME	"mbuf_tag"
 #define	MBUF_EXTREFCNT_MEM_NAME	"mbuf_ext_refcnt"
 
 #ifdef _KERNEL
 
 /*
  * With WITNESS, warn when an allocation that may sleep (how == M_WAITOK) is
  * attempted; without WITNESS this expands to nothing.  The argument is
  * parenthesized so that an expression can be passed safely.
  */
 #ifdef WITNESS
 #define	MBUF_CHECKSLEEP(how) do {					\
 	if ((how) == M_WAITOK)						\
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,		\
 		    "Sleeping in \"%s\"", __func__);			\
 } while (0)
 #else
 #define	MBUF_CHECKSLEEP(how)
 #endif
 
 /*
  * Network buffer allocation API
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;
 extern uma_zone_t	zone_jumbop;
 extern uma_zone_t	zone_jumbo9;
 extern uma_zone_t	zone_jumbo16;
 extern uma_zone_t	zone_ext_refcnt;
 
 void		 mb_free_ext(struct mbuf *);
 int		 m_pkthdr_init(struct mbuf *, int);
 
 /*
  * Map a buffer size (MSIZE, MCLBYTES, MJUMPAGESIZE, MJUM9BYTES or
  * MJUM16BYTES) to the matching EXT_* storage type.  Any other size panics.
  */
 static __inline int
 m_gettype(int size)
 {
 	int ext_type;
 
 	if (size == MSIZE)
 		ext_type = EXT_MBUF;
 	else if (size == MCLBYTES)
 		ext_type = EXT_CLUSTER;
 #if MJUMPAGESIZE != MCLBYTES
 	else if (size == MJUMPAGESIZE)
 		ext_type = EXT_JUMBOP;
 #endif
 	else if (size == MJUM9BYTES)
 		ext_type = EXT_JUMBO9;
 	else if (size == MJUM16BYTES)
 		ext_type = EXT_JUMBO16;
 	else
 		panic("%s: invalid cluster size %d", __func__, size);
 
 	return (ext_type);
 }
 
 /*
  * Associate an externally reference-counted buffer with an mbuf.
  *
  * The caller-supplied counter *ref_cnt is incremented atomically and
  * recorded in the mbuf together with the buffer, its size, the free routine
  * and its two arguments.  The mbuf is marked M_EXT with storage type
  * EXT_EXTREF and its data pointer is set to the start of buf.  ext_flags is
  * cleared so that no stale bits from the overlaid data area are left behind
  * (m_cljset() does the same).
  */
 static __inline void
 m_extaddref(struct mbuf *m, caddr_t buf, u_int size, u_int *ref_cnt,
     void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2)
 {
 
 	KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__));
 
 	atomic_add_int(ref_cnt, 1);
 	m->m_flags |= M_EXT;
 	m->m_ext.ext_buf = buf;
 	m->m_ext.ext_cnt = ref_cnt;
 	m->m_data = m->m_ext.ext_buf;
 	m->m_ext.ext_size = size;
 	m->m_ext.ext_free = freef;
 	m->m_ext.ext_arg1 = arg1;
 	m->m_ext.ext_arg2 = arg2;
 	m->m_ext.ext_type = EXT_EXTREF;
 	m->m_ext.ext_flags = 0;
 }
 
 /*
  * Map a cluster size (MCLBYTES, MJUMPAGESIZE, MJUM9BYTES or MJUM16BYTES) to
  * the UMA zone that clusters of that size come from.  Any other size
  * panics.
  */
 static __inline uma_zone_t
 m_getzone(int size)
 {
 	uma_zone_t cl_zone;
 
 	if (size == MCLBYTES)
 		cl_zone = zone_clust;
 #if MJUMPAGESIZE != MCLBYTES
 	else if (size == MJUMPAGESIZE)
 		cl_zone = zone_jumbop;
 #endif
 	else if (size == MJUM9BYTES)
 		cl_zone = zone_jumbo9;
 	else if (size == MJUM16BYTES)
 		cl_zone = zone_jumbo16;
 	else
 		panic("%s: invalid cluster size %d", __func__, size);
 
 	return (cl_zone);
 }
 
 /*
  * Initialize an mbuf with linear storage: clear the chain links, point
  * m_data at the inline data area, zero the length and store flags and type.
  * When flags contains M_PKTHDR the packet header is set up through
  * m_pkthdr_init() and its result is returned; otherwise 0 is returned.
  * The zone and size arguments are not used here.
  *
  * Inline because the consumer text overhead will be roughly the same to
  * initialize or call a function with this many parameters and M_PKTHDR
  * should go away with constant propagation for !MGETHDR.
  */
 static __inline int
 m_init(struct mbuf *m, uma_zone_t zone, int size, int how, short type,
     int flags)
 {
 
 	m->m_type = type;
 	m->m_flags = flags;
 	m->m_len = 0;
 	m->m_data = m->m_dat;
 	m->m_nextpkt = NULL;
 	m->m_next = NULL;
 
 	/* Plain mbufs are done; packet-header mbufs need their header set up. */
 	if ((flags & M_PKTHDR) == 0)
 		return (0);
 	return (m_pkthdr_init(m, how));
 }
 
 static __inline struct mbuf *
 m_get(int how, short type)
 {
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	return (uma_zalloc_arg(zone_mbuf, &args, how));
 }
 
 /*
  * XXX This should be deprecated, very little use.
  */
 static __inline struct mbuf *
 m_getclr(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m != NULL)
 		bzero(m->m_data, MLEN);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr(int how, short type)
 {
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type;
 	return (uma_zalloc_arg(zone_mbuf, &args, how));
 }
 
 static __inline struct mbuf *
 m_getcl(int how, short type, int flags)
 {
 	struct mb_args args;
 
 	args.flags = flags;
 	args.type = type;
 	return (uma_zalloc_arg(zone_pack, &args, how));
 }
 
 /*
  * Request a cluster from zone_clust for mbuf m.  A message is printed if m
  * already has M_EXT set.  ext_buf is cleared before the request, so the
  * caller can detect failure by ext_buf still being NULL afterwards.
  */
 static __inline void
 m_clget(struct mbuf *m, int how)
 {
 
 	if ((m->m_flags & M_EXT) != 0)
 		printf("%s: %p mbuf already has external storage\n", __func__, m);
 
 	m->m_ext.ext_buf = (char *)NULL;
 	uma_zalloc_arg(zone_clust, m, how);
 	if (m->m_ext.ext_buf != NULL || (how & M_NOWAIT) == 0)
 		return;
 
 	/*
 	 * On a cluster allocation failure, drain the packet zone and retry,
 	 * we might be able to loosen a few clusters up on the drain.
 	 */
 	zone_drain(zone_pack);
 	uma_zalloc_arg(zone_clust, m, how);
 }
 
 /*
  * m_cljget() is different from m_clget() as it can allocate clusters without
  * attaching them to an mbuf.  In that case the return value is the pointer
  * to the cluster of the requested size.  If an mbuf was specified, it gets
  * the cluster attached to it and the return value can be safely ignored.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 static __inline void *
 m_cljget(struct mbuf *m, int how, int size)
 {
 
 	if (m != NULL) {
 		if ((m->m_flags & M_EXT) != 0)
 			printf("%s: %p mbuf already has external storage\n",
 			    __func__, m);
 		m->m_ext.ext_buf = NULL;
 	}
 
 	/* m_getzone() panics on an unsupported size. */
 	return (uma_zalloc_arg(m_getzone(size), m, how));
 }
 
 /*
  * Attach the already-allocated cluster cl to mbuf m as external storage.
  *
  * type must be one of the EXT_* cluster types handled below; it selects
  * the recorded buffer size and the zone handed to uma_find_refcnt().  Any
  * other type panics.  The free routine and its arguments are cleared,
  * ext_flags is zeroed and M_EXT is set on the mbuf.
  */
 static __inline void
 m_cljset(struct mbuf *m, void *cl, int type)
 {
 	uma_zone_t cl_zone;
 	int cl_size;
 
 	switch (type) {
 	case EXT_CLUSTER:
 		cl_zone = zone_clust;
 		cl_size = MCLBYTES;
 		break;
 #if MJUMPAGESIZE != MCLBYTES
 	case EXT_JUMBOP:
 		cl_zone = zone_jumbop;
 		cl_size = MJUMPAGESIZE;
 		break;
 #endif
 	case EXT_JUMBO9:
 		cl_zone = zone_jumbo9;
 		cl_size = MJUM9BYTES;
 		break;
 	case EXT_JUMBO16:
 		cl_zone = zone_jumbo16;
 		cl_size = MJUM16BYTES;
 		break;
 	default:
 		panic("%s: unknown cluster type %d", __func__, type);
 	}
 
 	/* Data starts at the beginning of the newly attached cluster. */
 	m->m_ext.ext_buf = cl;
 	m->m_data = m->m_ext.ext_buf;
 
 	/* No custom free routine or arguments are recorded. */
 	m->m_ext.ext_arg2 = NULL;
 	m->m_ext.ext_arg1 = NULL;
 	m->m_ext.ext_free = NULL;
 
 	m->m_ext.ext_size = cl_size;
 	m->m_ext.ext_type = type;
 	m->m_ext.ext_flags = 0;
 	m->m_ext.ext_cnt = uma_find_refcnt(cl_zone, cl);
 	m->m_flags |= M_EXT;
 }
 
 /*
  * Set the type field of mbuf m to new_type.  Nothing else is touched.
  */
 static __inline void
 m_chtype(struct mbuf *m, short new_type)
 {
 
 	m->m_type = new_type;
 }
 
 static __inline void
 m_clrprotoflags(struct mbuf *m)
 {
 
 	while (m) {
 		m->m_flags &= ~M_PROTOFLAGS;
 		m = m->m_next;
 	}
 }
 
 static __inline struct mbuf *
 m_last(struct mbuf *m)
 {
 
 	while (m->m_next)
 		m = m->m_next;
 	return (m);
 }
 
 /*
  * mbuf, cluster, and external object allocation macros (for compatibility
  * purposes).
  */
 #define	M_MOVE_PKTHDR(to, from)	m_move_pkthdr((to), (from))
 #define	MGET(m, how, type)	((m) = m_get((how), (type)))
 #define	MGETHDR(m, how, type)	((m) = m_gethdr((how), (type)))
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, arg1, arg2, flags, type)		\
     (void )m_extadd((m), (caddr_t)(buf), (size), (free), (arg1), (arg2),\
     (flags), (type), M_NOWAIT)
 #define	m_getm(m, len, how, type)					\
     m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can
  * be both the local data payload, or an external buffer area, depending on
  * whether M_EXT is set).
  *
  * That is: M_RDONLY is clear, and either the mbuf has no external storage
  * or the external storage's reference count is exactly 1.  Note that m is
  * evaluated more than once.
  */
 #define	M_WRITABLE(m)	(!((m)->m_flags & M_RDONLY) &&			\
 			 (!(((m)->m_flags & M_EXT)) ||			\
 			 (*((m)->m_ext.ext_cnt) == 1)) )
 
 /* Check if the supplied mbuf has a packet header, or else panic. */
 #define	M_ASSERTPKTHDR(m)						\
 	KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR,			\
 	    ("%s: no mbuf packet header!", __func__))
 
 /*
  * Ensure that the supplied mbuf is a valid, non-free mbuf.
  *
  * XXX: Broken at the moment.  Need some UMA magic to make it work again.
  * The condition below masks the flags with 0 and therefore always holds.
  * The argument is parenthesized so that the cast applies to the whole
  * expression passed in.
  */
 #define	M_ASSERTVALID(m)						\
 	KASSERT((((struct mbuf *)(m))->m_flags & 0) == 0,		\
 	    ("%s: attempted use of a free mbuf!", __func__))
 
 /*
- * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place an
- * object of the specified size at the end of the mbuf, longword aligned.
- */
-#define	M_ALIGN(m, len) do {						\
-	KASSERT(!((m)->m_flags & (M_PKTHDR|M_EXT)),			\
-		("%s: M_ALIGN not normal mbuf", __func__));		\
-	KASSERT((m)->m_data == (m)->m_dat,				\
-		("%s: M_ALIGN not a virgin mbuf", __func__));		\
-	(m)->m_data += (MLEN - (len)) & ~(sizeof(long) - 1);		\
-} while (0)
-
-/*
- * As above, for mbufs allocated with m_gethdr/MGETHDR or initialized by
- * M_DUP/MOVE_PKTHDR.
- */
-#define	MH_ALIGN(m, len) do {						\
-	KASSERT((m)->m_flags & M_PKTHDR && !((m)->m_flags & M_EXT),	\
-		("%s: MH_ALIGN not PKTHDR mbuf", __func__));		\
-	KASSERT((m)->m_data == (m)->m_pktdat,				\
-		("%s: MH_ALIGN not a virgin mbuf", __func__));		\
-	(m)->m_data += (MHLEN - (len)) & ~(sizeof(long) - 1);		\
-} while (0)
-
-/*
- * As above, for mbuf with external storage.
- */
-#define	MEXT_ALIGN(m, len) do {						\
-	KASSERT((m)->m_flags & M_EXT,					\
-		("%s: MEXT_ALIGN not an M_EXT mbuf", __func__));	\
-	KASSERT((m)->m_data == (m)->m_ext.ext_buf,			\
-		("%s: MEXT_ALIGN not a virgin mbuf", __func__));	\
-	(m)->m_data += ((m)->m_ext.ext_size - (len)) &			\
-	    ~(sizeof(long) - 1); 					\
-} while (0)
-
-/*
  * Return the address of the start of the buffer associated with an mbuf,
  * handling external storage, packet-header mbufs, and regular data mbufs.
  */
 #define	M_START(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
 	 ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :		\
 	 &(m)->m_dat[0])
 
 /*
  * Return the size of the buffer associated with an mbuf, handling external
  * storage, packet-header mbufs, and regular data mbufs.
  */
 #define	M_SIZE(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
 	 ((m)->m_flags & M_PKTHDR) ? MHLEN :				\
 	 MLEN)
+
+/*
+ * Set the m_data pointer of a newly allocated mbuf to place an object of the
+ * specified size at the end of the mbuf, longword aligned.
+ *
+ * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
+ * separate macros, each asserting that it was called at the proper moment.
+ * This required callers to themselves test the storage type and call the
+ * right one.  Rather than require callers to be aware of those layout
+ * decisions, we centralize here.
+ */
+static __inline void
+m_align(struct mbuf *m, int len)
+{
+#ifdef INVARIANTS
+	const char *msg = "%s: not a virgin mbuf";
+#endif
+	int slack;
+
+	/* Only valid while m_data still points at the start of the buffer. */
+	KASSERT(m->m_data == M_START(m), (msg, __func__));
+
+	/* Bytes left over once len bytes are reserved at the buffer's end. */
+	if ((m->m_flags & M_EXT) != 0)
+		slack = m->m_ext.ext_size - len;
+	else if ((m->m_flags & M_PKTHDR) != 0)
+		slack = MHLEN - len;
+	else
+		slack = MLEN - len;
+
+	/* Round the offset down to a multiple of sizeof(long). */
+	m->m_data += slack & ~(sizeof(long) - 1);
+}
+
+/*
+ * Historical names, all mapped onto m_align().  Arguments are parenthesized
+ * like in the other compatibility macros in this file.
+ */
+#define	M_ALIGN(m, len)		m_align((m), (len))
+#define	MH_ALIGN(m, len)	m_align((m), (len))
+#define	MEXT_ALIGN(m, len)	m_align((m), (len))
 
 /*
  * Compute the amount of space available before the current start of data in
  * an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_LEADINGSPACE(m)						\
 	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
  * Compute the amount of space available after the end of data in an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_TRAILINGSPACE(m)						\
 	(M_WRITABLE(m) ?						\
 	    ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
 
 /*
  * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
  * allocated, how specifies whether to wait.  If the allocation fails, the
  * original mbuf chain is freed and m is set to NULL.
  *
  * m must be an lvalue: its address is taken and the resulting chain head is
  * stored back through it.  plen and how are each captured once into locals,
  * and only those locals are used afterwards, so arguments with side effects
  * are evaluated a single time.
  */
 #define	M_PREPEND(m, plen, how) do {					\
 	struct mbuf **_mmp = &(m);					\
 	struct mbuf *_mm = *_mmp;					\
 	int _mplen = (plen);						\
 	int __mhow = (how);						\
 									\
 	MBUF_CHECKSLEEP(__mhow);					\
 	if (M_LEADINGSPACE(_mm) >= _mplen) {				\
 		_mm->m_data -= _mplen;					\
 		_mm->m_len += _mplen;					\
 	} else								\
 		_mm = m_prepend(_mm, _mplen, __mhow);			\
 	if (_mm != NULL && _mm->m_flags & M_PKTHDR)			\
 		_mm->m_pkthdr.len += _mplen;				\
 	*_mmp = _mm;							\
 } while (0)
 
 /*
  * Change mbuf to new type.  This is a relatively expensive operation and
  * should be avoided.
  */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
 
 /* Compatibility with 4.3. */
 #define	m_copy(m, o, l)	m_copym((m), (o), (l), M_NOWAIT)
 
 extern int		max_datalen;	/* MHLEN - max_hdr */
 extern int		max_hdr;	/* Largest link + protocol header */
 extern int		max_linkhdr;	/* Largest link-level header */
 extern int		max_protohdr;	/* Largest protocol header */
 extern int		nmbclusters;	/* Maximum number of clusters */
 
 /*
  * Forward declaration: the prototypes below (m_mbuftouio, m_uiotombuf)
  * only use pointers to struct uio, so the full definition is not needed.
  */
 struct uio;
 
 /*
  * Prototypes for the mbuf routines.  Only the declarations appear here;
  * the definitions are not in this part of the file.
  */
 void		 m_adj(struct mbuf *, int);
 void		 m_align(struct mbuf *, int);
 int		 m_apply(struct mbuf *, int, int,
 		    int (*)(void *, void *, u_int), void *);
 int		 m_append(struct mbuf *, int, c_caddr_t);
 void		 m_cat(struct mbuf *, struct mbuf *);
 void		 m_catpkt(struct mbuf *, struct mbuf *);
 int		 m_extadd(struct mbuf *, caddr_t, u_int,
 		    void (*)(struct mbuf *, void *, void *), void *, void *,
 		    int, int, int);
 struct mbuf	*m_collapse(struct mbuf *, int, int);
 void		 m_copyback(struct mbuf *, int, int, c_caddr_t);
 void		 m_copydata(const struct mbuf *, int, int, caddr_t);
 struct mbuf	*m_copym(struct mbuf *, int, int, int);
 struct mbuf	*m_copymdata(struct mbuf *, struct mbuf *,
 		    int, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 struct mbuf	*m_dup(struct mbuf *, int);
 int		 m_dup_pkthdr(struct mbuf *, struct mbuf *, int);
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
 struct mbuf	*m_get2(int, int, short, int);
 struct mbuf	*m_getjcl(int, short, int, int);
 struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 int		 m_mbuftouio(struct uio *, struct mbuf *, int);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
 /* m_prepend() is the fallback called by the M_PREPEND() macro. */
 struct mbuf	*m_prepend(struct mbuf *, int, int);
 void		 m_print(const struct mbuf *, int);
 struct mbuf	*m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 struct mbuf	*m_unshare(struct mbuf *, int);
 
 /*-
  * Network packets may have annotations attached by affixing a list of
  * "packet tags" to the pkthdr structure.  Packet tags are dynamically
  * allocated semi-opaque data structures that have a fixed header
  * (struct m_tag) that specifies the size of the memory block and a
  * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
  * unsigned value used to identify a module or ABI.  By convention this value
  * is chosen as the date+time that the module is created, expressed as the
  * number of seconds since the epoch (e.g., using date -u +'%s').  The type
  * value is an ABI/module-specific value that identifies a particular
  * annotation and is private to the module.  For compatibility with systems
  * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
  * MTAG_ABI_COMPAT is used to implement m_tag_get and m_tag_find
  * compatibility shim functions and several tag types are defined below.
  * Users that do not require compatibility should use a private cookie value
  * so that packet tag-related definitions can be maintained privately.
  *
  * Note that the packet tag returned by m_tag_alloc has the default memory
  * alignment implemented by malloc.  To reference private data one can use a
  * construct like:
  *
  *	struct m_tag *mtag = m_tag_alloc(...);
  *	struct foo *p = (struct foo *)(mtag+1);
  *
  * if the alignment of struct m_tag is sufficient for referencing members of
  * struct foo.  Otherwise it is necessary to embed struct m_tag within the
  * private data structure to ensure proper alignment; e.g.,
  *
  *	struct foo {
  *		struct m_tag	tag;
  *		...
  *	};
  *	struct foo *p = (struct foo *) m_tag_alloc(...);
  *	struct m_tag *mtag = &p->tag;
  */
 
 /*
  * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
  * tags are expected to ``vanish'' when they pass through a network
  * interface.  For most interfaces this happens normally as the tags are
  * reclaimed when the mbuf is freed.  However, in some special cases
  * reclaiming must be done manually.  An example is packets that pass through
  * the loopback interface.  Also, one must be careful to do this when
  * ``turning around'' packets (e.g., icmp_reflect).
  *
  * To mark a tag persistent, bit-or this flag in when defining the tag id
  * (as PACKET_TAG_MACLABEL and PACKET_TAG_PF do).  The tag will then be
  * treated as described above.
  */
 #define	MTAG_PERSISTENT				0x800
 
 #define	PACKET_TAG_NONE				0  /* Nadda */
 
 /*
  * Packet tags for use with MTAG_ABI_COMPAT.
  *
  * The numbering is not contiguous; some values are not assigned here.
  * Ids that include MTAG_PERSISTENT mark the tag as persistent.
  */
 #define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
 #define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
 #define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
 #define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
 #define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
 #define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
 #define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
 #define	PACKET_TAG_GIF				8  /* GIF processing done */
 #define	PACKET_TAG_GRE				9  /* GRE processing done */
 #define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
 #define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
 #define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
 #define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
 #define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
 #define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
 #define	PACKET_TAG_DIVERT			17 /* divert info */
 #define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
 #define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
 #define	PACKET_TAG_PF		(21 | MTAG_PERSISTENT) /* PF/ALTQ information */
 #define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
 #define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
 #define	PACKET_TAG_CARP				28 /* CARP info */
 #define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
 #define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
 
 /* Specific cookies and tags. */
 
 /*
  * Packet tag routines.
  *
  * Declarations only; the definitions are not in this part of the file.
  * The inline helpers further down (m_tag_get, m_tag_find, m_free) call
  * m_tag_alloc, m_tag_locate and m_tag_delete_chain respectively.
  */
 struct m_tag	*m_tag_alloc(u_int32_t, int, int, int);
 void		 m_tag_delete(struct mbuf *, struct m_tag *);
 void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
 void		 m_tag_free_default(struct m_tag *);
 struct m_tag	*m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *);
 struct m_tag	*m_tag_copy(struct m_tag *, int);
 int		 m_tag_copy_chain(struct mbuf *, struct mbuf *, int);
 void		 m_tag_delete_nonpersistent(struct mbuf *);
 
 /*
  * Initialize the list of tags associated with an mbuf.
  *
  * This only runs SLIST_INIT on m->m_pkthdr.tags; it does not call any tag
  * free routine.
  */
 static __inline void
 m_tag_init(struct mbuf *m)
 {
 
 	SLIST_INIT(&m->m_pkthdr.tags);
 }
 
 /*
  * Fill in the identifying fields of tag t: the ABI/module cookie, the type
  * (stored as the tag id) and the length.  The free method is left untouched
  * and must be set by the caller.
  *
  * XXX probably should be called m_tag_init, but that was already taken.
  */
 static __inline void
 m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len)
 {
 
 	t->m_tag_cookie = cookie;
 	t->m_tag_id = type;
 	t->m_tag_len = len;
 }
 
 /*
  * Release a tag by invoking the free method stored in the tag itself,
  * passing the tag as the only argument.
  */
 static __inline void
 m_tag_free(struct m_tag *t)
 {
 
 	t->m_tag_free(t);
 }
 
 /*
  * Return the tag at the head of the mbuf's packet-header tag list, as
  * given by SLIST_FIRST.
  */
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
 	struct m_tag *head;
 
 	head = SLIST_FIRST(&m->m_pkthdr.tags);
 	return (head);
 }
 
 /*
  * Return the tag that follows t on its tag list, as given by SLIST_NEXT.
  * The mbuf argument is accepted but not consulted; only t's link is read.
  */
 static __inline struct m_tag *
 m_tag_next(struct mbuf *m, struct m_tag *t)
 {
 	struct m_tag *following;
 
 	following = SLIST_NEXT(t, m_tag_link);
 	return (following);
 }
 
 /*
  * Prepend a tag to the list of tags associated with an mbuf.
  *
  * The tag is inserted at the head of m->m_pkthdr.tags through its
  * m_tag_link field.
  */
 static __inline void
 m_tag_prepend(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
 /*
  * Unlink a tag from the list of tags associated with an mbuf.
  *
  * The tag is only removed from m->m_pkthdr.tags; this function does not
  * call the tag's free method.
  */
 static __inline void
 m_tag_unlink(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link);
 }
 
 /*
  * These are for OpenBSD compatibility.
  *
  * MTAG_ABI_COMPAT is the cookie that m_tag_get() and m_tag_find() pass to
  * m_tag_alloc() and m_tag_locate().
  */
 #define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
 
 /*
  * Allocate a tag under the compatibility cookie: forwards type, length and
  * wait to m_tag_alloc() with MTAG_ABI_COMPAT as the cookie and returns
  * whatever m_tag_alloc() returns.
  */
 static __inline struct m_tag *
 m_tag_get(int type, int length, int wait)
 {
 	struct m_tag *tag;
 
 	tag = m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
 	return (tag);
 }
 
 /*
  * Look up a tag of the given type under the compatibility cookie.  An
  * mbuf whose tag list is empty yields NULL without calling m_tag_locate();
  * otherwise the result of m_tag_locate() with MTAG_ABI_COMPAT is returned.
  */
 static __inline struct m_tag *
 m_tag_find(struct mbuf *m, int type, struct m_tag *start)
 {
 
 	if (SLIST_EMPTY(&m->m_pkthdr.tags))
 		return ((struct m_tag *)NULL);
 	return (m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
 /*
  * Dispose of a single mbuf and return the mbuf that followed it (the
  * m_next value read before anything is released).
  *
  * - When both M_PKTHDR and M_NOFREE are set, the tag chain is deleted
  *   first via m_tag_delete_chain().
  * - An mbuf with M_EXT set is handed to mb_free_ext().
  * - Otherwise the mbuf goes back to zone_mbuf, unless M_NOFREE is set.
  */
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
 	struct mbuf *successor;
 
 	/* Remember the link now; m must not be touched once released. */
 	successor = m->m_next;
 
 	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
 		m_tag_delete_chain(m, NULL);
 
 	if (m->m_flags & M_EXT) {
 		mb_free_ext(m);
 		return (successor);
 	}
 	if ((m->m_flags & M_NOFREE) == 0)
 		uma_zfree(zone_mbuf, m);
 	return (successor);
 }
 
 /*
  * Return the fibnum stored in the packet header of m.  The mbuf is
  * expected to have M_PKTHDR set; this is checked with KASSERT.
  */
 static __inline int
 rt_m_getfib(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_PKTHDR,
 	    ("Attempt to get FIB from non header mbuf."));
 	return (m->m_pkthdr.fibnum);
 }
 
 /* Read the fibnum from an mbuf's packet header; see rt_m_getfib(). */
 #define M_GETFIB(_m)   rt_m_getfib(_m)
 
 /*
  * Store _fib into the fibnum field of _m's packet header, after asserting
  * with KASSERT that _m has M_PKTHDR set.
  *
  * NOTE: _m appears twice in the expansion (once inside the KASSERT), so
  * avoid passing an expression with side effects.
  */
 #define M_SETFIB(_m, _fib) do {						\
         KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf."));	\
 	((_m)->m_pkthdr.fibnum) = (_fib);				\
 } while (0)
 
 #endif /* _KERNEL */
 
 /*
  * M_PROFILE(m) calls m_profile(m) when MBUF_PROFILING is defined and
  * expands to nothing otherwise.
  */
 #ifdef MBUF_PROFILING
  void m_profile(struct mbuf *m);
  #define M_PROFILE(m) m_profile(m)
 #else
  #define M_PROFILE(m)
 #endif
 
 
 #endif /* !_SYS_MBUF_H_ */