Index: head/sys/contrib/pf/net/if_pfsync.c
===================================================================
--- head/sys/contrib/pf/net/if_pfsync.c	(revision 171636)
+++ head/sys/contrib/pf/net/if_pfsync.c	(revision 171637)
@@ -1,2329 +1,2329 @@
 /*	$OpenBSD: if_pfsync.c,v 1.73 2006/11/16 13:13:38 henning Exp $	*/
 
 /*
  * Copyright (c) 2002 Michael Shalayeff
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifdef __FreeBSD__
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_carp.h"
 #include "opt_bpf.h"
 #include "opt_pf.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Translate the DEV_* option macros (presumably supplied by the opt_*.h
  * headers included above -- confirm against the kernel config) into the
  * N* counts that the rest of this file tests with "#if N... > 0".  An
  * option that is not defined yields a count of 0.
  */
 #ifdef DEV_BPF
 #define	NBPFILTER	DEV_BPF
 #else
 #define	NBPFILTER	0
 #endif
 
 #ifdef DEV_PFSYNC
 #define	NPFSYNC		DEV_PFSYNC
 #else
 #define	NPFSYNC		0
 #endif
 
 #ifdef DEV_CARP
 #define	NCARP		DEV_CARP
 #else
 #define	NCARP		0
 #endif
 #endif /* __FreeBSD__ */
 
 #include <sys/param.h>
 #ifdef __FreeBSD__
 #include <sys/priv.h>
 #endif
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #ifdef __FreeBSD__
 #include <sys/endian.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/sockio.h>
 #include <sys/taskqueue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #else
 #include <sys/ioctl.h>
 #include <sys/timeout.h>
 #endif
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #ifdef __FreeBSD__
 #include <net/if_clone.h>
 #endif
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 
 #ifdef	INET
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #endif
 
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif /* INET6 */
 
 #ifndef __FreeBSD__
 #include "carp.h"
 #endif
 #if NCARP > 0
 #include <netinet/ip_carp.h>
 #endif
 
 #include <net/pfvar.h>
 #include <net/if_pfsync.h>
 
 #ifndef __FreeBSD__
 #include "bpfilter.h"
 #include "pfsync.h"
 #endif
 
 /*
  * Smallest MTU accepted by the SIOCSIFMTU handler: room for the pfsync
  * header plus one struct pf_state.
  */
 #define PFSYNC_MINMTU	\
     (sizeof(struct pfsync_header) + sizeof(struct pf_state))
 
 /*
  * DPRINTF takes a fully parenthesised printf argument list, e.g.
  * DPRINTF(("fmt %d\n", v)).  It expands to nothing unless PFSYNCDEBUG is
  * defined, and then prints only while pfsyncdebug is non-zero.
  */
 #ifdef PFSYNCDEBUG
 #define DPRINTF(x)    do { if (pfsyncdebug) printf x ; } while (0)
 int pfsyncdebug;
 #else
 #define DPRINTF(x)
 #endif
 
 /*
  * The pfsync softc (only unit 0 can be created); NULL until
  * pfsync_clone_create() succeeds and again after pfsync_clone_destroy().
  */
 struct pfsync_softc	*pfsyncif = NULL;
 /* Packet and error counters updated by the input path. */
 struct pfsyncstats	 pfsyncstats;
 #ifdef __FreeBSD__
 /* Export the counters as the read/write sysctl node net.inet.pfsync.stats. */
 SYSCTL_DECL(_net_inet_pfsync);
 SYSCTL_STRUCT(_net_inet_pfsync, 0, stats, CTLFLAG_RW,
     &pfsyncstats, pfsyncstats,
     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
 #endif
 
 /* Interface creation and teardown. */
 void	pfsyncattach(int);
 #ifdef __FreeBSD__
 int	pfsync_clone_create(struct if_clone *, int, caddr_t);
 void	pfsync_clone_destroy(struct ifnet *);
 #else
 int	pfsync_clone_create(struct if_clone *, int);
 int	pfsync_clone_destroy(struct ifnet *);
 #endif
 void	pfsync_setmtu(struct pfsync_softc *, int);
 /* Helpers used while importing state records received from a peer. */
 int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
 	    struct pf_state_peer *);
 int	pfsync_insert_net_state(struct pfsync_state *, u_int8_t);
 #ifdef PFSYNC_TDB
 void	pfsync_update_net_tdb(struct pfsync_tdb *);
 #endif
 /* ifnet method implementations installed by pfsync_clone_create(). */
 int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
 	    struct rtentry *);
 int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
 void	pfsyncstart(struct ifnet *);
 
 /* Packet construction, transmission and timer callbacks. */
 struct mbuf *pfsync_get_mbuf(struct pfsync_softc *, u_int8_t, void **);
 int	pfsync_request_update(struct pfsync_state_upd *, struct in_addr *);
 int	pfsync_sendout(struct pfsync_softc *);
 #ifdef PFSYNC_TDB
 int	pfsync_tdb_sendout(struct pfsync_softc *);
 #endif
 int	pfsync_sendout_mbuf(struct pfsync_softc *, struct mbuf *);
 void	pfsync_timeout(void *);
 #ifdef PFSYNC_TDB
 void	pfsync_tdb_timeout(void *);
 #endif
 void	pfsync_send_bus(struct pfsync_softc *, u_int8_t);
 void	pfsync_bulk_update(void *);
 void	pfsync_bulkfail(void *);
 
 #ifdef __FreeBSD__
 void	pfsync_ifdetach(void *, struct ifnet *);
 void	pfsync_senddef(void *, int);
 
 /*
  * Compatibility shims so the shared code can keep the OpenBSD names:
  * betoh64 becomes be64toh with a cast that matches the %016llx format
  * used when state ids are printed, and timeout_del becomes callout_stop.
  */
 /* XXX: ugly */
 #define	betoh64		(unsigned long long)be64toh
 #define	timeout_del	callout_stop
 #endif
 
 /*
  * Set to 1 when the interface is created and when a valid bulk update
  * end is received; cleared when a bulk update is requested.
  */
 int	pfsync_sync_ok;
 #ifndef __FreeBSD__
 extern int ifqmaxlen;
 #endif
 
 /* Declare the "pfsync" interface cloner (pfsync_cloner). */
 #ifdef __FreeBSD__
 IFC_SIMPLE_DECLARE(pfsync, 1);
 #else
 struct if_clone	pfsync_cloner =
     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
 #endif
 
 /*
  * Register the pfsync interface cloner.  The npfsync argument is not
  * used.
  */
 void
 pfsyncattach(int npfsync)
 {
 	(void)npfsync;
 
 	if_clone_attach(&pfsync_cloner);
 }
 
 /*
  * Create the pfsync interface.
  *
  * Only unit 0 is accepted.  The softc is allocated and published through
  * the global pfsyncif, its fields and timers are initialised, and the
  * interface is attached (and, when configured, added to the "carp" group
  * and attached to bpf).
  *
  * Returns 0 on success, EINVAL for a non-zero unit, ENOMEM when the softc
  * cannot be allocated and ENOSPC when a later FreeBSD-specific allocation
  * or registration fails.  On every failure path pfsyncif is left NULL so
  * that pfsync_input(), which only tests the pointer for NULL, can never
  * dereference a freed softc.
  */
 int
 #ifdef __FreeBSD__
 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
 #else
 pfsync_clone_create(struct if_clone *ifc, int unit)
 #endif
 {
 	struct ifnet *ifp;
 
 	if (unit != 0)
 		return (EINVAL);
 
 	pfsync_sync_ok = 1;
 	if ((pfsyncif = malloc(sizeof(*pfsyncif), M_DEVBUF, M_NOWAIT)) == NULL)
 		return (ENOMEM);
 	bzero(pfsyncif, sizeof(*pfsyncif));
 #ifdef __FreeBSD__
 	/* Multicast membership array used by the softc's ip_moptions. */
 	if ((pfsyncif->sc_imo.imo_membership = (struct in_multi **)malloc(
 	    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_DEVBUF,
 	    M_NOWAIT)) == NULL) {
 		free(pfsyncif, M_DEVBUF);
 		pfsyncif = NULL;	/* do not leave a dangling global */
 		return (ENOSPC);
 	}
 	pfsyncif->sc_imo.imo_mfilters = NULL;
 	pfsyncif->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
 	pfsyncif->sc_imo.imo_multicast_vif = -1;
 
 	ifp = pfsyncif->sc_ifp = if_alloc(IFT_PFSYNC);
 	if (ifp == NULL) {
 		free(pfsyncif->sc_imo.imo_membership, M_DEVBUF);
 		free(pfsyncif, M_DEVBUF);
 		pfsyncif = NULL;	/* do not leave a dangling global */
 		return (ENOSPC);
 	}
 	if_initname(ifp, ifc->ifc_name, unit);
 
 	/* Get notified when an interface departs. */
 	pfsyncif->sc_detachtag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    pfsync_ifdetach, pfsyncif, EVENTHANDLER_PRI_ANY);
 	if (pfsyncif->sc_detachtag == NULL) {
 		if_free(ifp);
 		free(pfsyncif->sc_imo.imo_membership, M_DEVBUF);
 		free(pfsyncif, M_DEVBUF);
 		pfsyncif = NULL;	/* do not leave a dangling global */
 		return (ENOSPC);
 	}
 
 	/* Private send queue serviced by the sc_send_task task. */
 	pfsyncif->sc_ifq.ifq_maxlen = ifqmaxlen;
 	mtx_init(&pfsyncif->sc_ifq.ifq_mtx, ifp->if_xname,
 	    "pfsync send queue", MTX_DEF);
 	TASK_INIT(&pfsyncif->sc_send_task, 0, pfsync_senddef, pfsyncif);
 #endif
 	pfsyncif->sc_mbuf = NULL;
 	pfsyncif->sc_mbuf_net = NULL;
 #ifdef PFSYNC_TDB
 	pfsyncif->sc_mbuf_tdb = NULL;
 #endif
 	pfsyncif->sc_statep.s = NULL;
 	pfsyncif->sc_statep_net.s = NULL;
 #ifdef PFSYNC_TDB
 	pfsyncif->sc_statep_tdb.t = NULL;
 #endif
 	pfsyncif->sc_maxupdates = 128;
 	/* Default peer and send address: the pfsync multicast group. */
 #ifdef __FreeBSD__
 	pfsyncif->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
 	pfsyncif->sc_sendaddr.s_addr = htonl(INADDR_PFSYNC_GROUP);
 #else
 	pfsyncif->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
 	pfsyncif->sc_sendaddr.s_addr = INADDR_PFSYNC_GROUP;
 #endif
 	pfsyncif->sc_ureq_received = 0;
 	pfsyncif->sc_ureq_sent = 0;
 	pfsyncif->sc_bulk_send_next = NULL;
 	pfsyncif->sc_bulk_terminator = NULL;
 #ifndef __FreeBSD__
 	ifp = &pfsyncif->sc_if;
 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
 #endif
 	ifp->if_softc = pfsyncif;
 	ifp->if_ioctl = pfsyncioctl;
 	ifp->if_output = pfsyncoutput;
 	ifp->if_start = pfsyncstart;
 	ifp->if_type = IFT_PFSYNC;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_hdrlen = PFSYNC_HDRLEN;
 	pfsync_setmtu(pfsyncif, ETHERMTU);
 #ifdef __FreeBSD__
-	callout_init(&pfsyncif->sc_tmo, NET_CALLOUT_MPSAFE);
+	callout_init(&pfsyncif->sc_tmo, CALLOUT_MPSAFE);
 #ifdef PFSYNC_TDB
-	callout_init(&pfsyncif->sc_tdb_tmo, NET_CALLOUT_MPSAFE);
+	callout_init(&pfsyncif->sc_tdb_tmo, CALLOUT_MPSAFE);
 #endif
-	callout_init(&pfsyncif->sc_bulk_tmo, NET_CALLOUT_MPSAFE);
-	callout_init(&pfsyncif->sc_bulkfail_tmo, NET_CALLOUT_MPSAFE);
+	callout_init(&pfsyncif->sc_bulk_tmo, CALLOUT_MPSAFE);
+	callout_init(&pfsyncif->sc_bulkfail_tmo, CALLOUT_MPSAFE);
 #else
 	timeout_set(&pfsyncif->sc_tmo, pfsync_timeout, pfsyncif);
 	timeout_set(&pfsyncif->sc_tdb_tmo, pfsync_tdb_timeout, pfsyncif);
 	timeout_set(&pfsyncif->sc_bulk_tmo, pfsync_bulk_update, pfsyncif);
 	timeout_set(&pfsyncif->sc_bulkfail_tmo, pfsync_bulkfail, pfsyncif);
 #endif
 	if_attach(ifp);
 #ifndef __FreeBSD__
 	if_alloc_sadl(ifp);
 #endif
 
 #if NCARP > 0
 	if_addgroup(ifp, "carp");
 #endif
 
 #if NBPFILTER > 0
 #ifdef __FreeBSD__
 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 #else
 	bpfattach(&pfsyncif->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 #endif
 #endif
 
 	return (0);
 }
 
 /*
  * Destroy the pfsync interface: on FreeBSD drop the departure event
  * handler and stop the callouts, then detach from bpf and the interface
  * list, release the softc and clear the global pfsyncif pointer.
  * The non-FreeBSD variant returns 0.
  */
 #ifdef __FreeBSD__
 void
 #else
 int
 #endif
 pfsync_clone_destroy(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = pfsyncif;
 
 #ifdef __FreeBSD__
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, sc->sc_detachtag);
 	callout_stop(&sc->sc_tmo);
 #ifdef PFSYNC_TDB
 	callout_stop(&sc->sc_tdb_tmo);
 #endif
 	callout_stop(&sc->sc_bulk_tmo);
 	callout_stop(&sc->sc_bulkfail_tmo);
 	/* XXX: more? */
 #endif
 
 #if NBPFILTER > 0
 	bpfdetach(ifp);
 #endif
 	if_detach(ifp);
 #ifdef __FreeBSD__
 	if_free(ifp);
 	free(sc->sc_imo.imo_membership, M_DEVBUF);
 #endif
 	free(sc, M_DEVBUF);
 	pfsyncif = NULL;
 #ifndef __FreeBSD__
 	return (0);
 #endif
 }
 
 /*
  * if_start handler for the pfsync interface.
  *
  * Nothing queued on the interface send queue is transmitted: every pass
  * records a drop on the queue, dequeues one mbuf and frees it, and the
  * loop ends once the queue yields no more packets.
  */
 void
 pfsyncstart(struct ifnet *ifp)
 {
 	struct mbuf *pkt;
 #ifndef __FreeBSD__
 	int spl;
 #endif
 
 	do {
 #ifdef __FreeBSD__
 		IF_LOCK(&ifp->if_snd);
 		_IF_DROP(&ifp->if_snd);
 		_IF_DEQUEUE(&ifp->if_snd, pkt);
 		IF_UNLOCK(&ifp->if_snd);
 #else
 		spl = splnet();
 		IF_DROP(&ifp->if_snd);
 		IF_DEQUEUE(&ifp->if_snd, pkt);
 		splx(spl);
 #endif
 
 		if (pkt != NULL)
 			m_freem(pkt);
 	} while (pkt != NULL);
 }
 
 /*
  * Make sure the local peer "d" has a scrub buffer when the wire peer "s"
  * carries scrub information.
  *
  * Returns 0 when no buffer is needed, one already exists, or a new
  * zeroed buffer was attached to d; returns ENOMEM when the pool has no
  * memory available (the allocation does not wait).
  */
 int
 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
     struct pf_state_peer *d)
 {
 	/* Nothing to do without scrub data or with a buffer in place. */
 	if (!s->scrub.scrub_flag || d->scrub != NULL)
 		return (0);
 
 	d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
 	if (d->scrub == NULL)
 		return (ENOMEM);
 
 	bzero(d->scrub, sizeof(*d->scrub));
 	return (0);
 }
 
 /*
  * Create a local pf state from a state record received from a peer.
  *
  * sp		state record as received (multi-byte fields in network
  *		byte order)
  * chksum_flag	non-zero when the sender's ruleset checksum matched ours
  *
  * Returns 0 on success, and also when the record is skipped because its
  * interface cannot be resolved.  Returns EINVAL for a zero creator id or
  * when pf_insert_state() rejects the state, and ENOMEM when the rule's
  * state limit is reached or state/scrub memory cannot be obtained.
  */
 int
 pfsync_insert_net_state(struct pfsync_state *sp, u_int8_t chksum_flag)
 {
 	struct pf_state	*st = NULL;
 	struct pf_rule *r = NULL;
 	struct pfi_kif	*kif;
 	u_int32_t rulenr;
 
 	/*
 	 * A zero creator id is invalid whatever the debug level is; the
 	 * debug level only decides whether the rejection is logged.
 	 */
 	if (sp->creatorid == 0) {
 		if (pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync_insert_net_state: invalid creator id:"
 			    " %08x\n", ntohl(sp->creatorid));
 		return (EINVAL);
 	}
 
 	kif = pfi_kif_get(sp->ifname);
 	if (kif == NULL) {
 		if (pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync_insert_net_state: "
 			    "unknown interface: %s\n", sp->ifname);
 		/* skip this state */
 		return (0);
 	}
 
 	/*
 	 * If the ruleset checksums match, it's safe to associate the state
 	 * with the rule of that number.  The number comes off the wire, so
 	 * it must still be checked against the size of the active rule
 	 * array before it is used as an index.
 	 */
 	rulenr = ntohl(sp->rule);
 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && chksum_flag &&
 	    rulenr < pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
 		r = pf_main_ruleset.rules[
 		    PF_RULESET_FILTER].active.ptr_array[rulenr];
 	else
 		r = &pf_default_rule;
 
 	/* st stays NULL when the rule already holds max_states states. */
 	if (!r->max_states || r->states < r->max_states)
 		st = pool_get(&pf_state_pl, PR_NOWAIT);
 	if (st == NULL) {
 		pfi_kif_unref(kif, PFI_KIF_REF_NONE);
 		return (ENOMEM);
 	}
 	bzero(st, sizeof(*st));
 
 	/* allocate memory for scrub info */
 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) {
 		pfi_kif_unref(kif, PFI_KIF_REF_NONE);
 		/* dst.scrub is still NULL here; only src may be set */
 		if (st->src.scrub)
 			pool_put(&pf_state_scrub_pl, st->src.scrub);
 		pool_put(&pf_state_pl, st);
 		return (ENOMEM);
 	}
 
 	st->rule.ptr = r;
 	/* XXX get pointers to nat_rule and anchor */
 
 	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
 	r->states++;
 
 	/* fill in the rest of the state entry */
 	pf_state_host_ntoh(&sp->lan, &st->lan);
 	pf_state_host_ntoh(&sp->gwy, &st->gwy);
 	pf_state_host_ntoh(&sp->ext, &st->ext);
 
 	pf_state_peer_ntoh(&sp->src, &st->src);
 	pf_state_peer_ntoh(&sp->dst, &st->dst);
 
 	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
 	/* The wire carries relative times; rebase them on the local clock. */
 	st->creation = time_second - ntohl(sp->creation);
 	st->expire = ntohl(sp->expire) + time_second;
 
 	st->af = sp->af;
 	st->proto = sp->proto;
 	st->direction = sp->direction;
 	st->log = sp->log;
 	st->timeout = sp->timeout;
 	st->allow_opts = sp->allow_opts;
 
 	bcopy(sp->id, &st->id, sizeof(st->id));
 	st->creatorid = sp->creatorid;
 	st->sync_flags = PFSTATE_FROMSYNC;
 
 	if (pf_insert_state(kif, st)) {
 		/* Undo everything done above for this state. */
 		pfi_kif_unref(kif, PFI_KIF_REF_NONE);
 		/* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
 		r->states--;
 		if (st->dst.scrub)
 			pool_put(&pf_state_scrub_pl, st->dst.scrub);
 		if (st->src.scrub)
 			pool_put(&pf_state_scrub_pl, st->src.scrub);
 		pool_put(&pf_state_pl, st);
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Input routine for pfsync packets.
  *
  * m holds the received packet starting at its IP header.  The packet is
  * validated (sync interface configured and pf running, receiving
  * interface, IP TTL, length, pfsync version, action code) and the action
  * named in the pfsync header is then applied to the local state table.
  *
  * The mbuf is consumed in every case.  It is normally freed at "done";
  * the bare returns that follow a failed m_pulldown() rely on
  * m_pulldown() having freed the chain itself (see mbuf(9)).  Every other
  * exit must go through "done".
  */
 void
 #ifdef __FreeBSD__
 pfsync_input(struct mbuf *m, __unused int off)
 #else
 pfsync_input(struct mbuf *m, ...)
 #endif
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct pfsync_header *ph;
 	struct pfsync_softc *sc = pfsyncif;
 	struct pf_state *st;
 	struct pf_state_cmp key;
 	struct pfsync_state *sp;
 	struct pfsync_state_upd *up;
 	struct pfsync_state_del *dp;
 	struct pfsync_state_clr *cp;
 	struct pfsync_state_upd_req *rup;
 	struct pfsync_state_bus *bus;
 #ifdef PFSYNC_TDB
 	struct pfsync_tdb *pt;
 #endif
 	struct in_addr src;
 	struct mbuf *mp;
 	int iplen, action, error, i, s, count, offp, sfail, stale = 0;
 	u_int8_t chksum_flag = 0;
 
 	pfsyncstats.pfsyncs_ipackets++;
 
 	/* verify that we have a sync interface configured */
 	if (!sc || !sc->sc_sync_ifp || !pf_status.running)
 		goto done;
 
 	/* verify that the packet came in on the right interface */
 	if (sc->sc_sync_ifp != m->m_pkthdr.rcvif) {
 		pfsyncstats.pfsyncs_badif++;
 		goto done;
 	}
 
 	/* verify that the IP TTL is 255.  */
 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
 		pfsyncstats.pfsyncs_badttl++;
 		goto done;
 	}
 
 	iplen = ip->ip_hl << 2;
 
 	if (m->m_pkthdr.len < iplen + sizeof(*ph)) {
 		pfsyncstats.pfsyncs_hdrops++;
 		goto done;
 	}
 
 	/* Make the IP and pfsync headers contiguous in the first mbuf. */
 	if (iplen + sizeof(*ph) > m->m_len) {
 		if ((m = m_pullup(m, iplen + sizeof(*ph))) == NULL) {
 			pfsyncstats.pfsyncs_hdrops++;
 			goto done;
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ph = (struct pfsync_header *)((char *)ip + iplen);
 
 	/* verify the version */
 	if (ph->version != PFSYNC_VERSION) {
 		pfsyncstats.pfsyncs_badver++;
 		goto done;
 	}
 
 	action = ph->action;
 	count = ph->count;
 
 	/* make sure it's a valid action code */
 	if (action >= PFSYNC_ACT_MAX) {
 		pfsyncstats.pfsyncs_badact++;
 		goto done;
 	}
 
 	/* Cheaper to grab this now than having to mess with mbufs later */
 	src = ip->ip_src;
 
 	/* Remember whether the sender runs the same ruleset as we do. */
 	if (!bcmp(&ph->pf_chksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
 		chksum_flag++;
 
 	switch (action) {
 	case PFSYNC_ACT_CLR: {
 		struct pf_state *nexts;
 		struct pfi_kif	*kif;
 		u_int32_t creatorid;
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    sizeof(*cp), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 		cp = (struct pfsync_state_clr *)(mp->m_data + offp);
 		creatorid = cp->creatorid;
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		if (cp->ifname[0] == '\0') {
 			/* No interface named: walk every state. */
 			for (st = RB_MIN(pf_state_tree_id, &tree_id);
 			    st; st = nexts) {
 				nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
 				if (st->creatorid == creatorid) {
 					st->sync_flags |= PFSTATE_FROMSYNC;
 					pf_unlink_state(st);
 				}
 			}
 		} else {
 			if ((kif = pfi_kif_get(cp->ifname)) == NULL) {
 #ifdef __FreeBSD__
 				PF_UNLOCK();
 #endif
 				splx(s);
 				/*
 				 * m_pulldown() succeeded, so the mbuf is
 				 * still ours and must be freed.
 				 */
 				goto done;
 			}
 			for (st = RB_MIN(pf_state_tree_lan_ext,
 			    &kif->pfik_lan_ext); st; st = nexts) {
 				nexts = RB_NEXT(pf_state_tree_lan_ext,
 				    &kif->pfik_lan_ext, st);
 				if (st->creatorid == creatorid) {
 					st->sync_flags |= PFSTATE_FROMSYNC;
 					pf_unlink_state(st);
 				}
 			}
 		}
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 
 		break;
 	}
 	case PFSYNC_ACT_INS:
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*sp), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp);
 		    i < count; i++, sp++) {
 			/* check for invalid values */
 			if (sp->timeout >= PFTM_MAX ||
 			    sp->src.state > PF_TCPS_PROXY_DST ||
 			    sp->dst.state > PF_TCPS_PROXY_DST ||
 			    sp->direction > PF_OUT ||
 			    (sp->af != AF_INET && sp->af != AF_INET6)) {
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync_insert: PFSYNC_ACT_INS: "
 					    "invalid value\n");
 				pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 
 			if ((error = pfsync_insert_net_state(sp,
 			    chksum_flag))) {
 				/* Out of memory: give up on this packet. */
 				if (error == ENOMEM) {
 #ifdef __FreeBSD__
 					PF_UNLOCK();
 #endif
 					splx(s);
 					goto done;
 				}
 				continue;
 			}
 		}
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	case PFSYNC_ACT_UPD:
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*sp), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp);
 		    i < count; i++, sp++) {
 			int flags = PFSYNC_FLAG_STALE;
 
 			/* check for invalid values */
 			if (sp->timeout >= PFTM_MAX ||
 			    sp->src.state > PF_TCPS_PROXY_DST ||
 			    sp->dst.state > PF_TCPS_PROXY_DST) {
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync_insert: PFSYNC_ACT_UPD: "
 					    "invalid value\n");
 				pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 
 			bcopy(sp->id, &key.id, sizeof(key.id));
 			key.creatorid = sp->creatorid;
 
 			st = pf_find_state_byid(&key);
 			if (st == NULL) {
 				/* insert the update */
 				if (pfsync_insert_net_state(sp, chksum_flag))
 					pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 			/* sfail records which staleness test tripped. */
 			sfail = 0;
 			if (st->proto == IPPROTO_TCP) {
 				/*
 				 * The state should never go backwards except
 				 * for syn-proxy states.  Neither should the
 				 * sequence window slide backwards.
 				 */
 				if (st->src.state > sp->src.state &&
 				    (st->src.state < PF_TCPS_PROXY_SRC ||
 				    sp->src.state >= PF_TCPS_PROXY_SRC))
 					sfail = 1;
 				else if (SEQ_GT(st->src.seqlo,
 				    ntohl(sp->src.seqlo)))
 					sfail = 3;
 				else if (st->dst.state > sp->dst.state) {
 					/*
 					 * There might still be useful
 					 * information about the src state
 					 * here, so import that part of the
 					 * update, then "fail" so we send the
 					 * updated state back to the peer,
 					 * which is missing what we know.
 					 */
 					pf_state_peer_ntoh(&sp->src, &st->src);
 					/* XXX do anything with timeouts? */
 					sfail = 7;
 					flags = 0;
 				} else if (st->dst.state >= TCPS_SYN_SENT &&
 				    SEQ_GT(st->dst.seqlo, ntohl(sp->dst.seqlo)))
 					sfail = 4;
 			} else {
 				/*
 				 * Non-TCP protocol state machine always go
 				 * forwards
 				 */
 				if (st->src.state > sp->src.state)
 					sfail = 5;
 				else if (st->dst.state > sp->dst.state)
 					sfail = 6;
 			}
 			if (sfail) {
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync: %s stale update "
 					    "(%d) id: %016llx "
 					    "creatorid: %08x\n",
 					    (sfail < 7 ?  "ignoring"
 					     : "partial"), sfail,
 					    betoh64(st->id),
 					    ntohl(st->creatorid));
 				pfsyncstats.pfsyncs_badstate++;
 
 				if (!(sp->sync_flags & PFSTATE_STALE)) {
 					/* we have a better state, send it */
 					if (sc->sc_mbuf != NULL && !stale)
 						pfsync_sendout(sc);
 					stale++;
 					if (!st->sync_flags)
 						pfsync_pack_state(
 						    PFSYNC_ACT_UPD, st, flags);
 				}
 				continue;
 			}
 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
 			pf_state_peer_ntoh(&sp->src, &st->src);
 			pf_state_peer_ntoh(&sp->dst, &st->dst);
 			st->expire = ntohl(sp->expire) + time_second;
 			st->timeout = sp->timeout;
 		}
 		if (stale && sc->sc_mbuf != NULL)
 			pfsync_sendout(sc);
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	/*
 	 * It's not strictly necessary for us to support the "uncompressed"
 	 * delete action, but it's relatively simple and maintains consistency.
 	 */
 	case PFSYNC_ACT_DEL:
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*sp), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp);
 		    i < count; i++, sp++) {
 			bcopy(sp->id, &key.id, sizeof(key.id));
 			key.creatorid = sp->creatorid;
 
 			st = pf_find_state_byid(&key);
 			if (st == NULL) {
 				pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 			st->sync_flags |= PFSTATE_FROMSYNC;
 			pf_unlink_state(st);
 		}
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	case PFSYNC_ACT_UPD_C: {
 		int update_requested = 0;
 
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*up), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		for (i = 0, up = (struct pfsync_state_upd *)(mp->m_data + offp);
 		    i < count; i++, up++) {
 			/* check for invalid values */
 			if (up->timeout >= PFTM_MAX ||
 			    up->src.state > PF_TCPS_PROXY_DST ||
 			    up->dst.state > PF_TCPS_PROXY_DST) {
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync_insert: "
 					    "PFSYNC_ACT_UPD_C: "
 					    "invalid value\n");
 				pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 
 			bcopy(up->id, &key.id, sizeof(key.id));
 			key.creatorid = up->creatorid;
 
 			st = pf_find_state_byid(&key);
 			if (st == NULL) {
 				/* We don't have this state. Ask for it. */
 				error = pfsync_request_update(up, &src);
 				if (error == ENOMEM) {
 #ifdef __FreeBSD__
 					PF_UNLOCK();
 #endif
 					splx(s);
 					goto done;
 				}
 				update_requested = 1;
 				pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 			sfail = 0;
 			if (st->proto == IPPROTO_TCP) {
 				/*
 				 * The state should never go backwards except
 				 * for syn-proxy states.  Neither should the
 				 * sequence window slide backwards.
 				 */
 				if (st->src.state > up->src.state &&
 				    (st->src.state < PF_TCPS_PROXY_SRC ||
 				    up->src.state >= PF_TCPS_PROXY_SRC))
 					sfail = 1;
 				else if (st->dst.state > up->dst.state)
 					sfail = 2;
 				else if (SEQ_GT(st->src.seqlo,
 				    ntohl(up->src.seqlo)))
 					sfail = 3;
 				else if (st->dst.state >= TCPS_SYN_SENT &&
 				    SEQ_GT(st->dst.seqlo, ntohl(up->dst.seqlo)))
 					sfail = 4;
 			} else {
 				/*
 				 * Non-TCP protocol state machine always go
 				 * forwards
 				 */
 				if (st->src.state > up->src.state)
 					sfail = 5;
 				else if (st->dst.state > up->dst.state)
 					sfail = 6;
 			}
 			if (sfail) {
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync: ignoring stale update "
 					    "(%d) id: %016llx "
 					    "creatorid: %08x\n", sfail,
 					    betoh64(st->id),
 					    ntohl(st->creatorid));
 				pfsyncstats.pfsyncs_badstate++;
 
 				/* we have a better state, send it out */
 				if ((!stale || update_requested) &&
 				    sc->sc_mbuf != NULL) {
 					pfsync_sendout(sc);
 					update_requested = 0;
 				}
 				stale++;
 				if (!st->sync_flags)
 					pfsync_pack_state(PFSYNC_ACT_UPD, st,
 					    PFSYNC_FLAG_STALE);
 				continue;
 			}
 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
 			pf_state_peer_ntoh(&up->src, &st->src);
 			pf_state_peer_ntoh(&up->dst, &st->dst);
 			st->expire = ntohl(up->expire) + time_second;
 			st->timeout = up->timeout;
 		}
 		if ((update_requested || stale) && sc->sc_mbuf)
 			pfsync_sendout(sc);
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	}
 	case PFSYNC_ACT_DEL_C:
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*dp), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		for (i = 0, dp = (struct pfsync_state_del *)(mp->m_data + offp);
 		    i < count; i++, dp++) {
 			bcopy(dp->id, &key.id, sizeof(key.id));
 			key.creatorid = dp->creatorid;
 
 			st = pf_find_state_byid(&key);
 			if (st == NULL) {
 				pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 			st->sync_flags |= PFSTATE_FROMSYNC;
 			pf_unlink_state(st);
 		}
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	case PFSYNC_ACT_INS_F:
 	case PFSYNC_ACT_DEL_F:
 		/* not implemented */
 		break;
 	case PFSYNC_ACT_UREQ:
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*rup), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		if (sc->sc_mbuf != NULL)
 			pfsync_sendout(sc);
 		for (i = 0,
 		    rup = (struct pfsync_state_upd_req *)(mp->m_data + offp);
 		    i < count; i++, rup++) {
 			bcopy(rup->id, &key.id, sizeof(key.id));
 			key.creatorid = rup->creatorid;
 
 			/* An all-zero key asks for a bulk update. */
 			if (key.id == 0 && key.creatorid == 0) {
 				sc->sc_ureq_received = time_uptime;
 				if (sc->sc_bulk_send_next == NULL)
 					sc->sc_bulk_send_next =
 					    TAILQ_FIRST(&state_list);
 				sc->sc_bulk_terminator = sc->sc_bulk_send_next;
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync: received "
 					    "bulk update request\n");
 				pfsync_send_bus(sc, PFSYNC_BUS_START);
 #ifdef __FreeBSD__
 				callout_reset(&sc->sc_bulk_tmo, 1 * hz,
 				    pfsync_bulk_update, pfsyncif);
 #else
 				timeout_add(&sc->sc_bulk_tmo, 1 * hz);
 #endif
 			} else {
 				st = pf_find_state_byid(&key);
 				if (st == NULL) {
 					pfsyncstats.pfsyncs_badstate++;
 					continue;
 				}
 				if (!st->sync_flags)
 					pfsync_pack_state(PFSYNC_ACT_UPD,
 					    st, 0);
 			}
 		}
 		if (sc->sc_mbuf != NULL)
 			pfsync_sendout(sc);
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	case PFSYNC_ACT_BUS:
 		/* If we're not waiting for a bulk update, who cares. */
 		if (sc->sc_ureq_sent == 0)
 			break;
 
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    sizeof(*bus), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 		bus = (struct pfsync_state_bus *)(mp->m_data + offp);
 		switch (bus->status) {
 		case PFSYNC_BUS_START:
 #ifdef __FreeBSD__
 			callout_reset(&sc->sc_bulkfail_tmo,
 			    pf_pool_limits[PF_LIMIT_STATES].limit /
 			    (PFSYNC_BULKPACKETS * sc->sc_maxcount),
 			    pfsync_bulkfail, pfsyncif);
 #else
 			timeout_add(&sc->sc_bulkfail_tmo,
 			    pf_pool_limits[PF_LIMIT_STATES].limit /
 			    (PFSYNC_BULKPACKETS * sc->sc_maxcount));
 #endif
 			if (pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received bulk "
 				    "update start\n");
 			break;
 		case PFSYNC_BUS_END:
 			if (time_uptime - ntohl(bus->endtime) >=
 			    sc->sc_ureq_sent) {
 				/* that's it, we're happy */
 				sc->sc_ureq_sent = 0;
 				sc->sc_bulk_tries = 0;
 				timeout_del(&sc->sc_bulkfail_tmo);
 #if NCARP > 0
 				if (!pfsync_sync_ok)
 #ifdef __FreeBSD__
 #ifdef CARP_ADVANCED
 					carp_group_demote_adj(sc->sc_ifp, -1);
 #endif
 #else
 					carp_group_demote_adj(&sc->sc_if, -1);
 #endif
 #endif
 				pfsync_sync_ok = 1;
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync: received valid "
 					    "bulk update end\n");
 			} else {
 				if (pf_status.debug >= PF_DEBUG_MISC)
 					printf("pfsync: received invalid "
 					    "bulk update end: bad timestamp\n");
 			}
 			break;
 		}
 		break;
 #ifdef PFSYNC_TDB
 	case PFSYNC_ACT_TDB_UPD:
 		if ((mp = m_pulldown(m, iplen + sizeof(*ph),
 		    count * sizeof(*pt), &offp)) == NULL) {
 			pfsyncstats.pfsyncs_badlen++;
 			return;
 		}
 		s = splsoftnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		for (i = 0, pt = (struct pfsync_tdb *)(mp->m_data + offp);
 		    i < count; i++, pt++)
 			pfsync_update_net_tdb(pt);
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 #endif
 	}
 
 done:
 	/* m is NULL only when m_pullup() failed above. */
 	if (m)
 		m_freem(m);
 }
 
 /*
  * if_output handler for the pfsync interface: the packet handed in is
  * freed and 0 is returned.  ifp, dst and rt are not examined.
  */
 int
 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
 	struct rtentry *rt)
 {
 	m_freem(m);
 
 	return (0);
 }
 
 /* ARGSUSED */
 int
 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 #ifndef __FreeBSD__
 	struct proc *p = curproc;
 #endif
 	struct pfsync_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct ip_moptions *imo = &sc->sc_imo;
 	struct pfsyncreq pfsyncr;
 	struct ifnet    *sifp;
 	int s, error;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 	case SIOCAIFADDR:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFFLAGS:
 #ifdef __FreeBSD__
 		if (ifp->if_flags & IFF_UP)
 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		else
 			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 #else
 		if (ifp->if_flags & IFF_UP)
 			ifp->if_flags |= IFF_RUNNING;
 		else
 			ifp->if_flags &= ~IFF_RUNNING;
 #endif
 		break;
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < PFSYNC_MINMTU)
 			return (EINVAL);
 		if (ifr->ifr_mtu > MCLBYTES)
 			ifr->ifr_mtu = MCLBYTES;
 		s = splnet();
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		if (ifr->ifr_mtu < ifp->if_mtu)
 			pfsync_sendout(sc);
 		pfsync_setmtu(sc, ifr->ifr_mtu);
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 		break;
 	case SIOCGETPFSYNC:
 		bzero(&pfsyncr, sizeof(pfsyncr));
 		if (sc->sc_sync_ifp)
 			strlcpy(pfsyncr.pfsyncr_syncdev,
 			    sc->sc_sync_ifp->if_xname, IFNAMSIZ);
 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
 		if ((error = copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))))
 			return (error);
 		break;
 	case SIOCSETPFSYNC:
 #ifdef __FreeBSD__
 		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
 #else
 		if ((error = suser(p, p->p_acflag)) != 0)
 #endif
 			return (error);
 		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
 			return (error);
 
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
 #ifdef __FreeBSD__
 			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
 #else
 			sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
 #endif
 		else
 			sc->sc_sync_peer.s_addr =
 			    pfsyncr.pfsyncr_syncpeer.s_addr;
 
 		if (pfsyncr.pfsyncr_maxupdates > 255)
 #ifdef __FreeBSD__
 		{
 			PF_UNLOCK();
 #endif
 			return (EINVAL);
 #ifdef __FreeBSD__
 		}
 #endif
 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
 
 		if (pfsyncr.pfsyncr_syncdev[0] == 0) {
 			sc->sc_sync_ifp = NULL;
 			if (sc->sc_mbuf_net != NULL) {
 				/* Don't keep stale pfsync packets around. */
 				s = splnet();
 				m_freem(sc->sc_mbuf_net);
 				sc->sc_mbuf_net = NULL;
 				sc->sc_statep_net.s = NULL;
 				splx(s);
 			}
 #ifdef __FreeBSD__
 			PF_UNLOCK();
 #endif
 			if (imo->imo_num_memberships > 0) {
 				in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
 				imo->imo_multicast_ifp = NULL;
 			}
 			break;
 		}
 
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		if ((sifp = ifunit(pfsyncr.pfsyncr_syncdev)) == NULL)
 			return (EINVAL);
 #ifdef __FreeBSD__
 		PF_LOCK();
 #endif
 
 		s = splnet();
 #ifdef __FreeBSD__
 		if (sifp->if_mtu < sc->sc_ifp->if_mtu ||
 #else
 		if (sifp->if_mtu < sc->sc_if.if_mtu ||
 #endif
 		    (sc->sc_sync_ifp != NULL &&
 		    sifp->if_mtu < sc->sc_sync_ifp->if_mtu) ||
 		    sifp->if_mtu < MCLBYTES - sizeof(struct ip))
 			pfsync_sendout(sc);
 		sc->sc_sync_ifp = sifp;
 
 #ifdef __FreeBSD__
 		pfsync_setmtu(sc, sc->sc_ifp->if_mtu);
 #else
 		pfsync_setmtu(sc, sc->sc_if.if_mtu);
 #endif
 
 		if (imo->imo_num_memberships > 0) {
 #ifdef __FreeBSD__
 			PF_UNLOCK();
 #endif
 			in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
 #ifdef __FreeBSD__
 			PF_LOCK();
 #endif
 			imo->imo_multicast_ifp = NULL;
 		}
 
 		if (sc->sc_sync_ifp &&
 #ifdef __FreeBSD__
 		    sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
 #else
 		    sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
 #endif
 			struct in_addr addr;
 
 			if (!(sc->sc_sync_ifp->if_flags & IFF_MULTICAST)) {
 				sc->sc_sync_ifp = NULL;
 #ifdef __FreeBSD__
 				PF_UNLOCK();
 #endif
 				splx(s);
 				return (EADDRNOTAVAIL);
 			}
 
 #ifdef __FreeBSD__
 			addr.s_addr = htonl(INADDR_PFSYNC_GROUP);
 #else
 			addr.s_addr = INADDR_PFSYNC_GROUP;
 #endif
 
 #ifdef __FreeBSD__
 			PF_UNLOCK();
 #endif
 			if ((imo->imo_membership[0] =
 			    in_addmulti(&addr, sc->sc_sync_ifp)) == NULL) {
 				sc->sc_sync_ifp = NULL;
 				splx(s);
 				return (ENOBUFS);
 			}
 #ifdef __FreeBSD__
 			PF_LOCK();
 #endif
 			imo->imo_num_memberships++;
 			imo->imo_multicast_ifp = sc->sc_sync_ifp;
 			imo->imo_multicast_ttl = PFSYNC_DFLTTL;
 			imo->imo_multicast_loop = 0;
 		}
 
 		if (sc->sc_sync_ifp ||
 #ifdef __FreeBSD__
 		    sc->sc_sendaddr.s_addr != htonl(INADDR_PFSYNC_GROUP)) {
 #else
 		    sc->sc_sendaddr.s_addr != INADDR_PFSYNC_GROUP) {
 #endif
 			/* Request a full state table update. */
 			sc->sc_ureq_sent = time_uptime;
 #if NCARP > 0
 			if (pfsync_sync_ok)
 #ifdef __FreeBSD__
 #ifdef CARP_ADVANCED
 				carp_group_demote_adj(sc->sc_ifp, 1);
 #endif
 #else
 				carp_group_demote_adj(&sc->sc_if, 1);
 #endif
 #endif
 			pfsync_sync_ok = 0;
 			if (pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: requesting bulk update\n");
 #ifdef __FreeBSD__
 			callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
 			    pfsync_bulkfail, pfsyncif);
 #else
 			timeout_add(&sc->sc_bulkfail_tmo, 5 * hz);
 #endif
 			error = pfsync_request_update(NULL, NULL);
 			if (error == ENOMEM) {
 #ifdef __FreeBSD__
 				PF_UNLOCK();
 #endif
 				splx(s);
 				return (ENOMEM);
 			}
 			pfsync_sendout(sc);
 		}
 #ifdef __FreeBSD__
 		PF_UNLOCK();
 #endif
 		splx(s);
 
 		break;
 
 	default:
 		return (ENOTTY);
 	}
 
 	return (0);
 }
 
 void
 pfsync_setmtu(struct pfsync_softc *sc, int mtu_req)
 {
 	int mtu;
 
 	if (sc->sc_sync_ifp && sc->sc_sync_ifp->if_mtu < mtu_req)
 		mtu = sc->sc_sync_ifp->if_mtu;
 	else
 		mtu = mtu_req;
 
 	sc->sc_maxcount = (mtu - sizeof(struct pfsync_header)) /
 	    sizeof(struct pfsync_state);
 	if (sc->sc_maxcount > 254)
 	    sc->sc_maxcount = 254;
 #ifdef __FreeBSD__
 	sc->sc_ifp->if_mtu = sizeof(struct pfsync_header) +
 #else
 	sc->sc_if.if_mtu = sizeof(struct pfsync_header) +
 #endif
 	    sc->sc_maxcount * sizeof(struct pfsync_state);
 }
 
 /*
  * Allocate and set up an mbuf for a new pfsync packet carrying `action'.
  *
  * The buffer is sized for the largest payload the action can carry
  * (sc_maxcount records for the batched actions, one record for CLR and
  * BUS).  The pfsync header is filled in with a record count of zero and
  * *sp is pointed at the first record slot behind the header.  m_len and
  * m_pkthdr.len cover the header only; callers grow them as they append
  * records.  Finally the flush timer matching the action is (re)armed to
  * fire after hz ticks.
  *
  * Returns the new mbuf, or NULL when allocation fails (the interface's
  * if_oerrors counter is incremented in that case).
  */
 struct mbuf *
 pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp)
 {
 	struct pfsync_header *h;
 	struct mbuf *m;
 	int len;
 
 	/* Non-blocking allocation of a packet-header mbuf. */
 	MGETHDR(m, M_DONTWAIT, MT_DATA);
 	if (m == NULL) {
 #ifdef __FreeBSD__
 		sc->sc_ifp->if_oerrors++;
 #else
 		sc->sc_if.if_oerrors++;
 #endif
 		return (NULL);
 	}
 
 	/* Work out the largest packet this action can produce. */
 	switch (action) {
 	case PFSYNC_ACT_CLR:
 		len = sizeof(struct pfsync_header) +
 		    sizeof(struct pfsync_state_clr);
 		break;
 	case PFSYNC_ACT_UPD_C:
 		len = (sc->sc_maxcount * sizeof(struct pfsync_state_upd)) +
 		    sizeof(struct pfsync_header);
 		break;
 	case PFSYNC_ACT_DEL_C:
 		len = (sc->sc_maxcount * sizeof(struct pfsync_state_del)) +
 		    sizeof(struct pfsync_header);
 		break;
 	case PFSYNC_ACT_UREQ:
 		len = (sc->sc_maxcount * sizeof(struct pfsync_state_upd_req)) +
 		    sizeof(struct pfsync_header);
 		break;
 	case PFSYNC_ACT_BUS:
 		len = sizeof(struct pfsync_header) +
 		    sizeof(struct pfsync_state_bus);
 		break;
 #ifdef PFSYNC_TDB
 	case PFSYNC_ACT_TDB_UPD:
 		len = (sc->sc_maxcount * sizeof(struct pfsync_tdb)) +
 		    sizeof(struct pfsync_header);
 		break;
 #endif
 	default:
 		/* Every other action carries full state records. */
 		len = (sc->sc_maxcount * sizeof(struct pfsync_state)) +
 		    sizeof(struct pfsync_header);
 		break;
 	}
 
 	if (len > MHLEN) {
 		/* Too big for the mbuf itself: attach a cluster. */
 		MCLGET(m, M_DONTWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_free(m);
 #ifdef __FreeBSD__
 			sc->sc_ifp->if_oerrors++;
 #else
 			sc->sc_if.if_oerrors++;
 #endif
 			return (NULL);
 		}
 		/*
 		 * Place the data towards the end of the cluster, rounded
 		 * down to a long boundary, leaving the slack in front.
 		 * Presumably that leading space serves the M_PREPEND of
 		 * the IP header in pfsync_sendout_mbuf().
 		 */
 		m->m_data += (MCLBYTES - len) &~ (sizeof(long) - 1);
 	} else
 		MH_ALIGN(m, len);
 
 	/* Only the header is valid so far; records are appended by callers. */
 	m->m_pkthdr.rcvif = NULL;
 	m->m_pkthdr.len = m->m_len = sizeof(struct pfsync_header);
 	h = mtod(m, struct pfsync_header *);
 	h->version = PFSYNC_VERSION;
 	h->af = 0;
 	h->count = 0;
 	h->action = action;
 	/*
 	 * NOTE(review): with PFSYNC_TDB undefined the checksum copy is
 	 * skipped for PFSYNC_ACT_TDB_UPD; with PFSYNC_TDB defined the guard
 	 * disappears and the copy is unconditional.  The polarity of this
 	 * #ifndef looks inverted - confirm the intent.
 	 */
 #ifndef PFSYNC_TDB
 	if (action != PFSYNC_ACT_TDB_UPD)
 #endif
 		bcopy(&pf_status.pf_chksum, &h->pf_chksum,
 		    PF_MD5_DIGEST_LENGTH);
 
 	/* Hand back the position of the first record slot. */
 	*sp = (void *)((char *)h + PFSYNC_HDRLEN);
 	/*
 	 * Arm the flush timer for this kind of packet: the TDB timer for
 	 * TDB updates (when compiled in), the regular one otherwise.
 	 */
 #ifdef PFSYNC_TDB
 	if (action == PFSYNC_ACT_TDB_UPD)
 #ifdef __FreeBSD__
 		callout_reset(&sc->sc_tdb_tmo, hz, pfsync_tdb_timeout,
 		    pfsyncif);
 #else
 		timeout_add(&sc->sc_tdb_tmo, hz);
 #endif
 	else
 #endif
 #ifdef __FreeBSD__
 		callout_reset(&sc->sc_tmo, hz, pfsync_timeout, pfsyncif);
 #else
 		timeout_add(&sc->sc_tmo, hz);
 #endif
 	return (m);
 }
 
 /*
  * Add state `st' to the pending pfsync packet for `action'.
  *
  * A full state record is appended to sc_mbuf (or, for PFSYNC_ACT_UPD with
  * sc_maxupdates set, an existing record for the same state is refreshed in
  * place).  When a sync interface is configured and `flags' contains
  * PFSYNC_FLAG_COMPRESS, a compressed UPD_C/DEL_C record is built in
  * parallel in sc_mbuf_net.  The packet is flushed once it holds
  * sc_maxcount records or the record has been updated sc_maxupdates times.
  *
  * Returns 0 when there is no pfsync softc or nobody is listening, EINVAL
  * for an out-of-range action, ENOMEM when no mbuf could be allocated, and
  * otherwise 0 or the result of pfsync_sendout().
  */
 int
 pfsync_pack_state(u_int8_t action, struct pf_state *st, int flags)
 {
 	struct ifnet *ifp = NULL;
 	struct pfsync_softc *sc = pfsyncif;
 	struct pfsync_header *h, *h_net;
 	struct pfsync_state *sp = NULL;
 	struct pfsync_state_upd *up = NULL;
 	struct pfsync_state_del *dp = NULL;
 	struct pf_rule *r;
 	u_long secs;
 	int s, ret = 0;
 	/* i == 255 means "no existing record was matched". */
 	u_int8_t i = 255, newaction = 0;
 
 	if (sc == NULL)
 		return (0);
 #ifdef __FreeBSD__
 	ifp = sc->sc_ifp;
 #else
 	ifp = &sc->sc_if;
 #endif
 
 	/*
 	 * If a packet falls in the forest and there's nobody around to
 	 * hear, does it make a sound?
 	 */
 	/* No bpf listener, no sync device and no explicit peer: do nothing. */
 	if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL &&
 #ifdef __FreeBSD__
 	    sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
 #else
 	    sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
 #endif
 		/* Don't leave any stale pfsync packets hanging around. */
 		if (sc->sc_mbuf != NULL) {
 			m_freem(sc->sc_mbuf);
 			sc->sc_mbuf = NULL;
 			sc->sc_statep.s = NULL;
 		}
 		return (0);
 	}
 
 	if (action >= PFSYNC_ACT_MAX)
 		return (EINVAL);
 
 	s = splnet();
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	if (sc->sc_mbuf == NULL) {
 		/* No packet pending: start a new one for this action. */
 		if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action,
 		    (void *)&sc->sc_statep.s)) == NULL) {
 			splx(s);
 			return (ENOMEM);
 		}
 		h = mtod(sc->sc_mbuf, struct pfsync_header *);
 	} else {
 		h = mtod(sc->sc_mbuf, struct pfsync_header *);
 		if (h->action != action) {
 			/*
 			 * The pending packet carries a different action:
 			 * flush it and start over.
 			 */
 			pfsync_sendout(sc);
 			if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action,
 			    (void *)&sc->sc_statep.s)) == NULL) {
 				splx(s);
 				return (ENOMEM);
 			}
 			h = mtod(sc->sc_mbuf, struct pfsync_header *);
 		} else {
 			/*
 			 * If it's an update, look in the packet to see if
 			 * we already have an update for the state.
 			 */
 			if (action == PFSYNC_ACT_UPD && sc->sc_maxupdates) {
 				struct pfsync_state *usp =
 				    (void *)((char *)h + PFSYNC_HDRLEN);
 
 				/* On a match, i is left as the record index. */
 				for (i = 0; i < h->count; i++) {
 					if (!memcmp(usp->id, &st->id,
 					    PFSYNC_ID_LEN) &&
 					    usp->creatorid == st->creatorid) {
 						sp = usp;
 						sp->updates++;
 						break;
 					}
 					usp++;
 				}
 			}
 		}
 	}
 
 	secs = time_second;
 
 	/* Remember when this state was last handed to pfsync. */
 	st->pfsync_time = time_uptime;
 
 	if (sp == NULL) {
 		/* not a "duplicate" update */
 		i = 255;
 		/* Claim the next record slot and grow the packet by one. */
 		sp = sc->sc_statep.s++;
 		sc->sc_mbuf->m_pkthdr.len =
 		    sc->sc_mbuf->m_len += sizeof(struct pfsync_state);
 		h->count++;
 		bzero(sp, sizeof(*sp));
 
 		bcopy(&st->id, sp->id, sizeof(sp->id));
 		sp->creatorid = st->creatorid;
 
 		strlcpy(sp->ifname, st->u.s.kif->pfik_name, sizeof(sp->ifname));
 		pf_state_host_hton(&st->lan, &sp->lan);
 		pf_state_host_hton(&st->gwy, &sp->gwy);
 		pf_state_host_hton(&st->ext, &sp->ext);
 
 		bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
 
 		/* Creation is sent as the age of the state in seconds. */
 		sp->creation = htonl(secs - st->creation);
 		pf_state_counter_hton(st->packets[0], sp->packets[0]);
 		pf_state_counter_hton(st->packets[1], sp->packets[1]);
 		pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
 		pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
 		/* Rule and anchor are sent by number; -1 when absent. */
 		if ((r = st->rule.ptr) == NULL)
 			sp->rule = htonl(-1);
 		else
 			sp->rule = htonl(r->nr);
 		if ((r = st->anchor.ptr) == NULL)
 			sp->anchor = htonl(-1);
 		else
 			sp->anchor = htonl(r->nr);
 		sp->af = st->af;
 		sp->proto = st->proto;
 		sp->direction = st->direction;
 		sp->log = st->log;
 		sp->allow_opts = st->allow_opts;
 		sp->timeout = st->timeout;
 
 		if (flags & PFSYNC_FLAG_STALE)
 			sp->sync_flags |= PFSTATE_STALE;
 	}
 
 	/* These fields are refreshed for new and duplicate records alike. */
 	pf_state_peer_hton(&st->src, &sp->src);
 	pf_state_peer_hton(&st->dst, &sp->dst);
 
 	/* Expiry is sent as seconds remaining, 0 if already due. */
 	if (st->expire <= secs)
 		sp->expire = htonl(0);
 	else
 		sp->expire = htonl(st->expire - secs);
 
 	/* do we need to build "compressed" actions for network transfer? */
 	if (sc->sc_sync_ifp && flags & PFSYNC_FLAG_COMPRESS) {
 		switch (action) {
 		case PFSYNC_ACT_UPD:
 			newaction = PFSYNC_ACT_UPD_C;
 			break;
 		case PFSYNC_ACT_DEL:
 			newaction = PFSYNC_ACT_DEL_C;
 			break;
 		default:
 			/* by default we just send the uncompressed states */
 			break;
 		}
 	}
 
 	if (newaction) {
 		if (sc->sc_mbuf_net == NULL) {
 			if ((sc->sc_mbuf_net = pfsync_get_mbuf(sc, newaction,
 			    (void *)&sc->sc_statep_net.s)) == NULL) {
 				splx(s);
 				return (ENOMEM);
 			}
 		}
 		h_net = mtod(sc->sc_mbuf_net, struct pfsync_header *);
 
 		switch (newaction) {
 		case PFSYNC_ACT_UPD_C:
 			if (i != 255) {
 				/*
 				 * Duplicate: reuse the compressed record at
 				 * the same index as the full record.
 				 */
 				up = (void *)((char *)h_net +
 				    PFSYNC_HDRLEN + (i * sizeof(*up)));
 				up->updates++;
 			} else {
 				h_net->count++;
 				sc->sc_mbuf_net->m_pkthdr.len =
 				    sc->sc_mbuf_net->m_len += sizeof(*up);
 				up = sc->sc_statep_net.u++;
 
 				bzero(up, sizeof(*up));
 				bcopy(&st->id, up->id, sizeof(up->id));
 				up->creatorid = st->creatorid;
 			}
 			/* Reuse the already converted values from sp. */
 			up->timeout = st->timeout;
 			up->expire = sp->expire;
 			up->src = sp->src;
 			up->dst = sp->dst;
 			break;
 		case PFSYNC_ACT_DEL_C:
 			sc->sc_mbuf_net->m_pkthdr.len =
 			    sc->sc_mbuf_net->m_len += sizeof(*dp);
 			dp = sc->sc_statep_net.d++;
 			h_net->count++;
 
 			bzero(dp, sizeof(*dp));
 			bcopy(&st->id, dp->id, sizeof(dp->id));
 			dp->creatorid = st->creatorid;
 			break;
 		}
 	}
 
 	/* Flush when the packet is full or the record was updated enough. */
 	if (h->count == sc->sc_maxcount ||
 	    (sc->sc_maxupdates && (sp->updates >= sc->sc_maxupdates)))
 		ret = pfsync_sendout(sc);
 
 	splx(s);
 	return (ret);
 }
 
 /*
  * Append an update request to the pending pfsync packet.
  *
  * up  - state being asked for; its id and creatorid are copied into the
  *       request.  With NULL the request record is left all zero, which
  *       the callers in this file use when requesting a bulk update.
  * src - when non-NULL, stored in sc_sendaddr as the destination of the
  *       next packet that is sent out.
  *
  * A pending packet carrying a different action is flushed first.  Returns
  * 0 when there is no pfsync softc, ENOMEM when no mbuf could be allocated,
  * and otherwise 0 or the result of pfsync_sendout() if the packet filled up.
  *
  * This must be called in splnet()
  */
 int
 pfsync_request_update(struct pfsync_state_upd *up, struct in_addr *src)
 {
 	struct pfsync_header *h;
 	struct pfsync_softc *sc = pfsyncif;
 	struct pfsync_state_upd_req *rup;
 	int ret = 0;
 
 	if (sc == NULL)
 		return (0);
 
 	if (sc->sc_mbuf == NULL) {
 		/* Nothing pending: start a fresh UREQ packet. */
 		if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ,
 		    (void *)&sc->sc_statep.s)) == NULL)
 			return (ENOMEM);
 		h = mtod(sc->sc_mbuf, struct pfsync_header *);
 	} else {
 		h = mtod(sc->sc_mbuf, struct pfsync_header *);
 		if (h->action != PFSYNC_ACT_UREQ) {
 			/* Different action pending: flush and start over. */
 			pfsync_sendout(sc);
 			if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ,
 			    (void *)&sc->sc_statep.s)) == NULL)
 				return (ENOMEM);
 			h = mtod(sc->sc_mbuf, struct pfsync_header *);
 		}
 	}
 
 	if (src != NULL)
 		sc->sc_sendaddr = *src;
 	/* Grow the packet by one request record and claim its slot. */
 	sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*rup);
 	h->count++;
 	rup = sc->sc_statep.r++;
 	bzero(rup, sizeof(*rup));
 	if (up != NULL) {
 		bcopy(up->id, rup->id, sizeof(rup->id));
 		rup->creatorid = up->creatorid;
 	}
 
 	if (h->count == sc->sc_maxcount)
 		ret = pfsync_sendout(sc);
 
 	return (ret);
 }
 
 /*
  * Send a "clear states" message for the given creator id.
  *
  * creatorid - copied verbatim into the message.
  * ifname    - interface name to put into the message; with NULL the name
  *             field is left empty (all zero).
  *
  * Any pending packet is flushed first so the clear message goes out in a
  * packet of its own, which is then sent immediately.  Returns 0 when there
  * is no pfsync softc, ENOMEM when no mbuf could be allocated, and otherwise
  * the result of pfsync_sendout().
  */
 int
 pfsync_clear_states(u_int32_t creatorid, char *ifname)
 {
 	struct pfsync_softc *sc = pfsyncif;
 	struct pfsync_state_clr *cp;
 	int s, ret;
 
 	if (sc == NULL)
 		return (0);
 
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	s = splnet();
 	if (sc->sc_mbuf != NULL)
 		pfsync_sendout(sc);
 	if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_CLR,
 	    (void *)&sc->sc_statep.c)) == NULL) {
 		splx(s);
 		return (ENOMEM);
 	}
 	sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*cp);
 	cp = sc->sc_statep.c;
 	/*
 	 * The mbuf data is not cleared by pfsync_get_mbuf(); zero the record
 	 * so that a NULL ifname yields an empty name rather than whatever
 	 * bytes happened to be in the buffer.
 	 */
 	bzero(cp, sizeof(*cp));
 	cp->creatorid = creatorid;
 	if (ifname != NULL)
 		strlcpy(cp->ifname, ifname, IFNAMSIZ);
 
 	ret = (pfsync_sendout(sc));
 	splx(s);
 	return (ret);
 }
 
 /*
  * Timer handler for the regular pfsync packet: flush whatever is pending.
  * `v' is the pfsync softc.
  */
 void
 pfsync_timeout(void *v)
 {
 	struct pfsync_softc *softc;
 	int spl;
 
 	softc = v;
 
 	spl = splnet();
 #ifdef __FreeBSD__
 	PF_LOCK();
 #endif
 	/* The send result is not of interest here. */
 	(void)pfsync_sendout(softc);
 #ifdef __FreeBSD__
 	PF_UNLOCK();
 #endif
 	splx(spl);
 }
 
 #ifdef PFSYNC_TDB
 /*
  * Timer handler for the TDB update packet: flush whatever is pending.
  * `v' is the pfsync softc.
  */
 void
 pfsync_tdb_timeout(void *v)
 {
 	struct pfsync_softc *softc;
 	int spl;
 
 	softc = v;
 
 	spl = splnet();
 #ifdef __FreeBSD__
 	PF_LOCK();
 #endif
 	/* The send result is not of interest here. */
 	(void)pfsync_tdb_sendout(softc);
 #ifdef __FreeBSD__
 	PF_UNLOCK();
 #endif
 	splx(spl);
 }
 #endif
 
 /*
  * Send a bulk update status (BUS) message with the given status.
  *
  * Any pending packet is flushed first.  The message is only built and sent
  * while pfsync_sync_ok is set; it carries our host id, the status and the
  * time elapsed since the update request was received.  Allocation failure
  * is silently ignored.
  *
  * This must be called in splnet()
  */
 void
 pfsync_send_bus(struct pfsync_softc *sc, u_int8_t status)
 {
 	struct pfsync_state_bus *bus;
 
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	if (sc->sc_mbuf != NULL)
 		pfsync_sendout(sc);
 
 	if (pfsync_sync_ok &&
 	    (sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_BUS,
 	    (void *)&sc->sc_statep.b)) != NULL) {
 		sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*bus);
 		bus = sc->sc_statep.b;
 		/*
 		 * The mbuf data is not cleared by pfsync_get_mbuf(); zero the
 		 * record so that nothing besides the fields set below is
 		 * sent with stale buffer contents.
 		 */
 		bzero(bus, sizeof(*bus));
 		bus->creatorid = pf_status.hostid;
 		bus->status = status;
 		bus->endtime = htonl(time_uptime - sc->sc_ureq_received);
 		pfsync_sendout(sc);
 	}
 }
 
 /*
  * Timer handler that sends one slice of a bulk update.  `v' is the pfsync
  * softc.
  *
  * Starting at sc_bulk_send_next, walk the state list (wrapping from the
  * tail to the head) and queue an update for every state that is syncable
  * and has not been sent since the request was received.  The walk stops
  * after sc_maxcount * PFSYNC_BULKPACKETS states were queued or when the
  * terminator state is reached.  On completion a BUS_END message is sent
  * and the bulk bookkeeping is reset; otherwise the position is saved and
  * the handler is rescheduled for the next tick.
  */
 void
 pfsync_bulk_update(void *v)
 {
 	struct pfsync_softc *sc = v;
 	int s, i = 0;
 	struct pf_state *state;
 
 	s = splnet();
 #ifdef __FreeBSD__
 	PF_LOCK();
 #endif
 	/* Start from an empty packet. */
 	if (sc->sc_mbuf != NULL)
 		pfsync_sendout(sc);
 
 	/*
 	 * Grab at most PFSYNC_BULKPACKETS worth of states which have not
 	 * been sent since the latest request was made.
 	 */
 	state = sc->sc_bulk_send_next;
 	if (state)
 		do {
 			/* send state update if syncable and not already sent */
 			if (!state->sync_flags
 			    && state->timeout < PFTM_MAX
 			    && state->pfsync_time <= sc->sc_ureq_received) {
 				pfsync_pack_state(PFSYNC_ACT_UPD, state, 0);
 				i++;
 			}
 
 			/* figure next state to send */
 			state = TAILQ_NEXT(state, u.s.entry_list);
 
 			/* wrap to start of list if we hit the end */
 			if (!state)
 				state = TAILQ_FIRST(&state_list);
 		} while (i < sc->sc_maxcount * PFSYNC_BULKPACKETS &&
 		    state != sc->sc_bulk_terminator);
 
 	if (!state || state == sc->sc_bulk_terminator) {
 		/* we're done */
 		pfsync_send_bus(sc, PFSYNC_BUS_END);
 		sc->sc_ureq_received = 0;
 		sc->sc_bulk_send_next = NULL;
 		sc->sc_bulk_terminator = NULL;
 		timeout_del(&sc->sc_bulk_tmo);
 		if (pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: bulk update complete\n");
 	} else {
 		/* look again for more in a bit */
 #ifdef __FreeBSD__
 		callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update,
 		    pfsyncif);
 #else
 		timeout_add(&sc->sc_bulk_tmo, 1);
 #endif
 		/* Resume from here on the next run. */
 		sc->sc_bulk_send_next = state;
 	}
 	/* Push out whatever was queued during this run. */
 	if (sc->sc_mbuf != NULL)
 		pfsync_sendout(sc);
 	splx(s);
 #ifdef __FreeBSD__
 	PF_UNLOCK();
 #endif
 }
 
 /*
  * Timer handler that fires when a requested bulk update has not completed
  * in time.  `v' is the pfsync softc.
  *
  * While fewer than PFSYNC_MAX_BULKTRIES attempts were made, the timer is
  * rearmed for another 5 seconds and the bulk update request is sent again.
  * After that the transfer is treated as if it had succeeded: the request
  * bookkeeping is reset, the carp demotion applied when the request was
  * made is undone (if carp support is compiled in) and pfsync_sync_ok is
  * set.
  */
 void
 pfsync_bulkfail(void *v)
 {
 	struct pfsync_softc *sc = v;
 	int s, error;
 
 #ifdef __FreeBSD__
 	PF_LOCK();
 #endif
 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
 		/* Try again in a bit */
 #ifdef __FreeBSD__
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulkfail,
 		    pfsyncif);
 #else
 		timeout_add(&sc->sc_bulkfail_tmo, 5 * hz);
 #endif
 		s = splnet();
 		error = pfsync_request_update(NULL, NULL);
 		if (error == ENOMEM) {
 			if (pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: cannot allocate mbufs for "
 				    "bulk update\n");
 		} else
 			pfsync_sendout(sc);
 		splx(s);
 	} else {
 		/* Pretend like the transfer was ok */
 		sc->sc_ureq_sent = 0;
 		sc->sc_bulk_tries = 0;
 		/*
 		 * Note: on FreeBSD without CARP_ADVANCED the `if' below has
 		 * no carp call and therefore guards the assignment to
 		 * pfsync_sync_ok instead; the end result is the same.
 		 */
 #if NCARP > 0
 		if (!pfsync_sync_ok)
 #ifdef __FreeBSD__
 #ifdef CARP_ADVANCED
 			carp_group_demote_adj(sc->sc_ifp, -1);
 #endif
 #else
 			carp_group_demote_adj(&sc->sc_if, -1);
 #endif
 #endif
 		pfsync_sync_ok = 1;
 		if (pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: failed to receive "
 			    "bulk update status\n");
 		timeout_del(&sc->sc_bulkfail_tmo);
 	}
 #ifdef __FreeBSD__
 	PF_UNLOCK();
 #endif
 }
 
 /*
  * Flush the pending pfsync packet.
  *
  * The flush timer is cancelled and the pending mbuf is detached from the
  * softc.  bpf listeners (when compiled in) are shown the full packet.  If
  * a compressed packet was built alongside it (sc_mbuf_net), the full
  * packet is freed and the compressed one is transmitted in its place.
  *
  * Returns 0 when nothing was pending, otherwise the result of
  * pfsync_sendout_mbuf().
  *
  * This must be called in splnet()
  */
 int
 pfsync_sendout(struct pfsync_softc *sc)
 {
 #if NBPFILTER > 0
 #ifdef __FreeBSD__
 	struct ifnet *ifp = sc->sc_ifp;
 #else
 	struct ifnet *ifp = &sc->sc_if;
 #endif
 #endif
 	struct mbuf *m;
 
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	timeout_del(&sc->sc_tmo);
 
 	if (sc->sc_mbuf == NULL)
 		return (0);
 	/* Take ownership of the pending packet. */
 	m = sc->sc_mbuf;
 	sc->sc_mbuf = NULL;
 	sc->sc_statep.s = NULL;
 
 #if NBPFILTER > 0
 	if (ifp->if_bpf)
 #ifdef __FreeBSD__
 		BPF_MTAP(ifp, m);
 #else
 		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
 #endif
 #endif
 
 	/* Prefer the compressed packet for the wire when there is one. */
 	if (sc->sc_mbuf_net) {
 		m_freem(m);
 		m = sc->sc_mbuf_net;
 		sc->sc_mbuf_net = NULL;
 		sc->sc_statep_net.s = NULL;
 	}
 
 	return pfsync_sendout_mbuf(sc, m);
 }
 
 #ifdef PFSYNC_TDB
 /*
  * Flush the pending TDB update packet.
  *
  * The TDB flush timer is cancelled, the pending mbuf is detached from the
  * softc, shown to bpf listeners (when compiled in) and handed to
  * pfsync_sendout_mbuf().  Returns 0 when nothing was pending, otherwise
  * the result of pfsync_sendout_mbuf().
  */
 int
 pfsync_tdb_sendout(struct pfsync_softc *sc)
 {
 #if NBPFILTER > 0
 #ifdef __FreeBSD__
 	struct ifnet *ifp = sc->sc_ifp;
 #else
 	struct ifnet *ifp = &sc->sc_if;
 #endif
 #endif
 	struct mbuf *m;
 
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	timeout_del(&sc->sc_tdb_tmo);
 
 	if (sc->sc_mbuf_tdb == NULL)
 		return (0);
 	/* Take ownership of the pending packet. */
 	m = sc->sc_mbuf_tdb;
 	sc->sc_mbuf_tdb = NULL;
 	sc->sc_statep_tdb.t = NULL;
 
 #if NBPFILTER > 0
 	if (ifp->if_bpf)
 #ifdef __FreeBSD__
 		BPF_MTAP(ifp, m);
 #else
 		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
 #endif
 #endif
 
 	return pfsync_sendout_mbuf(sc, m);
 }
 #endif
 
 /*
  * Wrap a finished pfsync packet in an IP header and pass it on for
  * transmission.  The mbuf is always consumed.
  *
  * Nothing is transmitted (the mbuf is simply freed) unless a sync
  * interface is configured or the sync peer differs from the pfsync
  * multicast group.  The destination is sc_sendaddr, which is reset to the
  * sync peer afterwards.  On FreeBSD the packet is queued on sc_ifq and the
  * send task is scheduled; elsewhere ip_output() is called directly.
  *
  * Always returns 0; failures are only reflected in pfsyncstats.
  */
 int
 pfsync_sendout_mbuf(struct pfsync_softc *sc, struct mbuf *m)
 {
 	struct sockaddr sa;
 	struct ip *ip;
 
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	if (sc->sc_sync_ifp ||
 #ifdef __FreeBSD__
 	    sc->sc_sync_peer.s_addr != htonl(INADDR_PFSYNC_GROUP)) {
 #else
 	    sc->sc_sync_peer.s_addr != INADDR_PFSYNC_GROUP) {
 #endif
 		/* Make room for the IP header in front of the pfsync data. */
 		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
 		if (m == NULL) {
 			pfsyncstats.pfsyncs_onomem++;
 			return (0);
 		}
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		/*
 		 * The FreeBSD branches store ip_len and ip_off without byte
 		 * swapping; the other branches convert with htons().
 		 */
 #ifdef __FreeBSD__
 		ip->ip_len = m->m_pkthdr.len;
 #else
 		ip->ip_len = htons(m->m_pkthdr.len);
 #endif
 		ip->ip_id = htons(ip_randomid());
 #ifdef __FreeBSD__
 		ip->ip_off = IP_DF;
 #else
 		ip->ip_off = htons(IP_DF);
 #endif
 		ip->ip_ttl = PFSYNC_DFLTTL;
 		ip->ip_p = IPPROTO_PFSYNC;
 		ip->ip_sum = 0;
 
 		bzero(&sa, sizeof(sa));
 		ip->ip_src.s_addr = INADDR_ANY;
 
 #ifdef __FreeBSD__
 		if (sc->sc_sendaddr.s_addr == htonl(INADDR_PFSYNC_GROUP))
 #else
 		if (sc->sc_sendaddr.s_addr == INADDR_PFSYNC_GROUP)
 #endif
 			m->m_flags |= M_MCAST;
 		/* Use the current send address once, then fall back to the peer. */
 		ip->ip_dst = sc->sc_sendaddr;
 		sc->sc_sendaddr.s_addr = sc->sc_sync_peer.s_addr;
 
 		pfsyncstats.pfsyncs_opackets++;
 
 #ifdef __FreeBSD__
 		/*
 		 * Defer the actual ip_output() to the send task
 		 * (presumably pfsync_senddef(), which drains sc_ifq).
 		 */
 		if (!IF_HANDOFF(&sc->sc_ifq, m, NULL))
 			pfsyncstats.pfsyncs_oerrors++;
 		taskqueue_enqueue(taskqueue_thread, &pfsyncif->sc_send_task);
 #else
 		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL))
 			pfsyncstats.pfsyncs_oerrors++;
 #endif
 	} else
 		m_freem(m);
 
 	return (0);
 }
 
 #ifdef PFSYNC_TDB
 /*
  * Update an in-kernel tdb. Silently fail if no tdb is found.
  *
  * `pt' is a TDB record as received; its rpl and cur_bytes fields are
  * converted to host byte order in place.  Records with a reserved SPI, an
  * address family other than AF_INET/AF_INET6, or counters lower than those
  * of the local tdb are rejected and counted in pfsyncs_badstate.
  */
 void
 pfsync_update_net_tdb(struct pfsync_tdb *pt)
 {
 	struct tdb		*tdb;
 	int			 s;
 
 	/* check for invalid values */
 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
 	    (pt->dst.sa.sa_family != AF_INET &&
 	     pt->dst.sa.sa_family != AF_INET6))
 		goto bad;
 
 	s = spltdb();
 	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
 	if (tdb) {
 		pt->rpl = ntohl(pt->rpl);
 		pt->cur_bytes = betoh64(pt->cur_bytes);
 
 		/* Neither replay nor byte counter should ever decrease. */
 		if (pt->rpl < tdb->tdb_rpl ||
 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
 			splx(s);
 			goto bad;
 		}
 
 		tdb->tdb_rpl = pt->rpl;
 		tdb->tdb_cur_bytes = pt->cur_bytes;
 	}
 	splx(s);
 	return;
 
  bad:
 	if (pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
 		    "invalid value\n");
 	pfsyncstats.pfsyncs_badstate++;
 	return;
 }
 
 /*
  * One of our local tdbs have been updated, need to sync rpl with others
  *
  * tdb    - the local tdb whose replay and byte counters are to be sent.
  * output - non-zero for an outbound tdb; the replay counter sent is then
  *          raised by RPL_INCR (see below).
  *
  * The update is appended to the pending TDB packet, or merged into an
  * existing record for the same tdb when sc_maxupdates is set.  The packet
  * is flushed once it holds sc_maxcount records or the record has been
  * updated sc_maxupdates times.
  *
  * Returns 0 when there is no pfsync softc, nobody is listening or the
  * update was merely queued, ENOMEM when no mbuf could be allocated, and
  * otherwise the result of pfsync_tdb_sendout().
  */
 int
 pfsync_update_tdb(struct tdb *tdb, int output)
 {
 	struct ifnet *ifp = NULL;
 	struct pfsync_softc *sc = pfsyncif;
 	struct pfsync_header *h;
 	struct pfsync_tdb *pt = NULL;
 	/* ret must start at 0: it is returned as is when nothing is flushed. */
 	int s, i, ret = 0;
 
 	if (sc == NULL)
 		return (0);
 
 #ifdef __FreeBSD__
 	ifp = sc->sc_ifp;
 #else
 	ifp = &sc->sc_if;
 #endif
 	/* No bpf listener, no sync device and no explicit peer: do nothing. */
 	if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL &&
 #ifdef __FreeBSD__
 	    sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
 #else
 	    sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
 #endif
 		/* Don't leave any stale pfsync packets hanging around. */
 		if (sc->sc_mbuf_tdb != NULL) {
 			m_freem(sc->sc_mbuf_tdb);
 			sc->sc_mbuf_tdb = NULL;
 			sc->sc_statep_tdb.t = NULL;
 		}
 		return (0);
 	}
 
 #ifdef __FreeBSD__
 	PF_ASSERT(MA_OWNED);
 #endif
 	s = splnet();
 	if (sc->sc_mbuf_tdb == NULL) {
 		/* No TDB packet pending: start a new one. */
 		if ((sc->sc_mbuf_tdb = pfsync_get_mbuf(sc, PFSYNC_ACT_TDB_UPD,
 		    (void *)&sc->sc_statep_tdb.t)) == NULL) {
 			splx(s);
 			return (ENOMEM);
 		}
 		h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
 	} else {
 		h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
 		if (h->action != PFSYNC_ACT_TDB_UPD) {
 			/*
 			 * XXX will never happen as long as there's
 			 * only one "TDB action".
 			 */
 			pfsync_tdb_sendout(sc);
 			sc->sc_mbuf_tdb = pfsync_get_mbuf(sc,
 			    PFSYNC_ACT_TDB_UPD, (void *)&sc->sc_statep_tdb.t);
 			if (sc->sc_mbuf_tdb == NULL) {
 				splx(s);
 				return (ENOMEM);
 			}
 			h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
 		} else if (sc->sc_maxupdates) {
 			/*
 			 * If it's an update, look in the packet to see if
 			 * we already have an update for the state.
 			 */
 			struct pfsync_tdb *u =
 			    (void *)((char *)h + PFSYNC_HDRLEN);
 
 			/* The loop ends as soon as pt has been set. */
 			for (i = 0; !pt && i < h->count; i++) {
 				if (tdb->tdb_spi == u->spi &&
 				    tdb->tdb_sproto == u->sproto &&
 				    !bcmp(&tdb->tdb_dst, &u->dst,
 				    SA_LEN(&u->dst.sa))) {
 					pt = u;
 					pt->updates++;
 				}
 				u++;
 			}
 		}
 	}
 
 	if (pt == NULL) {
 		/* not a "duplicate" update */
 		pt = sc->sc_statep_tdb.t++;
 		sc->sc_mbuf_tdb->m_pkthdr.len =
 		    sc->sc_mbuf_tdb->m_len += sizeof(struct pfsync_tdb);
 		h->count++;
 		bzero(pt, sizeof(*pt));
 
 		pt->spi = tdb->tdb_spi;
 		memcpy(&pt->dst, &tdb->tdb_dst, sizeof pt->dst);
 		pt->sproto = tdb->tdb_sproto;
 	}
 
 	/*
 	 * When a failover happens, the master's rpl is probably above
 	 * what we see here (we may be up to a second late), so
 	 * increase it a bit for outbound tdbs to manage most such
 	 * situations.
 	 *
 	 * For now, just add an offset that is likely to be larger
 	 * than the number of packets we can see in one second. The RFC
 	 * just says the next packet must have a higher seq value.
 	 *
 	 * XXX What is a good algorithm for this? We could use
 	 * a rate-determined increase, but to know it, we would have
 	 * to extend struct tdb.
 	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
 	 * will soon be replaced anyway. For now, just don't handle
 	 * this edge case.
 	 */
 #define RPL_INCR 16384
 	pt->rpl = htonl(tdb->tdb_rpl + (output ? RPL_INCR : 0));
 	pt->cur_bytes = htobe64(tdb->tdb_cur_bytes);
 
 	/* Flush when the packet is full or the record was updated enough. */
 	if (h->count == sc->sc_maxcount ||
 	    (sc->sc_maxupdates && (pt->updates >= sc->sc_maxupdates)))
 		ret = pfsync_tdb_sendout(sc);
 
 	splx(s);
 	return (ret);
 }
 #endif /* PFSYNC_TDB */
 
 #ifdef __FreeBSD__
 /*
  * Interface departure handler: forget the sync interface if it is the one
  * going away.
  *
  * arg - the pfsync softc (may be NULL).
  * ifp - the interface being detached.
  *
  * Nothing is done unless `ifp' is the configured sync interface.  In that
  * case the sync interface pointer is cleared, any pending compressed
  * packet is dropped and the multicast membership bookkeeping is reset.
  */
 void
 pfsync_ifdetach(void *arg, struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = (struct pfsync_softc *)arg;
 	struct ip_moptions *imo;
 
 	if (sc == NULL || sc->sc_sync_ifp != ifp)
 		return;         /* not for us; unlocked read */
 
 	PF_LOCK();
 
 	/* Deal with a member interface going away from under us. */
 	sc->sc_sync_ifp = NULL;
 	if (sc->sc_mbuf_net != NULL) {
 		m_freem(sc->sc_mbuf_net);
 		sc->sc_mbuf_net = NULL;
 		sc->sc_statep_net.s = NULL;
 	}
 	imo = &sc->sc_imo;
 	if (imo->imo_num_memberships > 0) {
 		KASSERT(imo->imo_num_memberships == 1,
 		    ("%s: imo_num_memberships != 1", __func__));
 		/*
 		 * Our event handler is always called after protocol
 		 * domains have been detached from the underlying ifnet.
 		 * Do not call in_delmulti(); we held a single reference
 		 * which the protocol domain has purged in in_purgemaddrs().
 		 */
 		/*
 		 * NOTE(review): the lock is dropped around a plain pointer
 		 * store here; this mirrors the unlock/lock pattern used
 		 * around in_delmulti() in the ioctl path - confirm whether
 		 * it is still needed.
 		 */
 		PF_UNLOCK();
 		imo->imo_membership[--imo->imo_num_memberships] = NULL;
 		PF_LOCK();
 		imo->imo_multicast_ifp = NULL;
 	}
 
 	PF_UNLOCK();
 }
 
 /*
  * Deferred send handler: drain sc_ifq and transmit every queued packet
  * with ip_output().
  *
  * arg     - the pfsync softc.
  * pending - unused.
  *
  * Packets found on the queue after the sync interface has gone away are
  * freed and counted as output errors, as are packets ip_output() rejects.
  */
 void
 pfsync_senddef(void *arg, __unused int pending)
 {
 	struct pfsync_softc *softc = (struct pfsync_softc *)arg;
 	struct mbuf *pkt;
 
 	IF_DEQUEUE(&softc->sc_ifq, pkt);
 	while (pkt != NULL) {
 		if (softc->sc_sync_ifp == NULL) {
 			/* Deal with a member interface going away from under us. */
 			pfsyncstats.pfsyncs_oerrors++;
 			m_freem(pkt);
 		} else if (ip_output(pkt, NULL, NULL, IP_RAWOUTPUT,
 		    &softc->sc_imo, NULL)) {
 			pfsyncstats.pfsyncs_oerrors++;
 		}
 		IF_DEQUEUE(&softc->sc_ifq, pkt);
 	}
 }
 
 /*
  * Module event handler: attach pfsync on load, detach the interface
  * cloner on unload.  Any other event type yields EINVAL.
  */
 static int
 pfsync_modevent(module_t mod, int type, void *data)
 {
 	if (type == MOD_LOAD) {
 		pfsyncattach(0);
 		return 0;
 	}
 
 	if (type == MOD_UNLOAD) {
 		if_clone_detach(&pfsync_cloner);
 		return 0;
 	}
 
 	return EINVAL;
 }
 
 /* Module glue: name, event handler and (unused) private data. */
 static moduledata_t pfsync_mod = {
 	"pfsync",
 	pfsync_modevent,
 	0
 };
 
 #define PFSYNC_MODVER 1
 
 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
 MODULE_VERSION(pfsync, PFSYNC_MODVER);
 /* The dependency on pf is declared by this module, pfsync, not pflog. */
 MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
 #endif /* __FreeBSD__ */
Index: head/sys/net/bpf.c
===================================================================
--- head/sys/net/bpf.c	(revision 171636)
+++ head/sys/net/bpf.c	(revision 171637)
@@ -1,1873 +1,1873 @@
 /*-
  * Copyright (c) 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from the Stanford/CMU enet packet filter,
  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  * Berkeley Laboratory.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
  *
  * $FreeBSD$
  */
 
 #include "opt_bpf.h"
 #include "opt_mac.h"
 #include "opt_netgraph.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/ttycom.h>
 #include <sys/uio.h>
 
 #include <sys/event.h>
 #include <sys/file.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/bpf.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
 #include <net/bpfdesc.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #include <net80211/ieee80211_freebsd.h>
 
 #include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
 #define PRINET  26			/* interruptible */
 
 #define	M_SKIP_BPF	M_SKIP_FIREWALL
 
 /*
  * bpf_iflist is a list of BPF interface structures, each corresponding to a
  * specific DLT.  The same network interface might have several BPF interface
  * structures registered by different layers in the stack (i.e., 802.11
  * frames, ethernet frames, etc).
  */
 static LIST_HEAD(, bpf_if)	bpf_iflist;
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
 static void	bpf_allocbufs(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
 static void	bpf_mcopy(const void *, void *, size_t);
 static int	bpf_movein(struct uio *, int, int, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
 static void	catchpacket(struct bpf_d *, u_char *, u_int,
 		    u_int, void (*)(const void *, void *, size_t),
 		    struct timeval *);
 static void	reset_d(struct bpf_d *);
 static int	 bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 static int	bpf_setdlt(struct bpf_d *, u_int);
 static void	filt_bpfdetach(struct knote *);
 static int	filt_bpfread(struct knote *, long);
 static void	bpf_drvinit(void *);
 static void	bpf_clone(void *, struct ucred *, char *, int, struct cdev **);
 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 /*
  * net.bpf sysctl tree.
  *   bufsize    - buffer size given to a descriptor when it is opened.
  *   maxbufsize - upper clamp applied to BIOCSBLEN requests.
  *   maxinsns   - largest filter program (in instructions) bpf_setf accepts.
  *   stats      - node served by bpf_stats_sysctl.
  */
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
 static int bpf_bufsize = 4096;
 SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
     &bpf_bufsize, 0, "Default bpf buffer size");
 static int bpf_maxbufsize = BPF_MAXBUFSIZE;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
     &bpf_maxbufsize, 0, "Maximum bpf buffer size");
 static int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
 SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
 static	d_open_t	bpfopen;
 static	d_close_t	bpfclose;
 static	d_read_t	bpfread;
 static	d_write_t	bpfwrite;
 static	d_ioctl_t	bpfioctl;
 static	d_poll_t	bpfpoll;
 static	d_kqfilter_t	bpfkqfilter;
 
 /*
  * Character-device switch for bpf devices: routes open/close/read/write/
  * ioctl/poll/kqfilter requests to the bpf* entry points in this file.
  */
 static struct cdevsw bpf_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	bpfopen,
 	.d_close =	bpfclose,
 	.d_read =	bpfread,
 	.d_write =	bpfwrite,
 	.d_ioctl =	bpfioctl,
 	.d_poll =	bpfpoll,
 	.d_name =	"bpf",
 	.d_kqfilter =	bpfkqfilter,
 };
 
 /*
  * Filter operations installed on a knote by bpfkqfilter() for EVFILT_READ.
  * NOTE(review): the initializer is positional; the meaning of the leading
  * "1" and the NULL slot comes from struct filterops -- confirm against its
  * declaration before reordering.
  */
 static struct filterops bpfread_filtops =
 	{ 1, NULL, filt_bpfdetach, filt_bpfread };
 
 /*
  * Copy a packet written by userland into a newly allocated mbuf.
  *
  * uio      - source of the packet bytes; uio_resid is the packet length.
  * linktype - DLT_* of the attached interface; selects the address family
  *            stored in sockp and the link-header length.
  * mtu      - the payload (length minus link header) may not exceed this.
  * mp       - receives the new mbuf.
  * sockp    - sockaddr filled with the family and, when the link type has a
  *            header, a copy of that header in sa_data.
  * hdrlen   - receives the link-header length on success.
  * wfilter  - write filter program run over the packet.
  *
  * Returns 0 on success, otherwise an errno: EIO (unsupported link type or
  * packet larger than MCLBYTES), EMSGSIZE (payload larger than mtu), ENOBUFS
  * (no mbuf), EPERM (packet shorter than the link header, or rejected by the
  * write filter), EINVAL (802.11 parameter header too long for sa_data), or
  * whatever uiomove() returned.
  *
  * Note that *mp is assigned before the later checks run and the mbuf is
  * freed on every failure path after that point, so callers must not touch
  * *mp when a non-zero value is returned.
  */
 static int
 bpf_movein(struct uio *uio, int linktype, int mtu, struct mbuf **mp,
     struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
 {
 	const struct ieee80211_bpf_params *p;
 	struct mbuf *m;
 	int error;
 	int len;
 	int hlen;
 	int slen;
 
 	/*
 	 * Build a sockaddr based on the data link layer type.
 	 * We do this at this level because the ethernet header
 	 * is copied directly into the data field of the sockaddr.
 	 * In the case of SLIP, there is no header and the packet
 	 * is forwarded as is.
 	 * Also, we are careful to leave room at the front of the mbuf
 	 * for the link level header.
 	 */
 	switch (linktype) {
 
 	case DLT_SLIP:
 		sockp->sa_family = AF_INET;
 		hlen = 0;
 		break;
 
 	case DLT_EN10MB:
 		sockp->sa_family = AF_UNSPEC;
 		/* XXX Would MAXLINKHDR be better? */
 		hlen = ETHER_HDR_LEN;
 		break;
 
 	case DLT_FDDI:
 		sockp->sa_family = AF_IMPLINK;
 		hlen = 0;
 		break;
 
 	case DLT_RAW:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 0;
 		break;
 
 	case DLT_NULL:
 		/*
 		 * null interface types require a 4 byte pseudo header which
 		 * corresponds to the address family of the packet.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;
 		break;
 
 	case DLT_ATM_RFC1483:
 		/*
 		 * en atm driver requires 4-byte atm pseudo header.
 		 * though it isn't standard, vpi:vci needs to be
 		 * specified anyway.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 		break;
 
 	case DLT_PPP:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;	/* This should match PPP_HDRLEN */
 		break;
 
 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
 		sockp->sa_family = AF_IEEE80211;
 		hlen = 0;
 		break;
 
 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
 		sockp->sa_family = AF_IEEE80211;
 		sockp->sa_len = 12;	/* XXX != 0 */
 		hlen = sizeof(struct ieee80211_bpf_params);
 		break;
 
 	default:
 		return (EIO);
 	}
 
 	len = uio->uio_resid;
 
 	/* Only the payload, not the link header, counts against the MTU. */
 	if (len - hlen > mtu)
 		return (EMSGSIZE);
 
 	/* The whole packet must fit in a single mbuf (cluster). */
 	if ((unsigned)len > MCLBYTES)
 		return (EIO);
 
 	if (len > MHLEN) {
 		m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
 	} else {
 		MGETHDR(m, M_TRYWAIT, MT_DATA);
 	}
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_pkthdr.rcvif = NULL;
 	*mp = m;
 
 	/* The packet must at least contain the link header. */
 	if (m->m_len < hlen) {
 		error = EPERM;
 		goto bad;
 	}
 
 	error = uiomove(mtod(m, u_char *), len, uio);
 	if (error)
 		goto bad;
 
 	/* A zero result from the write filter rejects the packet. */
 	slen = bpf_filter(wfilter, mtod(m, u_char *), len, len);
 	if (slen == 0) {
 		error = EPERM;
 		goto bad;
 	}
 
 	/*
 	 * Make room for link header, and copy it to sockaddr
 	 */
 	if (hlen != 0) {
 		if (sockp->sa_family == AF_IEEE80211) {
 			/*
 			 * Collect true length from the parameter header
 			 * NB: sockp is known to be zero'd so if we do a
 			 *     short copy unspecified parameters will be
 			 *     zero.
 			 * NB: packet may not be aligned after stripping
 			 *     bpf params
 			 * XXX check ibp_vers
 			 *
 			 * NOTE(review): ibp_len comes from the user-supplied
 			 * packet and is only bounded by sizeof(sa_data); it
 			 * is not re-checked against len here -- confirm that
 			 * it cannot exceed the packet length.
 			 */
 			p = mtod(m, const struct ieee80211_bpf_params *);
 			hlen = p->ibp_len;
 			if (hlen > sizeof(sockp->sa_data)) {
 				error = EINVAL;
 				goto bad;
 			}
 		}
 		bcopy(m->m_data, sockp->sa_data, hlen);
 	}
 	*hdrlen = hlen;
 
 	return (0);
 bad:
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Make descriptor d listen on BPF interface bp.  With bp's lock held, the
  * descriptor is pointed at bp, linked at the head of bp's listener list,
  * and the global count of attached descriptors is incremented.
  */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
 
 	BPFIF_LOCK(bp);
 
 	d->bd_bif = bp;
 	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 	bpf_bpfd_cnt += 1;
 
 	BPFIF_UNLOCK(bp);
 }
 
 /*
  * Detach a file from its interface.
  */
 static void
 bpf_detachd(struct bpf_d *d)
 {
 	int error;
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 
 	bp = d->bd_bif;
 	BPFIF_LOCK(bp);
 	BPFD_LOCK(d);
 	ifp = d->bd_bif->bif_ifp;
 
 	/*
 	 * Remove d from the interface's descriptor list.
 	 */
 	LIST_REMOVE(d, bd_next);
 
 	bpf_bpfd_cnt--;
 	d->bd_bif = NULL;
 	BPFD_UNLOCK(d);
 	BPFIF_UNLOCK(bp);
 
 	/*
 	 * Check if this descriptor had requested promiscuous mode.
 	 * If so, turn it off.
 	 */
 	if (d->bd_promisc) {
 		d->bd_promisc = 0;
 		error = ifpromisc(ifp, 0);
 		if (error != 0 && error != ENXIO) {
 			/*
 			 * ENXIO can happen if a pccard is unplugged
 			 * Something is really wrong if we were able to put
 			 * the driver into promiscuous mode, but can't
 			 * take it out.
 			 */
 			if_printf(bp->bif_ifp,
 				"bpf_detach: ifpromisc failed (%d)\n", error);
 		}
 	}
 }
 
 /*
  * Open a bpf device and allocate its descriptor.  Each minor may be held
  * open by only one opener at a time: returns EBUSY if the minor is already
  * in use, 0 on success.
  */
 /* ARGSUSED */
 static	int
 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct bpf_d *d;
 
 	mtx_lock(&bpf_mtx);
 	d = dev->si_drv1;
 	/*
 	 * Each minor can be opened by only one process.  If the requested
 	 * minor is in use, return EBUSY.
 	 */
 	if (d != NULL) {
 		mtx_unlock(&bpf_mtx);
 		return (EBUSY);
 	}
 	/*
 	 * Store a non-NULL placeholder while bpf_mtx is still held so that a
 	 * concurrent open sees the minor as busy; the real descriptor is
 	 * allocated below, after the lock has been dropped.
 	 */
 	dev->si_drv1 = (struct bpf_d *)~0;	/* mark device in use */
 	mtx_unlock(&bpf_mtx);
 
 	if ((dev->si_flags & SI_NAMED) == 0)
 		make_dev(&bpf_cdevsw, minor(dev), UID_ROOT, GID_WHEEL, 0600,
 		    "bpf%d", dev2unit(dev));
 	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	dev->si_drv1 = d;
 	/* Defaults: global buffer size, SIGIO, both directions, opener's pid. */
 	d->bd_bufsize = bpf_bufsize;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
 	d->bd_pid = td->td_proc->p_pid;
 #ifdef MAC
 	mac_init_bpfdesc(d);
 	mac_create_bpfdesc(td->td_ucred, d);
 #endif
 	mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF);
-	callout_init(&d->bd_callout, NET_CALLOUT_MPSAFE);
+	callout_init(&d->bd_callout, CALLOUT_MPSAFE);
 	knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL);
 
 	return (0);
 }
 
 /*
  * Close the descriptor by detaching it from its interface,
  * deallocating its buffers, and marking it free.
  * Always returns 0.
  */
 /* ARGSUSED */
 static	int
 bpfclose(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct bpf_d *d = dev->si_drv1;
 
 	/* Cancel a pending read timeout and return the descriptor to idle. */
 	BPFD_LOCK(d);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 	funsetown(&d->bd_sigio);
 	/* Detach from the interface, if attached, under the global lock. */
 	mtx_lock(&bpf_mtx);
 	if (d->bd_bif)
 		bpf_detachd(d);
 	mtx_unlock(&bpf_mtx);
 	selwakeuppri(&d->bd_sel, PRINET);
 #ifdef MAC
 	mac_destroy_bpfdesc(d);
 #endif /* MAC */
 	knlist_destroy(&d->bd_sel.si_note);
 	bpf_freed(d);
 	/* Clearing si_drv1 is what lets bpfopen() accept this minor again. */
 	dev->si_drv1 = NULL;
 	free(d, M_BPF);
 
 	return (0);
 }
 
 
 /*
  * Rotate the packet buffers in descriptor d.  Move the store buffer
  * into the hold slot, and the free buffer into the store slot.
  * Zero the length of the new store buffer.
  *
  * The body is wrapped in do { } while (0) so that the macro expands to a
  * single statement and stays correct under an unbraced if/else; invoke it
  * with a trailing semicolon.
  */
 #define ROTATE_BUFFERS(d) do { \
 	(d)->bd_hbuf = (d)->bd_sbuf; \
 	(d)->bd_hlen = (d)->bd_slen; \
 	(d)->bd_sbuf = (d)->bd_fbuf; \
 	(d)->bd_slen = 0; \
 	(d)->bd_fbuf = NULL; \
 } while (0)
 /*
  *  bpfread - read next chunk of packets from buffers
  */
 /*
  * Read entry point.  Hands the caller the contents of the hold buffer,
  * sleeping (subject to the read timeout and O_NONBLOCK) until a buffer is
  * available.  The caller's buffer must be exactly bd_bufsize bytes.
  *
  * Returns 0 on success (possibly with no data after a timeout), EINVAL for
  * a wrongly sized read, ENXIO when no data is pending and the descriptor is
  * not attached to an interface, EWOULDBLOCK for a non-blocking read with
  * nothing to return, EINTR/ERESTART when the sleep is interrupted, or the
  * error from uiomove().
  */
 static	int
 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d = dev->si_drv1;
 	int timed_out;
 	int error;
 
 	/*
 	 * Restrict application to use a buffer the same size as
 	 * as kernel buffers.
 	 */
 	if (uio->uio_resid != d->bd_bufsize)
 		return (EINVAL);
 
 	BPFD_LOCK(d);
 	/* Cancel a poll/kevent-armed timeout; remember whether it had fired. */
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
 	d->bd_state = BPF_IDLE;
 	/*
 	 * If the hold buffer is empty, then do a timed sleep, which
 	 * ends when the timeout expires or when enough packets
 	 * have arrived to fill the store buffer.
 	 */
 	while (d->bd_hbuf == NULL) {
 		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
 			/*
 			 * A packet(s) either arrived since the previous
 			 * read or arrived while we were asleep.
 			 * Rotate the buffers and return what's here.
 			 */
 			ROTATE_BUFFERS(d);
 			break;
 		}
 
 		/*
 		 * No data is available, check to see if the bpf device
 		 * is still pointed at a real interface.  If not, return
 		 * ENXIO so that the userland process knows to rebind
 		 * it before using it again.
 		 */
 		if (d->bd_bif == NULL) {
 			BPFD_UNLOCK(d);
 			return (ENXIO);
 		}
 
 		if (ioflag & O_NONBLOCK) {
 			BPFD_UNLOCK(d);
 			return (EWOULDBLOCK);
 		}
 		error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
 		     "bpf", d->bd_rtout);
 		if (error == EINTR || error == ERESTART) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 		if (error == EWOULDBLOCK) {
 			/*
 			 * On a timeout, return what's in the buffer,
 			 * which may be nothing.  If there is something
 			 * in the store buffer, we can rotate the buffers.
 			 */
 			if (d->bd_hbuf)
 				/*
 				 * We filled up the buffer in between
 				 * getting the timeout and arriving
 				 * here, so we don't need to rotate.
 				 */
 				break;
 
 			if (d->bd_slen == 0) {
 				BPFD_UNLOCK(d);
 				return (0);
 			}
 			ROTATE_BUFFERS(d);
 			break;
 		}
 	}
 	/*
 	 * At this point, we know we have something in the hold slot.
 	 */
 	BPFD_UNLOCK(d);
 
 	/*
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bpf_bufsize bytes.
 	 *
 	 * NOTE(review): bd_hbuf/bd_hlen are read here without the
 	 * descriptor lock; presumably nothing recycles the hold buffer
 	 * during the copy -- confirm against reset_d() callers.
 	 */
 	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
 
 	/* The hold buffer has been consumed; make it the free buffer. */
 	BPFD_LOCK(d);
 	d->bd_fbuf = d->bd_hbuf;
 	d->bd_hbuf = NULL;
 	d->bd_hlen = 0;
 	BPFD_UNLOCK(d);
 
 	return (error);
 }
 
 
 /*
  * If there are processes sleeping on this descriptor, wake them up.
  *
  * The descriptor lock must be held (asserted).  A pending read-timeout
  * callout is cancelled and the state returned to idle; then sleepers on d
  * are woken, the configured signal is posted when async notification is
  * enabled and an owner is set, and select/poll and kqueue waiters are
  * notified.
  */
 static __inline void
 bpf_wakeup(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 	if (d->bd_state == BPF_WAITING) {
 		callout_stop(&d->bd_callout);
 		d->bd_state = BPF_IDLE;
 	}
 	wakeup(d);
 	if (d->bd_async && d->bd_sig && d->bd_sigio)
 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
 
 	selwakeuppri(&d->bd_sel, PRINET);
 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
 }
 
 /*
  * Read-timeout callout handler.  If the descriptor is still waiting, mark
  * it as timed out and, when the store buffer holds data, wake up whoever
  * is waiting on it.
  */
 static void
 bpf_timed_out(void *arg)
 {
 	struct bpf_d *desc = (struct bpf_d *)arg;
 
 	BPFD_LOCK(desc);
 	if (desc->bd_state != BPF_WAITING) {
 		BPFD_UNLOCK(desc);
 		return;
 	}
 	desc->bd_state = BPF_TIMED_OUT;
 	if (desc->bd_slen != 0)
 		bpf_wakeup(desc);
 	BPFD_UNLOCK(desc);
 }
 
 /*
  * Write entry point: inject one packet, supplied by userland, on the
  * interface the descriptor is attached to.
  *
  * Returns ENXIO when not attached, ENETDOWN when the interface is not up,
  * 0 for an empty write, any error from bpf_movein(), or otherwise the
  * result of the interface's if_output routine.
  */
 static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d = dev->si_drv1;
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
 	struct sockaddr dst;
 	int error, hlen;
 
 	if (d->bd_bif == NULL)
 		return (ENXIO);
 
 	ifp = d->bd_bif->bif_ifp;
 
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return (ENETDOWN);
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	/* bpf_movein() relies on dst being zeroed before it fills it in. */
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu,
 	    &m, &dst, &hlen, d->bd_wfilter);
 	if (error)
 		return (error);
 
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
 
 	/*
 	 * In feedback mode a duplicate of the packet is made so that it can
 	 * be passed to the interface's input routine after a successful
 	 * send.  A failed m_dup() simply disables feedback for this packet.
 	 */
 	if (d->bd_feedback) {
 		mc = m_dup(m, M_DONTWAIT);
 		if (mc != NULL)
 			mc->m_pkthdr.rcvif = ifp;
 		/* XXX Do not return the same packet twice. */
 		if (d->bd_direction == BPF_D_INOUT)
 			m->m_flags |= M_SKIP_BPF;
 	} else
 		mc = NULL;
 
 	/* Strip the link header that bpf_movein() copied into dst. */
 	m->m_pkthdr.len -= hlen;
 	m->m_len -= hlen;
 	m->m_data += hlen;	/* XXX */
 
 #ifdef MAC
 	BPFD_LOCK(d);
 	mac_create_mbuf_from_bpfdesc(d, m);
 	if (mc != NULL)
 		mac_create_mbuf_from_bpfdesc(d, mc);
 	BPFD_UNLOCK(d);
 #endif
 
 	NET_LOCK_GIANT();
 	error = (*ifp->if_output)(ifp, m, &dst, NULL);
 	NET_UNLOCK_GIANT();
 
 	/* Feed the copy back in only if the send succeeded; else drop it. */
 	if (mc != NULL) {
 		if (error == 0) {
 			NET_LOCK_GIANT();
 			(*ifp->if_input)(ifp, mc);
 			NET_UNLOCK_GIANT();
 		} else
 			m_freem(mc);
 	}
 
 	return (error);
 }
 
 /*
  * Flush descriptor d: any hold buffer is handed back as the free buffer,
  * both buffer lengths are zeroed, and the receive, drop and filter-match
  * counters are cleared.  The descriptor mutex must be owned by the caller.
  */
 static void
 reset_d(struct bpf_d *d)
 {
 
 	mtx_assert(&d->bd_mtx, MA_OWNED);
 
 	/* Recycle the hold buffer, if there is one. */
 	if (d->bd_hbuf != NULL) {
 		d->bd_fbuf = d->bd_hbuf;
 		d->bd_hbuf = NULL;
 	}
 
 	/* Nothing is buffered any more. */
 	d->bd_hlen = 0;
 	d->bd_slen = 0;
 
 	/* Start the statistics over. */
 	d->bd_fcount = 0;
 	d->bd_dcount = 0;
 	d->bd_rcount = 0;
 }
 
 /*
  *  FIONREAD		Check for read packet available.
  *  SIOCGIFADDR		Get interface address - convenient hook to driver.
  *  BIOCGBLEN		Get buffer len [for read()].
  *  BIOCSETF		Set ethernet read filter.
  *  BIOCSETWF		Set ethernet write filter.
  *  BIOCFLUSH		Flush read packet buffer.
  *  BIOCPROMISC		Put interface into promiscuous mode.
  *  BIOCGDLT		Get link layer type.
  *  BIOCGETIF		Get interface name.
  *  BIOCSETIF		Set interface.
  *  BIOCSRTIMEOUT	Set read timeout.
  *  BIOCGRTIMEOUT	Get read timeout.
  *  BIOCGSTATS		Get packet stats.
  *  BIOCIMMEDIATE	Set immediate mode.
  *  BIOCVERSION		Get filter language version.
  *  BIOCGHDRCMPLT	Get "header already complete" flag
  *  BIOCSHDRCMPLT	Set "header already complete" flag
  *  BIOCGDIRECTION	Get packet direction flag
  *  BIOCSDIRECTION	Set packet direction flag
  *  BIOCLOCK		Set "locked" flag
  *  BIOCFEEDBACK	Set packet feedback mode.
  */
 /* ARGSUSED */
 /*
  * ioctl entry point for bpf descriptors.  Returns 0 on success or an errno:
  * EINVAL for unknown commands and for requests that are invalid in the
  * descriptor's current state, EPERM for commands refused on a locked
  * descriptor, or the error reported by the helper a command delegates to.
  */
 static	int
 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
     struct thread *td)
 {
 	struct bpf_d *d = dev->si_drv1;
 	int error = 0;
 
 	/* 
 	 * Refresh PID associated with this descriptor.
 	 * Any ioctl also cancels a pending read timeout and returns the
 	 * descriptor to the idle state.
 	 */
 	BPFD_LOCK(d);
 	d->bd_pid = td->td_proc->p_pid;
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 
 	/*
 	 * Once BIOCLOCK has been issued only the commands listed here are
 	 * still accepted; everything else fails with EPERM.
 	 */
 	if (d->bd_locked == 1) {
 		switch (cmd) {
 		case BIOCGBLEN:
 		case BIOCFLUSH:
 		case BIOCGDLT:
 		case BIOCGDLTLIST: 
 		case BIOCGETIF:
 		case BIOCGRTIMEOUT:
 		case BIOCGSTATS:
 		case BIOCVERSION:
 		case BIOCGRSIG:
 		case BIOCGHDRCMPLT:
 		case BIOCFEEDBACK:
 		case FIONREAD:
 		case BIOCLOCK:
 		case BIOCSRTIMEOUT:
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
 			break;
 		default:
 			return (EPERM);
 		}
 	}
 	switch (cmd) {
 
 	default:
 		error = EINVAL;
 		break;
 
 	/*
 	 * Check for read packet available.
 	 */
 	case FIONREAD:
 		{
 			int n;
 
 			/* Bytes in the store buffer plus any hold buffer. */
 			BPFD_LOCK(d);
 			n = d->bd_slen;
 			if (d->bd_hbuf)
 				n += d->bd_hlen;
 			BPFD_UNLOCK(d);
 
 			*(int *)addr = n;
 			break;
 		}
 
 	/* Forwarded to the attached interface's own ioctl routine. */
 	case SIOCGIFADDR:
 		{
 			struct ifnet *ifp;
 
 			if (d->bd_bif == NULL)
 				error = EINVAL;
 			else {
 				NET_LOCK_GIANT();
 				ifp = d->bd_bif->bif_ifp;
 				error = (*ifp->if_ioctl)(ifp, cmd, addr);
 				NET_UNLOCK_GIANT();
 			}
 			break;
 		}
 
 	/*
 	 * Get buffer len [for read()].
 	 */
 	case BIOCGBLEN:
 		*(u_int *)addr = d->bd_bufsize;
 		break;
 
 	/*
 	 * Set buffer length.
 	 * Only allowed before an interface is attached; the requested size
 	 * is clamped to [BPF_MINBUFSIZE, bpf_maxbufsize] and the value
 	 * actually used is written back to the caller.
 	 */
 	case BIOCSBLEN:
 		if (d->bd_bif != NULL)
 			error = EINVAL;
 		else {
 			u_int size = *(u_int *)addr;
 
 			if (size > bpf_maxbufsize)
 				*(u_int *)addr = size = bpf_maxbufsize;
 			else if (size < BPF_MINBUFSIZE)
 				*(u_int *)addr = size = BPF_MINBUFSIZE;
 			d->bd_bufsize = size;
 		}
 		break;
 
 	/*
 	 * Set link layer read filter.
 	 * (BIOCSETWF installs the write filter through the same helper.)
 	 */
 	case BIOCSETF:
 	case BIOCSETWF:
 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
 		break;
 
 	/*
 	 * Flush read packet buffer.
 	 */
 	case BIOCFLUSH:
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Put interface into promiscuous mode.
 	 */
 	case BIOCPROMISC:
 		if (d->bd_bif == NULL) {
 			/*
 			 * No interface attached yet.
 			 */
 			error = EINVAL;
 			break;
 		}
 		/* Request promiscuous mode at most once per descriptor. */
 		if (d->bd_promisc == 0) {
 			NET_LOCK_GIANT();
 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
 			NET_UNLOCK_GIANT();
 			if (error == 0)
 				d->bd_promisc = 1;
 		}
 		break;
 
 	/*
 	 * Get current data link type.
 	 */
 	case BIOCGDLT:
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			*(u_int *)addr = d->bd_bif->bif_dlt;
 		break;
 
 	/*
 	 * Get a list of supported data link types.
 	 */
 	case BIOCGDLTLIST:
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
 		break;
 
 	/*
 	 * Set data link type.
 	 */
 	case BIOCSDLT:
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_setdlt(d, *(u_int *)addr);
 		break;
 
 	/*
 	 * Get interface name.
 	 */
 	case BIOCGETIF:
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else {
 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
 			struct ifreq *const ifr = (struct ifreq *)addr;
 
 			strlcpy(ifr->ifr_name, ifp->if_xname,
 			    sizeof(ifr->ifr_name));
 		}
 		break;
 
 	/*
 	 * Set interface.
 	 */
 	case BIOCSETIF:
 		error = bpf_setif(d, (struct ifreq *)addr);
 		break;
 
 	/*
 	 * Set read timeout.
 	 */
 	case BIOCSRTIMEOUT:
 		{
 			struct timeval *tv = (struct timeval *)addr;
 
 			/*
 			 * Subtract 1 tick from tvtohz() since this isn't
 			 * a one-shot timer.
 			 */
 			if ((error = itimerfix(tv)) == 0)
 				d->bd_rtout = tvtohz(tv) - 1;
 			break;
 		}
 
 	/*
 	 * Get read timeout.
 	 */
 	case BIOCGRTIMEOUT:
 		{
 			struct timeval *tv = (struct timeval *)addr;
 
 			/* bd_rtout is kept in ticks; convert back. */
 			tv->tv_sec = d->bd_rtout / hz;
 			tv->tv_usec = (d->bd_rtout % hz) * tick;
 			break;
 		}
 
 	/*
 	 * Get packet stats.
 	 */
 	case BIOCGSTATS:
 		{
 			struct bpf_stat *bs = (struct bpf_stat *)addr;
 
 			bs->bs_recv = d->bd_rcount;
 			bs->bs_drop = d->bd_dcount;
 			break;
 		}
 
 	/*
 	 * Set immediate mode.
 	 */
 	case BIOCIMMEDIATE:
 		d->bd_immediate = *(u_int *)addr;
 		break;
 
 	case BIOCVERSION:
 		{
 			struct bpf_version *bv = (struct bpf_version *)addr;
 
 			bv->bv_major = BPF_MAJOR_VERSION;
 			bv->bv_minor = BPF_MINOR_VERSION;
 			break;
 		}
 
 	/*
 	 * Get "header already complete" flag
 	 */
 	case BIOCGHDRCMPLT:
 		*(u_int *)addr = d->bd_hdrcmplt;
 		break;
 
 	/*
 	 * Set "header already complete" flag
 	 */
 	case BIOCSHDRCMPLT:
 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
 		break;
 
 	/*
 	 * Get packet direction flag
 	 */
 	case BIOCGDIRECTION:
 		*(u_int *)addr = d->bd_direction;
 		break;
 
 	/*
 	 * Set packet direction flag
 	 */
 	case BIOCSDIRECTION:
 		{
 			u_int	direction;
 
 			direction = *(u_int *)addr;
 			switch (direction) {
 			case BPF_D_IN:
 			case BPF_D_INOUT:
 			case BPF_D_OUT:
 				d->bd_direction = direction;
 				break;
 			default:
 				error = EINVAL;
 			}
 		}
 		break;
 
 	case BIOCFEEDBACK:
 		d->bd_feedback = *(u_int *)addr;
 		break;
 
 	/* One-way: nothing in this function clears bd_locked again. */
 	case BIOCLOCK:
 		d->bd_locked = 1;
 		break;
 
 	case FIONBIO:		/* Non-blocking I/O */
 		break;
 
 	case FIOASYNC:		/* Send signal on receive packets */
 		d->bd_async = *(int *)addr;
 		break;
 
 	case FIOSETOWN:
 		error = fsetown(*(int *)addr, &d->bd_sigio);
 		break;
 
 	case FIOGETOWN:
 		*(int *)addr = fgetown(&d->bd_sigio);
 		break;
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
 		break;
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)addr = -fgetown(&d->bd_sigio);
 		break;
 
 	case BIOCSRSIG:		/* Set receive signal */
 		{
 			u_int sig;
 
 			sig = *(u_int *)addr;
 
 			if (sig >= NSIG)
 				error = EINVAL;
 			else
 				d->bd_sig = sig;
 			break;
 		}
 	case BIOCGRSIG:
 		*(u_int *)addr = d->bd_sig;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Install the filter program described by fp on descriptor d, replacing
  * and freeing whatever program was there before.  cmd selects the write
  * filter (BIOCSETWF) or, for any other value, the read filter.  A NULL
  * instruction pointer with a zero length removes the filter.  The
  * descriptor's buffers and counters are reset whenever the filter changes.
  *
  * Returns 0 on success and EINVAL for bogus requests: a NULL program with
  * a non-zero length, a program longer than bpf_maxinsns, a failed copyin,
  * or a program that does not validate.
  */
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
 {
 	struct bpf_insn *newprog, *oldprog;
 	u_int is_write, ninsns, nbytes;
 	int clearing;
 #ifdef BPF_JITTER
 	bpf_jit_filter *oldjit;
 #endif
 
 	/* Remember what is being replaced so it can be freed afterwards. */
 	is_write = (cmd == BIOCSETWF) ? 1 : 0;
 	oldprog = is_write ? d->bd_wfilter : d->bd_rfilter;
 #ifdef BPF_JITTER
 	oldjit = is_write ? NULL : d->bd_bfilter;
 #endif
 
 	clearing = (fp->bf_insns == NULL);
 	if (clearing) {
 		if (fp->bf_len != 0)
 			return (EINVAL);
 		newprog = NULL;
 		ninsns = 0;
 	} else {
 		ninsns = fp->bf_len;
 		if (ninsns > bpf_maxinsns)
 			return (EINVAL);
 
 		/* Fetch the program from userland and check it. */
 		nbytes = ninsns * sizeof(*fp->bf_insns);
 		newprog = (struct bpf_insn *)malloc(nbytes, M_BPF, M_WAITOK);
 		if (copyin((caddr_t)fp->bf_insns, (caddr_t)newprog,
 		    nbytes) != 0 || !bpf_validate(newprog, (int)ninsns)) {
 			free((caddr_t)newprog, M_BPF);
 			return (EINVAL);
 		}
 	}
 
 	/* Swap the new program in and flush the buffers under the lock. */
 	BPFD_LOCK(d);
 	if (is_write)
 		d->bd_wfilter = newprog;
 	else {
 		d->bd_rfilter = newprog;
 #ifdef BPF_JITTER
 		if (clearing)
 			d->bd_bfilter = NULL;
 		else
 			d->bd_bfilter = bpf_jitter(newprog, ninsns);
 #endif
 	}
 	reset_d(d);
 	BPFD_UNLOCK(d);
 
 	/* The old program is released only after it has been unhooked. */
 	if (oldprog != NULL)
 		free((caddr_t)oldprog, M_BPF);
 #ifdef BPF_JITTER
 	if (oldjit != NULL)
 		bpf_destroy_jit_filter(oldjit);
 #endif
 	return (0);
 }
 
 /*
  * Attach descriptor d to the interface named in ifr, detaching it first
  * from any other interface it is listening on.  Packet buffers are
  * allocated on first use, and the descriptor is always flushed, even when
  * it was already attached to the requested interface.
  * Returns 0, or ENXIO if the interface does not exist or has no bpf
  * attachment.
  */
 static int
 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 {
 	struct ifnet *ifp;
 	struct bpf_if *target;
 
 	ifp = ifunit(ifr->ifr_name);
 	if (ifp == NULL)
 		return (ENXIO);
 	target = ifp->if_bpf;
 	if (target == NULL)
 		return (ENXIO);
 
 	/* Allocate the packet buffers if we need to. */
 	if (d->bd_sbuf == NULL)
 		bpf_allocbufs(d);
 
 	/* Move over to the requested interface unless already there. */
 	if (d->bd_bif != target) {
 		if (d->bd_bif != NULL)
 			bpf_detachd(d);
 		bpf_attachd(d, target);
 	}
 
 	BPFD_LOCK(d);
 	reset_d(d);
 	BPFD_UNLOCK(d);
 
 	return (0);
 }
 
 /*
  * Support for select() and poll() system calls
  *
  * Return true iff the specific operation will not block indefinitely.
  * Otherwise, return false but make a note that a selwakeup() must be done.
  */
 static int
 bpfpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct bpf_d *d;
 	int revents;
 
 	d = dev->si_drv1;
 	if (d->bd_bif == NULL)
 		return (ENXIO);
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	revents = events & (POLLOUT | POLLWRNORM);
 	BPFD_LOCK(d);
 	d->bd_pid = td->td_proc->p_pid;
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (bpf_ready(d))
 			revents |= events & (POLLIN | POLLRDNORM);
 		else {
 			selrecord(td, &d->bd_sel);
 			/* Start the read timeout if necessary. */
 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 				callout_reset(&d->bd_callout, d->bd_rtout,
 				    bpf_timed_out, d);
 				d->bd_state = BPF_WAITING;
 			}
 		}
 	}
 	BPFD_UNLOCK(d);
 	return (revents);
 }
 
 /*
  * Support for kevent() system call.  Register EVFILT_READ filters and
  * reject all others.
  *
  * Returns 0 when the knote has been hooked onto the descriptor and 1 for
  * any filter other than EVFILT_READ.  Declared static to agree with the
  * forward declaration and with the other cdevsw entry points.
  */
 static int
 bpfkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct bpf_d *d = (struct bpf_d *)dev->si_drv1;
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (1);
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	BPFD_LOCK(d);
 	d->bd_pid = curthread->td_proc->p_pid;
 	kn->kn_fop = &bpfread_filtops;
 	kn->kn_hook = d;
 	/* The descriptor lock is already held, hence the last argument. */
 	knlist_add(&d->bd_sel.si_note, kn, 1);
 	BPFD_UNLOCK(d);
 
 	return (0);
 }
 
 /*
  * kqueue detach hook: take the knote off the note list of the descriptor
  * stored in its kn_hook.
  */
 static void
 filt_bpfdetach(struct knote *kn)
 {
 	struct bpf_d *owner;
 
 	owner = (struct bpf_d *)kn->kn_hook;
 	knlist_remove(&owner->bd_sel.si_note, kn, 0);
 }
 
 /*
  * kqueue event hook for EVFILT_READ.  Returns the result of bpf_ready().
  * When data is ready, kn_data is set to the number of buffered bytes
  * (store buffer plus any hold buffer); when it is not, the read timeout is
  * armed if one is configured and the descriptor is idle.  The descriptor
  * lock must be held by the caller.
  */
 static int
 filt_bpfread(struct knote *kn, long hint)
 {
 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 	int ready;
 
 	BPFD_LOCK_ASSERT(d);
 	ready = bpf_ready(d);
 	if (!ready) {
 		if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 			callout_reset(&d->bd_callout, d->bd_rtout,
 			    bpf_timed_out, d);
 			d->bd_state = BPF_WAITING;
 		}
 		return (ready);
 	}
 
 	kn->kn_data = d->bd_slen;
 	if (d->bd_hbuf)
 		kn->kn_data += d->bd_hlen;
 	return (ready);
 }
 
 /*
  * Tap point for drivers holding the packet in one contiguous buffer.
  * Every descriptor listening on bp runs its read filter over the pktlen
  * bytes at pkt; if the filter accepts the packet (non-zero snapshot
  * length) it is stored in that descriptor's buffer.  The timestamp is
  * taken at most once, on the first match.
  */
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 	struct timeval now;
 	struct bpf_d *d;
 	u_int keep;
 	int stamped;
 
 	stamped = 0;
 	BPFIF_LOCK(bp);
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		BPFD_LOCK(d);
 		d->bd_rcount++;
 #ifdef BPF_JITTER
 		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL)
 			keep = (*(d->bd_bfilter->func))(pkt, pktlen, pktlen);
 		else
 #endif
 		keep = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
 		if (keep == 0) {
 			/* Rejected by this listener's filter. */
 			BPFD_UNLOCK(d);
 			continue;
 		}
 
 		d->bd_fcount++;
 		if (!stamped) {
 			microtime(&now);
 			stamped = 1;
 		}
 #ifdef MAC
 		if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 			catchpacket(d, pkt, pktlen, keep, bcopy, &now);
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
 /*
  * Copy len bytes out of the mbuf chain starting at src_arg into the flat
  * buffer dst_arg, walking the chain via m_next.  Panics if the chain ends
  * before len bytes have been copied.  Derived from m_copydata in
  * sys/uipc_mbuf.c.
  */
 static void
 bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
 {
 	const struct mbuf *m;
 	u_char *out;
 	u_int chunk;
 
 	out = dst_arg;
 	for (m = src_arg; len > 0; m = m->m_next) {
 		if (m == NULL)
 			panic("bpf_mcopy");
 		chunk = min(m->m_len, len);
 		bcopy(mtod(m, void *), out, chunk);
 		out += chunk;
 		len -= chunk;
 	}
 }
 
 /*
  * Expands to a bare "if (...)": the statement written directly after the
  * macro invocation runs when mbuf m travels in a direction descriptor d
  * does not want.  A BPF_D_IN descriptor skips packets without a receive
  * interface (rcvif == NULL); a BPF_D_OUT descriptor skips packets that
  * have one.
  */
 #define	BPF_CHECK_DIRECTION(d, m) \
 	if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \
 	    ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL))
 
 /*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  */
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 	struct bpf_d *d;
 	u_int pktlen, slen;
 	int gottime;
 	struct timeval tv;
 
 	if (m->m_flags & M_SKIP_BPF) {
 		m->m_flags &= ~M_SKIP_BPF;
 		return;
 	}
 
 	gottime = 0;
 
 	pktlen = m_length(m, NULL);
 
 	BPFIF_LOCK(bp);
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		BPF_CHECK_DIRECTION(d, m)
 			continue;
 		BPFD_LOCK(d);
 		++d->bd_rcount;
 #ifdef BPF_JITTER
 		/* XXX We cannot handle multiple mbufs. */
 		if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL &&
 		    m->m_next == NULL)
 			slen = (*(d->bd_bfilter->func))(mtod(m, u_char *),
 			    pktlen, pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
 				microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
 				    bpf_mcopy, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
 /*
  * Tap point for drivers holding the packet in an mbuf chain that must be
  * presented with a separate contiguous header (data, dlen) in front of it.
  * A packet flagged M_SKIP_BPF is not tapped; the flag is cleared and the
  * function returns.  Otherwise the header is glued on by means of a
  * temporary on-stack mbuf and the combined packet is offered to every
  * listener on bp whose direction setting matches.
  */
 void
 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 {
 	struct timeval now;
 	struct mbuf hdr;
 	struct bpf_d *d;
 	u_int pktlen, keep;
 	int stamped;
 
 	if (m->m_flags & M_SKIP_BPF) {
 		m->m_flags &= ~M_SKIP_BPF;
 		return;
 	}
 
 	stamped = 0;
 
 	/*
 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
 	 * Note that we cut corners here; we only setup what's
 	 * absolutely needed--this mbuf should never go anywhere else.
 	 */
 	hdr.m_next = m;
 	hdr.m_data = data;
 	hdr.m_len = dlen;
 	pktlen = m_length(m, NULL) + dlen;
 
 	BPFIF_LOCK(bp);
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		BPF_CHECK_DIRECTION(d, m)
 			continue;
 		BPFD_LOCK(d);
 		d->bd_rcount++;
 		keep = bpf_filter(d->bd_rfilter, (u_char *)&hdr, pktlen, 0);
 		if (keep == 0) {
 			/* Rejected by this listener's filter. */
 			BPFD_UNLOCK(d);
 			continue;
 		}
 
 		d->bd_fcount++;
 		if (!stamped) {
 			microtime(&now);
 			stamped = 1;
 		}
 #ifdef MAC
 		if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 			catchpacket(d, (u_char *)&hdr, pktlen, keep,
 			    bpf_mcopy, &now);
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
 #undef	BPF_CHECK_DIRECTION
 
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
  * transfer.  bcopy is passed in to copy contiguous chunks, while
  * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
     void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
 {
 	struct bpf_hdr *hp;
 	int totlen, curlen;
 	int hdrlen = d->bd_bif->bif_hdrlen;
 	int do_wakeup = 0;
 
 	/* The caller must hold the descriptor lock. */
 	BPFD_LOCK_ASSERT(d);
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
 	 * much.  Otherwise, transfer the whole packet (unless
 	 * we hit the buffer size limit).
 	 */
 	totlen = hdrlen + min(snaplen, pktlen);
 	if (totlen > d->bd_bufsize)
 		totlen = d->bd_bufsize;
 
 	/*
 	 * Round up the end of the previous packet to the next longword.
 	 */
 	curlen = BPF_WORDALIGN(d->bd_slen);
 	if (curlen + totlen > d->bd_bufsize) {
 		/*
 		 * This packet will overflow the storage buffer.
 		 * Rotate the buffers if we can, then wakeup any
 		 * pending reads.
 		 */
 		if (d->bd_fbuf == NULL) {
 			/*
 			 * We haven't completed the previous read yet,
 			 * so drop the packet.
 			 */
 			++d->bd_dcount;
 			return;
 		}
 		ROTATE_BUFFERS(d);
 		do_wakeup = 1;
 		/* After rotation the record is written at offset 0. */
 		curlen = 0;
 	}
 	else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
 		 * Immediate mode is set, or the read timeout has
 		 * already expired during a select call.  A packet
 		 * arrived, so the reader should be woken up.
 		 */
 		do_wakeup = 1;
 
 	/*
 	 * Append the bpf header.
 	 */
 	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
 	hp->bh_tstamp = *tv;
 	/* bh_datalen is the full packet length; bh_caplen (below) is what was kept. */
 	hp->bh_datalen = pktlen;
 	hp->bh_hdrlen = hdrlen;
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
 	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
 	d->bd_slen = curlen + totlen;
 
 	/* The wakeup is issued only after the record is fully written. */
 	if (do_wakeup)
 		bpf_wakeup(d);
 }
 
 /*
  * Allocate the free and store buffers of a descriptor and reset the
  * store and hold buffer lengths.
  */
 static void
 bpf_allocbufs(struct bpf_d *d)
 {
 
 	KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
 	KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
 	KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
 
 	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
 	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
 	d->bd_slen = 0;
 	d->bd_hlen = 0;
 }
 
 /*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
 	/*
 	 * The hold and free buffers are only examined when a store buffer
 	 * exists; a descriptor without bd_sbuf is treated as owning none.
 	 */
 	if (d->bd_sbuf != NULL) {
 		free(d->bd_sbuf, M_BPF);
 		if (d->bd_hbuf != NULL)
 			free(d->bd_hbuf, M_BPF);
 		if (d->bd_fbuf != NULL)
 			free(d->bd_fbuf, M_BPF);
 	}
 	/* The bd_bfilter is destroyed only together with a read filter. */
 	if (d->bd_rfilter) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
 #ifdef BPF_JITTER
 		bpf_destroy_jit_filter(d->bd_bfilter);
 #endif
 	}
 	if (d->bd_wfilter)
 		free((caddr_t)d->bd_wfilter, M_BPF);
 	/* The descriptor mutex goes last; d itself is not freed here. */
 	mtx_destroy(&d->bd_mtx);
 }
 
 /*
  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
  * fixed size of the link header (variable length headers not yet supported).
  */
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	/* Convenience form: the bpf_if pointer is stored in ifp->if_bpf. */
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 /*
  * Attach an interface to bpf.  ifp is a pointer to the structure
  * defining the interface to be attached, dlt is the link layer type,
  * and hdrlen is the fixed size of the link header (variable length
  * headers are not yet supported).
  */
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 {
 	struct bpf_if *bp;
 
 	/* Allocation failure is treated as fatal. */
 	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
 	if (bp == NULL)
 		panic("bpfattach");
 
 	LIST_INIT(&bp->bif_dlist);
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
 
 	/*
 	 * Compute the length of the bpf header.  This is not necessarily
 	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
 	 * that the network layer header begins on a longword boundary (for
 	 * performance reasons and to alleviate alignment restrictions).
 	 *
 	 * This is done before bp is made reachable through *driverp or
 	 * bpf_iflist, so that nobody can observe the structure with a zero
 	 * bif_hdrlen.
 	 */
 	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
 
 	mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF);
 	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
 	*driverp = bp;
 
 	/* Publish the fully initialized interface on the global list. */
 	mtx_lock(&bpf_mtx);
 	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
 	mtx_unlock(&bpf_mtx);
 
 	if (bootverbose)
 		if_printf(ifp, "bpf attached\n");
 }
 
 /*
  * Detach bpf from an interface.  This involves detaching each descriptor
  * associated with the interface, and leaving bd_bif NULL.  Notify each
  * descriptor as it's detached so that any sleepers wake up and get
  * ENXIO.
  */
 void
 bpfdetach(struct ifnet *ifp)
 {
 	struct bpf_if	*bif;
 	struct bpf_d	*desc;
 
 	/* Find the first bpf interface record that belongs to ifp. */
 	mtx_lock(&bpf_mtx);
 	LIST_FOREACH(bif, &bpf_iflist, bif_next) {
 		if (bif->bif_ifp == ifp)
 			break;
 	}
 
 	/* Nothing to do if the interface was never attached. */
 	if (bif == NULL || bif->bif_ifp == NULL) {
 		mtx_unlock(&bpf_mtx);
 		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
 		return;
 	}
 
 	/* Unlink it so that no new descriptor can find it. */
 	LIST_REMOVE(bif, bif_next);
 	mtx_unlock(&bpf_mtx);
 
 	/* Detach the remaining descriptors one by one, waking each. */
 	for (;;) {
 		desc = LIST_FIRST(&bif->bif_dlist);
 		if (desc == NULL)
 			break;
 		bpf_detachd(desc);
 		BPFD_LOCK(desc);
 		bpf_wakeup(desc);
 		BPFD_UNLOCK(desc);
 	}
 
 	mtx_destroy(&bif->bif_mtx);
 	free(bif, M_BPF);
 }
 
 /*
  * Get a list of available data link type of the interface.
  */
 static int
 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
 {
 	int n, error;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 
 	/*
 	 * Count the bpf_if records that share d's ifnet and, if the caller
 	 * supplied bfl_list, copy each one's DLT out to it.  Returns ENOMEM
 	 * when bfl_list has fewer than the required bfl_len slots, the
 	 * copyout() error if a copy fails, and 0 otherwise.  Except on the
 	 * ENOMEM path, bfl_len is set to the number of entries processed.
 	 */
 	ifp = d->bd_bif->bif_ifp;
 	n = 0;
 	error = 0;
 	mtx_lock(&bpf_mtx);
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp != ifp)
 			continue;
 		if (bfl->bfl_list != NULL) {
 			if (n >= bfl->bfl_len) {
 				mtx_unlock(&bpf_mtx);
 				return (ENOMEM);
 			}
 			/*
 			 * NOTE(review): copyout() is called with bpf_mtx
 			 * held; confirm this is acceptable for this lock.
 			 */
 			error = copyout(&bp->bif_dlt,
 			    bfl->bfl_list + n, sizeof(u_int));
 			/*
 			 * Stop at the first failure so that a later
 			 * successful copy cannot overwrite the error.
 			 */
 			if (error != 0)
 				break;
 		}
 		n++;
 	}
 	mtx_unlock(&bpf_mtx);
 	bfl->bfl_len = n;
 	return (error);
 }
 
 /*
  * Set the data link type of a BPF instance.
  */
 static int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
 	struct bpf_if *target;
 	struct ifnet *ifp;
 	int had_promisc, rc;
 
 	/* Already using the requested link type: nothing to change. */
 	if (d->bd_bif->bif_dlt == dlt)
 		return (0);
 
 	/* Look for a record on the same ifnet with the requested DLT. */
 	ifp = d->bd_bif->bif_ifp;
 	mtx_lock(&bpf_mtx);
 	LIST_FOREACH(target, &bpf_iflist, bif_next) {
 		if (target->bif_ifp == ifp && target->bif_dlt == dlt)
 			break;
 	}
 	mtx_unlock(&bpf_mtx);
 	if (target == NULL)
 		return (EINVAL);
 
 	/* Move the descriptor over and discard whatever it had buffered. */
 	had_promisc = d->bd_promisc;
 	bpf_detachd(d);
 	bpf_attachd(d, target);
 	BPFD_LOCK(d);
 	reset_d(d);
 	BPFD_UNLOCK(d);
 
 	/* Re-request promiscuous mode if it was on before the move. */
 	if (had_promisc) {
 		rc = ifpromisc(target->bif_ifp, 1);
 		if (rc == 0)
 			d->bd_promisc = 1;
 		else
 			if_printf(target->bif_ifp,
 				"bpf_setdlt: ifpromisc failed (%d)\n",
 				rc);
 	}
 	return (0);
 }
 
 static void
 bpf_clone(void *arg, struct ucred *cred, char *name, int namelen,
     struct cdev **dev)
 {
 	int unit;
 
 	/*
 	 * Act only when no device has been supplied yet and the name
 	 * parses as "bpf" followed by a unit number; in that case create
 	 * the matching device node, take a reference on it and mark it as
 	 * a cheap clone.
 	 */
 	if (*dev == NULL && dev_stdclone(name, NULL, "bpf", &unit) == 1) {
 		*dev = make_dev(&bpf_cdevsw, unit2minor(unit), UID_ROOT,
 		    GID_WHEEL, 0600, "bpf%d", unit);
 		dev_ref(*dev);
 		(*dev)->si_flags |= SI_CHEAPCLONE;
 	}
 }
 
 /*
  * One-time bpf initialization: set up the global mutex and the (empty)
  * interface list, then hook bpf_clone into dev_clone events.  The
  * argument is unused.
  */
 static void
 bpf_drvinit(void *unused)
 {
 
 	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
 	LIST_INIT(&bpf_iflist);
 	EVENTHANDLER_REGISTER(dev_clone, bpf_clone, 0, 1000);
 }
 
 /*
  * Fill the export record d with a snapshot of descriptor bd.
  * The caller must hold bd's lock.
  */
 static void
 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 {
 
 	/* Start from all zeroes so fields not set below are cleared. */
 	bzero(d, sizeof(*d));
 	BPFD_LOCK_ASSERT(bd);
 	d->bd_immediate = bd->bd_immediate;
 	d->bd_promisc = bd->bd_promisc;
 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
 	d->bd_direction = bd->bd_direction;
 	d->bd_feedback = bd->bd_feedback;
 	d->bd_async = bd->bd_async;
 	d->bd_rcount = bd->bd_rcount;
 	d->bd_dcount = bd->bd_dcount;
 	d->bd_fcount = bd->bd_fcount;
 	d->bd_sig = bd->bd_sig;
 	d->bd_slen = bd->bd_slen;
 	d->bd_hlen = bd->bd_hlen;
 	d->bd_bufsize = bd->bd_bufsize;
 	d->bd_pid = bd->bd_pid;
 	/* bd->bd_bif is dereferenced unconditionally: bd must be attached. */
 	strlcpy(d->bd_ifname,
 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
 }
 
 /*
  * Sysctl handler that exports one struct xbpf_d per descriptor attached
  * to any bpf interface.  Requires the PRIV_NET_BPF privilege.
  */
 static int
 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct xbpf_d *xbdbuf, *xbd;
 	int index, error;
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	/*
 	 * XXX This is not technically correct. It is possible for non
 	 * privileged users to open bpf devices. It would make sense
 	 * if the users who opened the devices were able to retrieve
 	 * the statistics for them, too.
 	 */
 	error = priv_check(req->td, PRIV_NET_BPF);
 	if (error)
 		return (error);
 	/* Size probe: report the space needed without copying any data. */
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
 	if (bpf_bpfd_cnt == 0)
 		return (SYSCTL_OUT(req, 0, 0));
 	/*
 	 * The staging buffer is allocated before bpf_mtx is taken; the
 	 * size check against the descriptor count is then made under the
 	 * lock.
 	 */
 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
 	mtx_lock(&bpf_mtx);
 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
 		mtx_unlock(&bpf_mtx);
 		free(xbdbuf, M_BPF);
 		return (ENOMEM);
 	}
 	index = 0;
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		BPFIF_LOCK(bp);
 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			BPFD_LOCK(bd);
 			bpfstats_fill_xbpf(xbd, bd);
 			BPFD_UNLOCK(bd);
 		}
 		BPFIF_UNLOCK(bp);
 	}
 	mtx_unlock(&bpf_mtx);
 	/* Only the entries actually filled in are copied out. */
 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
 	free(xbdbuf, M_BPF);
 	return (error);
 }
 
 SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL)
 
 #else /* !DEV_BPF && !NETGRAPH_BPF */
 /*
  * NOP stubs to allow bpf-using drivers to load and function.
  *
  * A 'better' implementation would allow the core bpf functionality
  * to be loaded at runtime.
  */
 /* Placeholder that bpfattach2() hands out in place of a real bpf_if. */
 static struct bpf_if bp_null;
 
 /* Stub: the packet is ignored. */
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 }
 
 /* Stub: the mbuf chain is ignored. */
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 }
 
 /* Stub: the header and mbuf chain are ignored. */
 void
 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
 {
 }
 
 /* Same wrapper as the real implementation: attach through ifp->if_bpf. */
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 /* Stub: every caller receives a pointer to the shared bp_null. */
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 {
 
 	*driverp = &bp_null;
 }
 
 /* Stub: nothing was attached, so there is nothing to detach. */
 void
 bpfdetach(struct ifnet *ifp)
 {
 }
 
 /* Stub: returns -1 converted to u_int, i.e. the largest u_int value. */
 u_int
 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
 {
 	return -1;	/* "no filter" behaviour */
 }
 
 /* Stub: every filter program is reported as invalid. */
 int
 bpf_validate(const struct bpf_insn *f, int len)
 {
 	return 0;		/* false */
 }
 
 #endif /* !DEV_BPF && !NETGRAPH_BPF */
Index: head/sys/netgraph/netgraph.h
===================================================================
--- head/sys/netgraph/netgraph.h	(revision 171636)
+++ head/sys/netgraph/netgraph.h	(revision 171637)
@@ -1,1139 +1,1139 @@
 /*
  * netgraph.h
  */
 
 /*-
  * Copyright (c) 1996-1999 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Author: Julian Elischer <julian@freebsd.org>
  *
  * $FreeBSD$
  * $Whistle: netgraph.h,v 1.29 1999/11/01 07:56:13 julian Exp $
  */
 
 #ifndef _NETGRAPH_NETGRAPH_H_
 #define _NETGRAPH_NETGRAPH_H_
 
 #ifndef _KERNEL
 #error "This file should not be included in user level programs"
 #endif
 
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 
 #include "opt_netgraph.h"
 
 /* debugging options */
 #define NG_SEPARATE_MALLOC	/* make modules use their own malloc types */
 
 /*
  * This defines the in-kernel binary interface version.
  * It is possible to change this but leave the external message
  * API the same. Each type also has its own cookies for versioning as well.
  * Change it for NETGRAPH_DEBUG version so we cannot mix debug and non debug
  * modules.
  */
 #define _NG_ABI_VERSION 11
 #ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
 #define NG_ABI_VERSION	(_NG_ABI_VERSION + 0x10000)
 #else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 #define NG_ABI_VERSION	_NG_ABI_VERSION
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 
 /*
  * Forward references for the basic structures so we can
  * define the typedefs and use them in the structures themselves.
  */
 struct ng_hook ;
 struct ng_node ;
 struct ng_item ;
 typedef	struct ng_item *item_p;
 typedef struct ng_node *node_p;
 typedef struct ng_hook *hook_p;
 
 /* node method definitions */
 typedef	int	ng_constructor_t(node_p node);
 typedef	int	ng_close_t(node_p node);
 typedef	int	ng_shutdown_t(node_p node);
 typedef	int	ng_newhook_t(node_p node, hook_p hook, const char *name);
 typedef	hook_p	ng_findhook_t(node_p node, const char *name);
 typedef	int	ng_connect_t(hook_p hook);
 typedef	int	ng_rcvmsg_t(node_p node, item_p item, hook_p lasthook);
 typedef	int	ng_rcvdata_t(hook_p hook, item_p item);
 typedef	int	ng_disconnect_t(hook_p hook);
 typedef	int	ng_rcvitem (node_p node, hook_p hook, item_p item);
 
 /***********************************************************************
  ***************** Hook Structure and Methods **************************
  ***********************************************************************
  *
  * Structure of a hook
  */
 struct ng_hook {
 	char	hk_name[NG_HOOKSIZ];	/* what this node knows this link as */
 	void   *hk_private;		/* node dependent ID for this hook */
 	int	hk_flags;		/* info about this hook/link */
 	int	hk_refs;		/* don't actually free this till 0 */
 	int	hk_type;		/* tbd: hook data link type */
 	struct	ng_hook *hk_peer;	/* the other end of this link */
 	struct	ng_node *hk_node;	/* The node this hook is attached to */
 	LIST_ENTRY(ng_hook) hk_hooks;	/* linked list of all hooks on node */
 	ng_rcvmsg_t	*hk_rcvmsg;	/* control messages come here */
 	ng_rcvdata_t	*hk_rcvdata;	/* data comes here */
 #ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
 #define HK_MAGIC 0x78573011
 	int	hk_magic;	/* compared against HK_MAGIC by _chkhook() */
 	char	*lastfile;	/* file of the last checked access */
 	int	lastline;	/* line of the last checked access */
 	SLIST_ENTRY(ng_hook)	  hk_all;		/* all existing items */
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 };
 /* Flags for a hook */
 #define HK_INVALID		0x0001	/* don't trust it! */
 #define HK_QUEUE		0x0002	/* queue for later delivery */
 #define HK_FORCE_WRITER		0x0004	/* Incoming data queued as a writer */
 #define HK_DEAD			0x0008	/* This is the dead hook.. don't free */
 
 /*
  * Public Methods for hook
  * If you can't do it with these you probably shouldn't be doing it.
  */
 void ng_unref_hook(hook_p hook); /* don't move this */
 /*
  * Raw hook accessors.  Every macro parenthesizes its arguments so that an
  * expression such as "base + 1" can be passed as the hook.  The SET and
  * FORCE macros are statements (do { } while (0)); the others are
  * expressions.
  */
 #define	_NG_HOOK_REF(hook)	atomic_add_int(&(hook)->hk_refs, 1)
 #define _NG_HOOK_NAME(hook)	((hook)->hk_name)
 #define _NG_HOOK_UNREF(hook)	ng_unref_hook(hook)
 #define	_NG_HOOK_SET_PRIVATE(hook, val)	do {(hook)->hk_private = (val);} while (0)
 #define	_NG_HOOK_SET_RCVMSG(hook, val)	do {(hook)->hk_rcvmsg = (val);} while (0)
 #define	_NG_HOOK_SET_RCVDATA(hook, val)	do {(hook)->hk_rcvdata = (val);} while (0)
 #define	_NG_HOOK_PRIVATE(hook)	((hook)->hk_private)
 #define _NG_HOOK_NOT_VALID(hook)	((hook)->hk_flags & HK_INVALID)
 #define _NG_HOOK_IS_VALID(hook)	(!((hook)->hk_flags & HK_INVALID))
 #define _NG_HOOK_NODE(hook)	((hook)->hk_node) /* only rvalue! */
 #define _NG_HOOK_PEER(hook)	((hook)->hk_peer) /* only rvalue! */
 #define _NG_HOOK_FORCE_WRITER(hook)				\
 		do { (hook)->hk_flags |= HK_FORCE_WRITER; } while (0)
 #define _NG_HOOK_FORCE_QUEUE(hook) do { (hook)->hk_flags |= HK_QUEUE; } while (0)
 
 /* Some shortcuts */
 #define NG_PEER_NODE(hook)	NG_HOOK_NODE(NG_HOOK_PEER(hook))
 #define NG_PEER_HOOK_NAME(hook)	NG_HOOK_NAME(NG_HOOK_PEER(hook))
 #define NG_PEER_NODE_NAME(hook)	NG_NODE_NAME(NG_PEER_NODE(hook))
 
 #ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
 #define _NN_ __FILE__,__LINE__
 void	dumphook (hook_p hook, char *file, int line);
 static __inline void	_chkhook(hook_p hook, char *file, int line);
 static __inline void	_ng_hook_ref(hook_p hook, char * file, int line);
 static __inline char *	_ng_hook_name(hook_p hook, char * file, int line);
 static __inline void	_ng_hook_unref(hook_p hook, char * file, int line);
 static __inline void	_ng_hook_set_private(hook_p hook,
 				void * val, char * file, int line);
 static __inline void	_ng_hook_set_rcvmsg(hook_p hook,
 				ng_rcvmsg_t *val, char * file, int line);
 static __inline void	_ng_hook_set_rcvdata(hook_p hook,
 				ng_rcvdata_t *val, char * file, int line);
 static __inline void *	_ng_hook_private(hook_p hook, char * file, int line);
 static __inline int	_ng_hook_not_valid(hook_p hook, char * file, int line);
 static __inline int	_ng_hook_is_valid(hook_p hook, char * file, int line);
 static __inline node_p	_ng_hook_node(hook_p hook, char * file, int line);
 static __inline hook_p	_ng_hook_peer(hook_p hook, char * file, int line);
 static __inline void	_ng_hook_force_writer(hook_p hook, char * file,
 					int line);
 static __inline void	_ng_hook_force_queue(hook_p hook, char * file, int line);
 
 /*
  * Debug check run before every hook access made through the NG_HOOK_*
  * macros.  If the hook's magic number is wrong it prints a warning and
  * dumps the hook; it then continues regardless and records the caller's
  * file and line in the hook.
  */
 static __inline void
 _chkhook(hook_p hook, char *file, int line)
 {
 	if (hook->hk_magic != HK_MAGIC) {
 		printf("Accessing freed hook ");
 		dumphook(hook, file, line);
 	}
 	hook->lastline = line;
 	hook->lastfile = file;
 }
 
 /*
  * Checked hook accessors for NETGRAPH_DEBUG builds.  Each one calls
  * _chkhook() with the caller's file and line and then applies the raw
  * _NG_HOOK_* macro of the same name.
  */
 
 /* Checked _NG_HOOK_REF. */
 static __inline void
 _ng_hook_ref(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_REF(hook);
 }
 
 /* Checked _NG_HOOK_NAME; returns the hook's name. */
 static __inline char *
 _ng_hook_name(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	return (_NG_HOOK_NAME(hook));
 }
 
 /* Checked _NG_HOOK_UNREF. */
 static __inline void
 _ng_hook_unref(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_UNREF(hook);
 }
 
 /* Checked _NG_HOOK_SET_PRIVATE; stores val in hk_private. */
 static __inline void
 _ng_hook_set_private(hook_p hook, void *val, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_SET_PRIVATE(hook, val);
 }
 
 /* Checked _NG_HOOK_SET_RCVMSG; stores val in hk_rcvmsg. */
 static __inline void
 _ng_hook_set_rcvmsg(hook_p hook, ng_rcvmsg_t *val, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_SET_RCVMSG(hook, val);
 }
 
 /* Checked _NG_HOOK_SET_RCVDATA; stores val in hk_rcvdata. */
 static __inline void
 _ng_hook_set_rcvdata(hook_p hook, ng_rcvdata_t *val, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_SET_RCVDATA(hook, val);
 }
 
 /* Checked _NG_HOOK_PRIVATE; returns hk_private. */
 static __inline void *
 _ng_hook_private(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	return (_NG_HOOK_PRIVATE(hook));
 }
 
 /* Checked _NG_HOOK_NOT_VALID; non-zero when HK_INVALID is set. */
 static __inline int
 _ng_hook_not_valid(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	return (_NG_HOOK_NOT_VALID(hook));
 }
 
 /* Checked _NG_HOOK_IS_VALID; non-zero when HK_INVALID is clear. */
 static __inline int
 _ng_hook_is_valid(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	return (_NG_HOOK_IS_VALID(hook));
 }
 
 /* Checked _NG_HOOK_NODE; returns hk_node. */
 static __inline node_p
 _ng_hook_node(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	return (_NG_HOOK_NODE(hook));
 }
 
 /* Checked _NG_HOOK_PEER; returns hk_peer. */
 static __inline hook_p
 _ng_hook_peer(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	return (_NG_HOOK_PEER(hook));
 }
 
 /* Checked _NG_HOOK_FORCE_WRITER; sets HK_FORCE_WRITER. */
 static __inline void
 _ng_hook_force_writer(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_FORCE_WRITER(hook);
 }
 
 /* Checked _NG_HOOK_FORCE_QUEUE; sets HK_QUEUE. */
 static __inline void
 _ng_hook_force_queue(hook_p hook, char * file, int line)
 {
 	_chkhook(hook, file, line);
 	_NG_HOOK_FORCE_QUEUE(hook);
 }
 
 
 #define	NG_HOOK_REF(hook)		_ng_hook_ref(hook, _NN_)
 #define NG_HOOK_NAME(hook)		_ng_hook_name(hook, _NN_)
 #define NG_HOOK_UNREF(hook)		_ng_hook_unref(hook, _NN_)
 #define	NG_HOOK_SET_PRIVATE(hook, val)	_ng_hook_set_private(hook, val, _NN_)
 #define	NG_HOOK_SET_RCVMSG(hook, val)	_ng_hook_set_rcvmsg(hook, val, _NN_)
 #define	NG_HOOK_SET_RCVDATA(hook, val)	_ng_hook_set_rcvdata(hook, val, _NN_)
 #define	NG_HOOK_PRIVATE(hook)		_ng_hook_private(hook, _NN_)
 #define NG_HOOK_NOT_VALID(hook)		_ng_hook_not_valid(hook, _NN_)
 #define NG_HOOK_IS_VALID(hook)		_ng_hook_is_valid(hook, _NN_)
 #define NG_HOOK_NODE(hook)		_ng_hook_node(hook, _NN_)
 #define NG_HOOK_PEER(hook)		_ng_hook_peer(hook, _NN_)
 #define NG_HOOK_FORCE_WRITER(hook)	_ng_hook_force_writer(hook, _NN_)
 #define NG_HOOK_FORCE_QUEUE(hook)	_ng_hook_force_queue(hook, _NN_)
 
 #else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 #define	NG_HOOK_REF(hook)		_NG_HOOK_REF(hook)
 #define NG_HOOK_NAME(hook)		_NG_HOOK_NAME(hook)
 #define NG_HOOK_UNREF(hook)		_NG_HOOK_UNREF(hook)
 #define	NG_HOOK_SET_PRIVATE(hook, val)	_NG_HOOK_SET_PRIVATE(hook, val)
 #define	NG_HOOK_SET_RCVMSG(hook, val)	_NG_HOOK_SET_RCVMSG(hook, val)
 #define	NG_HOOK_SET_RCVDATA(hook, val)	_NG_HOOK_SET_RCVDATA(hook, val)
 #define	NG_HOOK_PRIVATE(hook)		_NG_HOOK_PRIVATE(hook)
 #define NG_HOOK_NOT_VALID(hook)		_NG_HOOK_NOT_VALID(hook)
 #define NG_HOOK_IS_VALID(hook)		_NG_HOOK_IS_VALID(hook)
 #define NG_HOOK_NODE(hook)		_NG_HOOK_NODE(hook)
 #define NG_HOOK_PEER(hook)		_NG_HOOK_PEER(hook)
 #define NG_HOOK_FORCE_WRITER(hook)	_NG_HOOK_FORCE_WRITER(hook)
 #define NG_HOOK_FORCE_QUEUE(hook)	_NG_HOOK_FORCE_QUEUE(hook)
 
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 /***********************************************************************
  ***************** Node Structure and Methods **************************
  ***********************************************************************
  * Structure of a node
  * including the embedded queue structure.
  *
  * The structure for queueing Netgraph request items
  * embedded in the node structure
  */
 struct ng_queue {
 	u_long		q_flags;
 	struct mtx	q_mtx;
 	item_p queue;	/* presumably the first queued item -- TODO confirm */
 	item_p *last;	/* presumably the tail link for appending -- TODO confirm */
 	struct ng_node *q_node;		/* find the front of the node.. */
 };
 
 struct ng_node {
 	char	nd_name[NG_NODESIZ];	/* optional globally unique name */
 	struct	ng_type *nd_type;	/* the installed 'type' */
 	int	nd_flags;		/* see below for bit definitions */
 	int	nd_refs;		/* # of references to this node */
 	int	nd_numhooks;		/* number of hooks */
 	void   *nd_private;		/* node type dependent node ID */
 	ng_ID_t	nd_ID;			/* Unique per node */
 	LIST_HEAD(hooks, ng_hook) nd_hooks;	/* linked list of node hooks */
 	LIST_ENTRY(ng_node)	  nd_nodes;	/* linked list of all nodes */
 	LIST_ENTRY(ng_node)	  nd_idnodes;	/* ID hash collision list */
 	TAILQ_ENTRY(ng_node)	  nd_work;	/* nodes with work to do */
 	struct	ng_queue	  nd_input_queue; /* input queue for locking */
 #ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
 #define ND_MAGIC 0x59264837
 	int	nd_magic;	/* compared against ND_MAGIC by _chknode() */
 	char	*lastfile;	/* file of the last checked access */
 	int	lastline;	/* line of the last checked access */
 	SLIST_ENTRY(ng_node)	  nd_all;	/* all existing nodes */
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 };
 
 /* Flags for a node */
 #define NGF_INVALID	0x00000001	/* free when refs go to 0 */
 #define NG_INVALID	NGF_INVALID	/* compat for old code */
 #define NGF_WORKQ	0x00000002	/* node is on the work queue */
 #define NG_WORKQ	NGF_WORKQ	/* compat for old code */
 #define NGF_FORCE_WRITER	0x00000004	/* Never multithread this node */
 #define NG_FORCE_WRITER	NGF_FORCE_WRITER /* compat for old code */
 #define NGF_CLOSING	0x00000008	/* ng_rmnode() at work */
 #define NG_CLOSING	NGF_CLOSING	/* compat for old code */
 #define NGF_REALLY_DIE	0x00000010	/* "persistent" node is unloading */
 #define NG_REALLY_DIE	NGF_REALLY_DIE	/* compat for old code */
 #define NGF_TYPE1	0x10000000	/* reserved for type specific storage */
 #define NGF_TYPE2	0x20000000	/* reserved for type specific storage */
 #define NGF_TYPE3	0x40000000	/* reserved for type specific storage */
 #define NGF_TYPE4	0x80000000	/* reserved for type specific storage */
 
 /*
  * Public methods for nodes.
  * If you can't do it with these you probably shouldn't be doing it.
  */
 int	ng_unref_node(node_p node); /* don't move this */
 /*
  * Raw node accessors.  Every macro parenthesizes its arguments so that an
  * expression such as "base + 1" can be passed as the node.  The "+ 0"
  * forms yield values that cannot be assigned to.  The SET, FORCE,
  * REALLY_DIE and REVIVE macros are statements (do { } while (0)).
  */
 #define _NG_NODE_NAME(node)	((node)->nd_name + 0)
 #define _NG_NODE_HAS_NAME(node)	((node)->nd_name[0] + 0)
 #define _NG_NODE_ID(node)	((node)->nd_ID + 0)
 #define	_NG_NODE_REF(node)	atomic_add_int(&(node)->nd_refs, 1)
 #define	_NG_NODE_UNREF(node)	ng_unref_node(node)
 #define	_NG_NODE_SET_PRIVATE(node, val)	do {(node)->nd_private = (val);} while (0)
 #define	_NG_NODE_PRIVATE(node)	((node)->nd_private)
 #define _NG_NODE_IS_VALID(node)	(!((node)->nd_flags & NGF_INVALID))
 #define _NG_NODE_NOT_VALID(node)	((node)->nd_flags & NGF_INVALID)
 #define _NG_NODE_NUMHOOKS(node)	((node)->nd_numhooks + 0) /* rvalue */
 #define _NG_NODE_FORCE_WRITER(node)					\
 	do { (node)->nd_flags |= NGF_FORCE_WRITER; } while (0)
 #define _NG_NODE_REALLY_DIE(node)					\
 	do { (node)->nd_flags |= (NGF_REALLY_DIE|NGF_INVALID); } while (0)
 #define _NG_NODE_REVIVE(node) \
 	do { (node)->nd_flags &= ~NGF_INVALID; } while (0)
 /*
  * The hook iterator.
  * This macro will call a function of type ng_fn_eachhook for each
  * hook attached to the node. If the function returns 0, then the
  * iterator will stop and return a pointer to the hook that returned 0.
  */
 /* Callback type for the hook iterator; returning 0 stops the iteration. */
 typedef	int	ng_fn_eachhook(hook_p hook, void* arg);
 /*
  * rethook is set to NULL first, so it stays NULL when no callback
  * returns 0 (including when the node has no hooks).  The loop variable
  * is a block-local named _hook.
  */
 #define _NG_NODE_FOREACH_HOOK(node, fn, arg, rethook)			\
 	do {								\
 		hook_p _hook;						\
 		(rethook) = NULL;					\
 		LIST_FOREACH(_hook, &((node)->nd_hooks), hk_hooks) {	\
 			if ((fn)(_hook, arg) == 0) {			\
 				(rethook) = _hook;			\
 				break;					\
 			}						\
 		}							\
 	} while (0)
 
 #ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
 /*
  * Forward declarations for the NETGRAPH_DEBUG node helpers: dumpnode()
  * is external, the rest are the checked accessors behind the NG_NODE_*
  * macros.
  */
 void	dumpnode(node_p node, char *file, int line);
 static __inline void _chknode(node_p node, char *file, int line);
 static __inline char * _ng_node_name(node_p node, char *file, int line);
 static __inline int _ng_node_has_name(node_p node, char *file, int line);
 static __inline ng_ID_t _ng_node_id(node_p node, char *file, int line);
 static __inline void _ng_node_ref(node_p node, char *file, int line);
 static __inline int _ng_node_unref(node_p node, char *file, int line);
 static __inline void _ng_node_set_private(node_p node, void * val,
 							char *file, int line);
 static __inline void * _ng_node_private(node_p node, char *file, int line);
 static __inline int _ng_node_is_valid(node_p node, char *file, int line);
 static __inline int _ng_node_not_valid(node_p node, char *file, int line);
 static __inline int _ng_node_numhooks(node_p node, char *file, int line);
 static __inline void _ng_node_force_writer(node_p node, char *file, int line);
 /* Was the only checked accessor without a forward declaration. */
 static __inline void _ng_node_really_die(node_p node, char *file, int line);
 static __inline hook_p _ng_node_foreach_hook(node_p node,
 			ng_fn_eachhook *fn, void *arg, char *file, int line);
 static __inline void _ng_node_revive(node_p node, char *file, int line);
 
 /*
  * Debug check run before every node access made through the NG_NODE_*
  * macros.  If the node's magic number is wrong it prints a warning and
  * dumps the node; it then continues regardless and records the caller's
  * file and line in the node.
  */
 static __inline void
 _chknode(node_p node, char *file, int line)
 {
 	if (node->nd_magic != ND_MAGIC) {
 		printf("Accessing freed node ");
 		dumpnode(node, file, line);
 	}
 	node->lastline = line;
 	node->lastfile = file;
 }
 
 /*
  * Checked node accessors for NETGRAPH_DEBUG builds.  Each one calls
  * _chknode() with the caller's file and line and then applies the raw
  * _NG_NODE_* macro of the same name.
  */
 
 /* Checked _NG_NODE_NAME; returns the node's name. */
 static __inline char *
 _ng_node_name(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return(_NG_NODE_NAME(node));
 }
 
 /* Checked _NG_NODE_HAS_NAME; returns the first character of the name. */
 static __inline int
 _ng_node_has_name(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return(_NG_NODE_HAS_NAME(node));
 }
 
 /* Checked _NG_NODE_ID; returns nd_ID. */
 static __inline ng_ID_t
 _ng_node_id(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return(_NG_NODE_ID(node));
 }
 
 /* Checked _NG_NODE_REF. */
 static __inline void
 _ng_node_ref(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	_NG_NODE_REF(node);
 }
 
 /* Checked _NG_NODE_UNREF; passes back ng_unref_node()'s result. */
 static __inline int
 _ng_node_unref(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return (_NG_NODE_UNREF(node));
 }
 
 /* Checked _NG_NODE_SET_PRIVATE; stores val in nd_private. */
 static __inline void
 _ng_node_set_private(node_p node, void * val, char *file, int line)
 {
 	_chknode(node, file, line);
 	_NG_NODE_SET_PRIVATE(node, val);
 }
 
 /* Checked _NG_NODE_PRIVATE; returns nd_private. */
 static __inline void *
 _ng_node_private(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return (_NG_NODE_PRIVATE(node));
 }
 
 /* Checked _NG_NODE_IS_VALID; non-zero when NGF_INVALID is clear. */
 static __inline int
 _ng_node_is_valid(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return(_NG_NODE_IS_VALID(node));
 }
 
 /* Checked _NG_NODE_NOT_VALID; non-zero when NGF_INVALID is set. */
 static __inline int
 _ng_node_not_valid(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return(_NG_NODE_NOT_VALID(node));
 }
 
 /* Checked _NG_NODE_NUMHOOKS; returns nd_numhooks. */
 static __inline int
 _ng_node_numhooks(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	return(_NG_NODE_NUMHOOKS(node));
 }
 
 /* Checked _NG_NODE_FORCE_WRITER; sets NGF_FORCE_WRITER. */
 static __inline void
 _ng_node_force_writer(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	_NG_NODE_FORCE_WRITER(node);
 }
 
 /* Checked _NG_NODE_REALLY_DIE; sets NGF_REALLY_DIE and NGF_INVALID. */
 static __inline void
 _ng_node_really_die(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	_NG_NODE_REALLY_DIE(node);
 }
 
 /* Checked _NG_NODE_REVIVE; clears NGF_INVALID. */
 static __inline void
 _ng_node_revive(node_p node, char *file, int line)
 {
 	_chknode(node, file, line);
 	_NG_NODE_REVIVE(node);
 }
 
 /*
  * Checked _NG_NODE_FOREACH_HOOK; returns the hook for which fn returned
  * 0, or NULL if there was none.
  */
 static __inline hook_p
 _ng_node_foreach_hook(node_p node, ng_fn_eachhook *fn, void *arg,
 						char *file, int line)
 {
 	hook_p hook;
 	_chknode(node, file, line);
 	_NG_NODE_FOREACH_HOOK(node, fn, arg, hook);
 	return (hook);
 }
 
 #define NG_NODE_NAME(node)		_ng_node_name(node, _NN_)	
 #define NG_NODE_HAS_NAME(node)		_ng_node_has_name(node, _NN_)	
 #define NG_NODE_ID(node)		_ng_node_id(node, _NN_)
 #define NG_NODE_REF(node)		_ng_node_ref(node, _NN_)
 #define	NG_NODE_UNREF(node)		_ng_node_unref(node, _NN_)
 #define	NG_NODE_SET_PRIVATE(node, val)	_ng_node_set_private(node, val, _NN_)
 #define	NG_NODE_PRIVATE(node)		_ng_node_private(node, _NN_)
 #define NG_NODE_IS_VALID(node)		_ng_node_is_valid(node, _NN_)
 #define NG_NODE_NOT_VALID(node)		_ng_node_not_valid(node, _NN_)
 #define NG_NODE_FORCE_WRITER(node) 	_ng_node_force_writer(node, _NN_)
 #define NG_NODE_REALLY_DIE(node) 	_ng_node_really_die(node, _NN_)
 #define NG_NODE_NUMHOOKS(node)		_ng_node_numhooks(node, _NN_)
 #define NG_NODE_REVIVE(node)		_ng_node_revive(node, _NN_)
 /*
  * Debug build: route the iteration through _ng_node_foreach_hook() so the
  * node is checked first.  The result is assigned to rethook, which must be
  * an lvalue.  Every argument is parenthesized so that the (void *) cast
  * applies to the whole arg expression rather than to its first operand.
  */
 #define NG_NODE_FOREACH_HOOK(node, fn, arg, rethook)			      \
 	do {								      \
 		(rethook) = _ng_node_foreach_hook((node), (fn),		      \
 		    (void *)(arg), _NN_);				      \
 	} while (0)
 
 #else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 #define NG_NODE_NAME(node)		_NG_NODE_NAME(node)	
 #define NG_NODE_HAS_NAME(node)		_NG_NODE_HAS_NAME(node)	
 #define NG_NODE_ID(node)		_NG_NODE_ID(node)	
 #define	NG_NODE_REF(node)		_NG_NODE_REF(node)	
 #define	NG_NODE_UNREF(node)		_NG_NODE_UNREF(node)	
 #define	NG_NODE_SET_PRIVATE(node, val)	_NG_NODE_SET_PRIVATE(node, val)	
 #define	NG_NODE_PRIVATE(node)		_NG_NODE_PRIVATE(node)	
 #define NG_NODE_IS_VALID(node)		_NG_NODE_IS_VALID(node)	
 #define NG_NODE_NOT_VALID(node)		_NG_NODE_NOT_VALID(node)	
 #define NG_NODE_FORCE_WRITER(node) 	_NG_NODE_FORCE_WRITER(node)
 #define NG_NODE_REALLY_DIE(node) 	_NG_NODE_REALLY_DIE(node)
 #define NG_NODE_NUMHOOKS(node)		_NG_NODE_NUMHOOKS(node)	
 #define NG_NODE_REVIVE(node)		_NG_NODE_REVIVE(node)
 #define NG_NODE_FOREACH_HOOK(node, fn, arg, rethook)			\
 		_NG_NODE_FOREACH_HOOK(node, fn, arg, rethook)
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 /***********************************************************************
  ************* Node Queue and Item Structures and Methods **************
  ***********************************************************************
  *
  */
 typedef	void	ng_item_fn(node_p node, hook_p hook, void *arg1, int arg2);
 typedef	void	ng_apply_t(void *context, int error);
 /*
  * One queueable unit of work.  The body union carries exactly one of:
  * an mbuf (_NGI_M), a control message with its return address
  * (_NGI_MSG / _NGI_RETADDR), or a function pointer with two arguments
  * (_NGI_FN / _NGI_ARG1 / _NGI_ARG2).
  */
 struct ng_item {
 	u_long	el_flags;	/* NGQF_* bits; see NGI_SET_READER() et al. */
 	item_p	el_next;	/* presumably queue linkage -- confirm */
 	node_p	el_dest; /* The node it will be applied against (or NULL) */
 	hook_p	el_hook; /* Entering hook. Optional in Control messages */
 	union {
 		struct mbuf	*da_m;		/* data item payload */
 		struct {
 			struct ng_mesg	*msg_msg;	/* the message */
 			ng_ID_t		msg_retaddr;	/* where replies go */
 		} msg;
 		struct {
 			ng_item_fn	*fn_fn;		/* function to call */
 			void 		*fn_arg1;	/* passed as arg1 */
 			int		fn_arg2;	/* passed as arg2 */
 		} fn;
 	} body;
 	/*
 	 * Optional callback called when item is being applied,
 	 * and its context.
 	 */
 	ng_apply_t	*apply;
 	void		*context;
 #ifdef	NETGRAPH_DEBUG /*----------------------------------------------*/
 	char *lastfile;		/* recorded by _ngi_check() / SAVE_LINE() */
 	int  lastline;		/* recorded by _ngi_check() / SAVE_LINE() */
 	TAILQ_ENTRY(ng_item)	  all;		/* all existing items */
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 };
 
 #define NGQF_TYPE	0x03		/* MASK of content definition */
 #define NGQF_MESG	0x00		/* the queue element is a message */
 #define NGQF_DATA	0x01		/* the queue element is data */
 #define NGQF_FN		0x02		/* the queue element is a function */
 #define NGQF_UNDEF	0x03		/* UNDEFINED */
 
 #define NGQF_RW		0x04		/* MASK for wanted queue mode */
 #define NGQF_READER	0x04		/* wants to be a reader */
 #define NGQF_WRITER	0x00		/* wants to be a writer */
 
 #define NGQF_QMODE	0x08		/* MASK for how it was queued */
 #define NGQF_QREADER	0x08		/* was queued as a reader */
 #define NGQF_QWRITER	0x00		/* was queued as a writer */
 
 /*
  * Get the mbuf (etc) out of an item.
  * Sets the value in the item to NULL in case we need to call NG_FREE_ITEM()
  * with it, (to avoid freeing the things twice).
  * If you don't want to zero out the item then realise that the
  * item still owns it.
  * Retaddr is different. There are no references on that. It's just a number.
  * The debug versions must be either all used everywhere or not at all.
  */
 
 #define _NGI_M(i) ((i)->body.da_m)
 #define _NGI_MSG(i) ((i)->body.msg.msg_msg)
 #define _NGI_RETADDR(i) ((i)->body.msg.msg_retaddr)
 #define	_NGI_FN(i) ((i)->body.fn.fn_fn)
 #define	_NGI_ARG1(i) ((i)->body.fn.fn_arg1)
 #define	_NGI_ARG2(i) ((i)->body.fn.fn_arg2)
 #define	_NGI_NODE(i) ((i)->el_dest)
 #define	_NGI_HOOK(i) ((i)->el_hook)
 /* Store h in the item's hook slot and NULL the caller's h (an lvalue). */
 #define	_NGI_SET_HOOK(i,h) do { _NGI_HOOK(i) = (h); (h) = NULL;} while (0)
 /*
  * If the item has a hook, pass it to _NG_HOOK_UNREF() and clear the slot.
  * The expansion declares a local named _hook, so the argument expression
  * must not itself refer to a variable of that name.
  */
 #define	_NGI_CLR_HOOK(i)   do {						\
 		hook_p _hook = _NGI_HOOK(i);				\
 		if (_hook) {						\
 			_NG_HOOK_UNREF(_hook);				\
 			_NGI_HOOK(i) = NULL;				\
 		}							\
 	} while (0)
 /* Store n in the item's node slot and NULL the caller's n (an lvalue). */
 #define	_NGI_SET_NODE(i,n) do { _NGI_NODE(i) = (n); (n) = NULL;} while (0)
 /*
  * If the item has a destination node, pass it to _NG_NODE_UNREF() and
  * clear the slot.  The expansion declares a local named _node, so the
  * argument expression must not itself refer to a variable of that name.
  */
 #define	_NGI_CLR_NODE(i)   do {						\
 		node_p _node = _NGI_NODE(i);				\
 		if (_node) {						\
 			_NG_NODE_UNREF(_node);				\
 			_NGI_NODE(i) = NULL;				\
 		}							\
 	} while (0)
 
 #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
 void				dumpitem(item_p item, char *file, int line);
 static __inline void		_ngi_check(item_p item, char *file, int line) ;
 static __inline struct mbuf **	_ngi_m(item_p item, char *file, int line) ;
 static __inline ng_ID_t *	_ngi_retaddr(item_p item, char *file, int line);
 static __inline struct ng_mesg ** _ngi_msg(item_p item, char *file, int line) ;
 static __inline ng_item_fn **	_ngi_fn(item_p item, char *file, int line) ;
 static __inline void **		_ngi_arg1(item_p item, char *file, int line) ;
 static __inline int *		_ngi_arg2(item_p item, char *file, int line) ;
 static __inline node_p		_ngi_node(item_p item, char *file, int line);
 static __inline hook_p		_ngi_hook(item_p item, char *file, int line);
 
 /*
  * Remember in the item which source file and line touched it last.
  */
 static __inline void
 _ngi_check(item_p item, char *file, int line)
 {
 	item->lastfile = file;
 	item->lastline = line;
 }
 
 /*
  * Record the caller's file/line in the item, then return the address of
  * its mbuf slot so NGI_M() can be used as an lvalue.
  */
 static __inline struct mbuf **
 _ngi_m(item_p item, char *file, int line)
 {
 	struct mbuf **slot;
 
 	_ngi_check(item, file, line);
 	slot = &_NGI_M(item);
 	return (slot);
 }
 
 /*
  * Record the caller's file/line in the item, then return the address of
  * its message slot so NGI_MSG() can be used as an lvalue.
  */
 static __inline struct ng_mesg **
 _ngi_msg(item_p item, char *file, int line)
 {
 	struct ng_mesg **slot;
 
 	_ngi_check(item, file, line);
 	slot = &_NGI_MSG(item);
 	return (slot);
 }
 
 /*
  * Record the caller's file/line in the item, then return the address of
  * its return-address slot so NGI_RETADDR() can be used as an lvalue.
  */
 static __inline ng_ID_t *
 _ngi_retaddr(item_p item, char *file, int line)
 {
 	ng_ID_t *slot;
 
 	_ngi_check(item, file, line);
 	slot = &_NGI_RETADDR(item);
 	return (slot);
 }
 
 /*
  * Record the caller's file/line in the item, then return the address of
  * its function-pointer slot so NGI_FN() can be used as an lvalue.
  */
 static __inline ng_item_fn **
 _ngi_fn(item_p item, char *file, int line)
 {
 	ng_item_fn **slot;
 
 	_ngi_check(item, file, line);
 	slot = &_NGI_FN(item);
 	return (slot);
 }
 
 /*
  * Record the caller's file/line in the item, then return the address of
  * its first function-argument slot so NGI_ARG1() can be used as an lvalue.
  */
 static __inline void **
 _ngi_arg1(item_p item, char *file, int line)
 {
 	void **slot;
 
 	_ngi_check(item, file, line);
 	slot = &_NGI_ARG1(item);
 	return (slot);
 }
 
 /*
  * Record the caller's file/line in the item, then return the address of
  * its second function-argument slot so NGI_ARG2() can be used as an lvalue.
  */
 static __inline int *
 _ngi_arg2(item_p item, char *file, int line)
 {
 	int *slot;
 
 	_ngi_check(item, file, line);
 	slot = &_NGI_ARG2(item);
 	return (slot);
 }
 
 /*
  * Record the caller's file/line in the item, then return its destination
  * node by value (unlike the slot accessors, this is not an lvalue).
  */
 static __inline node_p
 _ngi_node(item_p item, char *file, int line)
 {
 	node_p dest;
 
 	_ngi_check(item, file, line);
 	dest = _NGI_NODE(item);
 	return (dest);
 }
 
 /*
  * Record the caller's file/line in the item, then return its hook by
  * value (unlike the slot accessors, this is not an lvalue).
  */
 static __inline hook_p
 _ngi_hook(item_p item, char *file, int line)
 {
 	hook_p entering;
 
 	_ngi_check(item, file, line);
 	entering = _NGI_HOOK(item);
 	return (entering);
 }
 
 #define NGI_M(i)	(*_ngi_m(i, _NN_))
 #define NGI_MSG(i)	(*_ngi_msg(i, _NN_))
 #define NGI_RETADDR(i)	(*_ngi_retaddr(i, _NN_))
 #define NGI_FN(i)	(*_ngi_fn(i, _NN_))
 #define NGI_ARG1(i)	(*_ngi_arg1(i, _NN_))
 #define NGI_ARG2(i)	(*_ngi_arg2(i, _NN_))
 #define NGI_HOOK(i)	_ngi_hook(i, _NN_)
 #define NGI_NODE(i)	_ngi_node(i, _NN_)
 #define	NGI_SET_HOOK(i,h)						\
 	do { _ngi_check(i, _NN_); _NGI_SET_HOOK(i, h); } while (0)
 #define	NGI_CLR_HOOK(i)							\
 	do { _ngi_check(i, _NN_); _NGI_CLR_HOOK(i); } while (0)
 #define	NGI_SET_NODE(i,n)						\
 	do { _ngi_check(i, _NN_); _NGI_SET_NODE(i, n); } while (0)
 #define	NGI_CLR_NODE(i)							\
 	do { _ngi_check(i, _NN_); _NGI_CLR_NODE(i); } while (0)
 
 #define NG_FREE_ITEM(item)						\
 	do {								\
 		_ngi_check(item, _NN_);					\
 		ng_free_item((item));					\
 	} while (0)
 
 #define	SAVE_LINE(item)							\
 	do {								\
 		(item)->lastline = __LINE__;				\
 		(item)->lastfile = __FILE__;				\
 	} while (0)
 
 #else	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 #define NGI_M(i)	_NGI_M(i)
 #define NGI_MSG(i)	_NGI_MSG(i)
 #define NGI_RETADDR(i)	_NGI_RETADDR(i)
 #define NGI_FN(i)	_NGI_FN(i)
 #define NGI_ARG1(i)	_NGI_ARG1(i)
 #define NGI_ARG2(i)	_NGI_ARG2(i)
 #define	NGI_NODE(i)	_NGI_NODE(i)
 #define	NGI_HOOK(i)	_NGI_HOOK(i)
 #define	NGI_SET_HOOK(i,h) _NGI_SET_HOOK(i,h)
 #define	NGI_CLR_HOOK(i)	  _NGI_CLR_HOOK(i)
 #define	NGI_SET_NODE(i,n) _NGI_SET_NODE(i,n)
 #define	NGI_CLR_NODE(i)	  _NGI_CLR_NODE(i)
 
 #define	NG_FREE_ITEM(item)	ng_free_item((item))
 #define	SAVE_LINE(item)		do {} while (0)
 
 #endif	/* NETGRAPH_DEBUG */ /*----------------------------------------------*/
 
 #define NGI_GET_M(i,m)							\
 	do {								\
 		(m) = NGI_M(i);						\
 		_NGI_M(i) = NULL;					\
 	} while (0)
 
 #define NGI_GET_MSG(i,m)						\
 	do {								\
 		(m) = NGI_MSG(i);					\
 		_NGI_MSG(i) = NULL;					\
 	} while (0)
 
 #define NGI_GET_NODE(i,n)	/* YOU NOW HAVE THE REFERENCE */	\
 	do {								\
 		(n) = NGI_NODE(i);					\
 		_NGI_NODE(i) = NULL;					\
 	} while (0)
 
 #define NGI_GET_HOOK(i,h)						\
 	do {								\
 		(h) = NGI_HOOK(i);					\
 		_NGI_HOOK(i) = NULL;					\
 	} while (0)
 
 #define NGI_SET_WRITER(i)	((i)->el_flags &= ~NGQF_QMODE)
 #define NGI_SET_READER(i)	((i)->el_flags |= NGQF_QREADER)
 
 #define NGI_QUEUED_READER(i)	((i)->el_flags & NGQF_QREADER)
 #define NGI_QUEUED_WRITER(i)	(((i)->el_flags & NGQF_QMODE) == NGQF_QWRITER)
 	
 /**********************************************************************
 * Data macros.  Send, manipulate and free.
 **********************************************************************/
 /*
  * Assuming the data is already ok, just set the new address and send
  */
 /*
  * Address item to hook with ng_address_hook() and, only if that returned
  * 0, send it with ng_snd_item() using flags.  The last status is left in
  * error.  The caller's item pointer is set to NULL on every path, so it
  * must not be used afterwards.  error and item must be lvalues.
  */
 #define NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, flags)		\
 	do {								\
 		(error) =						\
 		    ng_address_hook(NULL, (item), (hook), NG_NOFLAGS);	\
 		if ((error) == 0) {					\
 			SAVE_LINE(item);				\
 			(error) = ng_snd_item((item), (flags));		\
 		}							\
 		(item) = NULL;						\
 	} while (0)
 #define	NG_FWD_ITEM_HOOK(error, item, hook)	\
 		NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, NG_NOFLAGS)
 
 /*
  * Forward a data packet. Mbuf pointer is updated to new value. We
  * presume you dealt with the old one when you update it to the new one
  * (or it may be the old one). We got a packet and possibly had to modify
  * the mbuf. You should probably use NGI_GET_M() if you are going to use
  * this too.
  */
 #define NG_FWD_NEW_DATA_FLAGS(error, item, hook, m, flags)		\
 	do {								\
 		NGI_M(item) = (m);					\
 		(m) = NULL;						\
 		NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, flags);	\
 	} while (0)
 #define	NG_FWD_NEW_DATA(error, item, hook, m)	\
 		NG_FWD_NEW_DATA_FLAGS(error, item, hook, m, NG_NOFLAGS)
 
 /* Send a previously unpackaged mbuf. XXX: This should be called
  * NG_SEND_DATA in future, but this name is kept for compatibility
  * reasons.
  */
 /*
  * Wrap m in a new item with ng_package_data() and forward it to hook via
  * NG_FWD_ITEM_HOOK_FLAGS(); if no item could be obtained, error is set to
  * ENOMEM.  The caller's m is set to NULL on every path.  Note that flags
  * appears twice in the expansion and is therefore evaluated twice on the
  * success path; pass an expression without side effects.
  */
 #define NG_SEND_DATA_FLAGS(error, hook, m, flags)			\
 	do {								\
 		item_p _item;						\
 		if ((_item = ng_package_data((m), flags))) {		\
 			NG_FWD_ITEM_HOOK_FLAGS(error, _item, hook, flags);\
 		} else {						\
 			(error) = ENOMEM;				\
 		}							\
 		(m) = NULL;						\
 	} while (0)
 
 #define NG_SEND_DATA_ONLY(error, hook, m)	\
 		NG_SEND_DATA_FLAGS(error, hook, m, NG_NOFLAGS)
 /* NG_SEND_DATA() compat for meta-data times */
 #define	NG_SEND_DATA(error, hook, m, x)	\
 		NG_SEND_DATA_FLAGS(error, hook, m, NG_NOFLAGS)
 
 /*
  * Release a message allocated from M_NETGRAPH_MSG, if there is one, and
  * NULL the caller's pointer.  msg must be an lvalue.
  */
 #define NG_FREE_MSG(msg)						\
 	do {								\
 		if ((msg) != NULL) {					\
 			FREE((msg), M_NETGRAPH_MSG);			\
 			(msg) = NULL;					\
 		}							\
 	} while (0)
 
 /*
  * Hand m to m_freem(), if it is set, and NULL the caller's pointer.
  * m must be an lvalue.
  */
 #define NG_FREE_M(m)							\
 	do {								\
 		if ((m) != NULL) {					\
 			m_freem((m));					\
 			(m) = NULL;					\
 		}							\
 	} while (0)
 
 /*****************************************
 * Message macros
 *****************************************/
 
 /*
  * Package msg into an item, address it to hook with ng_address_hook()
  * and, if addressing returned 0, send it with ng_snd_item(_item, 0).
  * error receives ENOMEM when no item could be obtained, otherwise the
  * status of the last call made.  The caller's msg is set to NULL on every
  * path.  The early exit uses "break", which leaves only this macro's own
  * do/while.
  */
 #define NG_SEND_MSG_HOOK(error, here, msg, hook, retaddr)		\
 	do {								\
 		item_p _item;						\
 		if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\
 			(msg) = NULL;					\
 			(error) = ENOMEM;				\
 			break;						\
 		}							\
 		if (((error) = ng_address_hook((here), (_item),		\
 					(hook), (retaddr))) == 0) {	\
 			SAVE_LINE(_item);				\
 			(error) = ng_snd_item((_item), 0);		\
 		}							\
 		(msg) = NULL;						\
 	} while (0)
 
 /*
  * Package msg into an item, address it by path with ng_address_path()
  * and, if addressing returned 0, send it with ng_snd_item(_item, 0).
  * error receives ENOMEM when no item could be obtained, otherwise the
  * status of the last call made.  The caller's msg is set to NULL on every
  * path.  The early exit uses "break", which leaves only this macro's own
  * do/while.
  */
 #define NG_SEND_MSG_PATH(error, here, msg, path, retaddr)		\
 	do {								\
 		item_p _item;						\
 		if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\
 			(msg) = NULL;					\
 			(error) = ENOMEM;				\
 			break;						\
 		}							\
 		if (((error) = ng_address_path((here), (_item),		\
 					(path), (retaddr))) == 0) {	\
 			SAVE_LINE(_item);				\
 			(error) = ng_snd_item((_item), 0);		\
 		}							\
 		(msg) = NULL;						\
 	} while (0)
 
 /*
  * Package msg into an item, address it by node ID with ng_address_ID()
  * and, if addressing returned 0, send it with ng_snd_item(_item, 0).
  * error receives ENOMEM when no item could be obtained, otherwise the
  * status of the last call made.  The caller's msg is set to NULL on every
  * path.  The early exit uses "break", which leaves only this macro's own
  * do/while.
  */
 #define NG_SEND_MSG_ID(error, here, msg, ID, retaddr)			\
 	do {								\
 		item_p _item;						\
 		if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\
 			(msg) = NULL;					\
 			(error) = ENOMEM;				\
 			break;						\
 		}							\
 		if (((error) = ng_address_ID((here), (_item),		\
 					(ID), (retaddr))) == 0) {	\
 			SAVE_LINE(_item);				\
 			(error) = ng_snd_item((_item), 0);		\
 		}							\
 		(msg) = NULL;						\
 	} while (0)
 
 /*
  * Redirect the message to the next hop using the given hook.
  * ng_retarget_msg() frees the item if there is an error
  * and returns an error code.  It returns 0 on success.
  */
 #define NG_FWD_MSG_HOOK(error, here, item, hook, retaddr)		\
 	do {								\
 		if (((error) = ng_address_hook((here), (item),		\
 					(hook), (retaddr))) == 0) {	\
 			SAVE_LINE(item);				\
 			(error) = ng_snd_item((item), 0);		\
 		}							\
 		(item) = NULL;						\
 	} while (0)
 
 /*
  * Send a queue item back to its originator with a response message.
  * Assume original message was removed and freed separately.
  */
 /*
  * With a response: take the item's return address, zero it in the item,
  * install resp as the item's message, address the item to that saved ID
  * and, if addressing returned 0, send it with NG_QUEUE.  error receives
  * the status of the last call made.
  * Without a response (resp is NULL): the item is freed and error is left
  * untouched, so callers should initialise error beforehand.
  * The caller's item pointer is set to NULL on every path.
  */
 #define NG_RESPOND_MSG(error, here, item, resp)				\
 	do {								\
 		if ((resp) != NULL) {					\
 			ng_ID_t _dest = NGI_RETADDR(item);		\
 			NGI_RETADDR(item) = 0;				\
 			NGI_MSG(item) = (resp);				\
 			if (((error) = ng_address_ID((here), (item),	\
 					_dest, 0)) == 0) {		\
 				SAVE_LINE(item);			\
 				(error) = ng_snd_item((item), NG_QUEUE);\
 			}						\
 		} else							\
 			NG_FREE_ITEM(item);				\
 		(item) = NULL;						\
 	} while (0)
 
 
 /***********************************************************************
  ******** Structures Definitions and Macros for defining a node  *******
  ***********************************************************************
  *
  * Here we define the structures needed to actually define a new node
  * type.
  */
 
 /*
  * Command list -- each node type specifies the command that it knows
  * how to convert between ASCII and binary using an array of these.
  * The last element in the array must be a terminator with cookie=0.
  */
 
 struct ng_cmdlist {
 	u_int32_t			cookie;		/* command typecookie */
 	int				cmd;		/* command number */
 	const char			*name;		/* command name */
 	const struct ng_parse_type	*mesgType;	/* args if !NGF_RESP */
 	const struct ng_parse_type	*respType;	/* args if NGF_RESP */
 };
 
 /*
  * Structure of a node type
  * If data is sent to the "rcvdata()" entrypoint then the system
  * may decide to defer it until later by queueing it with the normal netgraph
  * input queueing system.  This is decided by the HK_QUEUE flag being set in
  * the flags word of the peer (receiving) hook. The dequeuing mechanism will
  * ensure it is not requeued again.
  * Note the input queueing system is to allow modules
  * to 'release the stack' or to pass data across spl layers.
  * The data will be redelivered as soon as the NETISR code runs
  * which may be almost immediately.  A node may also do its own queueing
  * for other reasons (e.g. device output queuing).
  */
 struct ng_type {
 
 	u_int32_t	version; 	/* must equal NG_API_VERSION */
 	const char	*name;		/* Unique type name */
 	modeventhand_t	mod_event;	/* Module event handler (optional) */
 	ng_constructor_t *constructor;	/* Node constructor */
 	ng_rcvmsg_t	*rcvmsg;	/* control messages come here */
 	ng_close_t	*close;		/* warn about forthcoming shutdown */
 	ng_shutdown_t	*shutdown;	/* reset, and free resources */
 	ng_newhook_t	*newhook;	/* first notification of new hook */
 	ng_findhook_t	*findhook;	/* only if you have lots of hooks */
 	ng_connect_t	*connect;	/* final notification of new hook */
 	ng_rcvdata_t	*rcvdata;	/* data comes here */
 	ng_disconnect_t	*disconnect;	/* notify on disconnect */
 
 	const struct	ng_cmdlist *cmdlist;	/* commands we can convert */
 
 	/* R/W data private to the base netgraph code DON'T TOUCH! */
 	LIST_ENTRY(ng_type) types;		/* linked list of all types */
 	int		    refs;		/* number of instances */
 };
 
 /*
  * Use the NETGRAPH_INIT() macro to link a node type into the
  * netgraph system. This works for types compiled into the kernel
  * as well as KLD modules. The first argument should be the type
  * name (eg, echo) and the second a pointer to the type struct.
  *
  * If a different link time is desired, e.g., a device driver that
  * needs to install its netgraph type before probing, use the
  * NETGRAPH_INIT_ORDERED() macro instead.  Device drivers probably
  * want to use SI_SUB_DRIVERS/SI_ORDER_FIRST.
  */
 
 #define NETGRAPH_INIT_ORDERED(typename, typestructp, sub, order)	\
 static moduledata_t ng_##typename##_mod = {				\
 	"ng_" #typename,						\
 	ng_mod_event,							\
 	(typestructp)							\
 };									\
 DECLARE_MODULE(ng_##typename, ng_##typename##_mod, sub, order);		\
 MODULE_DEPEND(ng_##typename, netgraph,	NG_ABI_VERSION,			\
 					NG_ABI_VERSION,			\
 					NG_ABI_VERSION)
 
 #define NETGRAPH_INIT(tn, tp)						\
 	NETGRAPH_INIT_ORDERED(tn, tp, SI_SUB_PSEUDO, SI_ORDER_ANY)
 
 /* Special malloc() type for netgraph structs and ctrl messages */
 /* Only these two types should be visible to nodes */
 MALLOC_DECLARE(M_NETGRAPH);
 MALLOC_DECLARE(M_NETGRAPH_MSG);
 
 /* declare the base of the netgraph sysctl hierarchy */
 /* but only if this file cares about sysctls */
 #ifdef	SYSCTL_DECL
 SYSCTL_DECL(_net_graph);
 #endif
 
 /*
  * Methods that the nodes can use.
  * Many of these methods should usually NOT be used directly but via
  * Macros above.
  */
 int	ng_address_ID(node_p here, item_p item, ng_ID_t ID, ng_ID_t retaddr);
 int	ng_address_hook(node_p here, item_p item, hook_p hook, ng_ID_t retaddr);
 int	ng_address_path(node_p here, item_p item, char *address, ng_ID_t raddr);
 int	ng_bypass(hook_p hook1, hook_p hook2);
 hook_p	ng_findhook(node_p node, const char *name);
 struct	ng_type *ng_findtype(const char *type);
 int	ng_make_node_common(struct ng_type *typep, node_p *nodep);
 int	ng_name_node(node_p node, const char *name);
 int	ng_newtype(struct ng_type *tp);
 ng_ID_t ng_node2ID(node_p node);
 item_p	ng_package_data(struct mbuf *m, int flags);
 item_p	ng_package_msg(struct ng_mesg *msg, int flags);
 item_p	ng_package_msg_self(node_p here, hook_p hook, struct ng_mesg *msg);
 void	ng_replace_retaddr(node_p here, item_p item, ng_ID_t retaddr);
 int	ng_rmhook_self(hook_p hook);	/* if a node wants to kill a hook */
 int	ng_rmnode_self(node_p here);	/* if a node wants to suicide */
 int	ng_rmtype(struct ng_type *tp);
 int	ng_snd_item(item_p item, int queue);
 int 	ng_send_fn1(node_p node, hook_p hook, ng_item_fn *fn,
 	void *arg1, int arg2, int flags);
 #define	ng_send_fn(node, hook, fn, arg1, arg2) \
 	ng_send_fn1(node, hook, fn, arg1, arg2, NG_NOFLAGS)
 int	ng_uncallout(struct callout *c, node_p node);
 int	ng_callout(struct callout *c, node_p node, hook_p hook, int ticks,
 	    ng_item_fn *fn, void * arg1, int arg2);
-#define	ng_callout_init(c)	callout_init(c, NET_CALLOUT_MPSAFE)
+#define	ng_callout_init(c)	callout_init(c, CALLOUT_MPSAFE)
 
 /* Flags for netgraph functions. */
 #define	NG_NOFLAGS	0x00000000	/* no special options */
 #define	NG_QUEUE	0x00000001	/* enqueue item, don't dispatch */
 #define	NG_WAITOK	0x00000002	/* use M_WAITOK, etc. */
 #define	NG_PROGRESS	0x00000004	/* return EINPROGRESS if queued */
 
 /*
  * prototypes the user should DEFINITELY not use directly
  */
 void	ng_free_item(item_p item); /* Use NG_FREE_ITEM instead */
 int	ng_mod_event(module_t mod, int what, void *arg);
 
 /*
  * Tag definitions and constants
  */
 
 #define	NG_TAG_PRIO	1
 
 /*
  * Payload of an NG_TAG_PRIO mbuf tag.
  * NOTE(review): NG_PRIO_CUTOFF and NG_PRIO_LINKSTATE below look like
  * reference values for the priority field -- confirm against users.
  */
 struct ng_tag_prio {
 	struct m_tag	tag;		/* generic mbuf tag header */
 	char	priority;
 	char	discardability;
 };
 
 #define	NG_PRIO_CUTOFF		32
 #define	NG_PRIO_LINKSTATE	64
 
 /* Macros and declarations to keep compatibility with metadata, which
  * is obsoleted now. To be deleted.
  */
 typedef void *meta_p;
 #define _NGI_META(i)	NULL
 #define NGI_META(i)	NULL
 #define NG_FREE_META(meta)
 #define NGI_GET_META(i,m)
 #define	ng_copy_meta(meta) NULL
 
 #endif /* _NETGRAPH_NETGRAPH_H_ */
Index: head/sys/netinet/ip_carp.c
===================================================================
--- head/sys/netinet/ip_carp.c	(revision 171636)
+++ head/sys/netinet/ip_carp.c	(revision 171637)
@@ -1,2220 +1,2220 @@
 /* 	$FreeBSD$ */
 
 /*
  * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
  * Copyright (c) 2003 Ryan McBride. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "opt_carp.h"
 #include "opt_bpf.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 
 #include <sys/socket.h>
 #include <sys/vnode.h>
 
 #include <machine/stdarg.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/fddi.h>
 #include <net/iso88025.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/if_ether.h>
 #include <machine/in_cksum.h>
 #endif
 
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <crypto/sha1.h>
 #include <netinet/ip_carp.h>
 
 #define	CARP_IFNAME	"carp"
 static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
 SYSCTL_DECL(_net_inet_carp);
 
 /*
  * Per-interface state of one carp(4) pseudo-interface.  Allocated zeroed
  * and given its defaults in carp_clone_create().
  */
 struct carp_softc {
 	struct ifnet	 	*sc_ifp;	/* Interface clue */
 	struct ifnet		*sc_carpdev;	/* Pointer to parent interface */
 	struct in_ifaddr 	*sc_ia;		/* primary iface address */
 	struct ip_moptions 	 sc_imo;	/* IPv4 multicast options */
 #ifdef INET6
 	struct in6_ifaddr 	*sc_ia6;	/* primary iface address v6 */
 	struct ip6_moptions 	 sc_im6o;	/* IPv6 multicast options */
 #endif /* INET6 */
 	/* presumably the linkage for carp_if.vhif_vrs -- confirm */
 	TAILQ_ENTRY(carp_softc)	 sc_list;
 
 	enum { INIT = 0, BACKUP, MASTER }	sc_state;
 
 	int			 sc_flags_backup;
 	int			 sc_suppress;
 
 	int			 sc_sendad_errors;
 #define	CARP_SENDAD_MAX_ERRORS	3
 	int			 sc_sendad_success;
 #define	CARP_SENDAD_MIN_SUCCESS 3
 
 	int			 sc_vhid;	/* -1 until configured */
 	int			 sc_advskew;
 	int			 sc_naddrs;
 	int			 sc_naddrs6;
 	int			 sc_advbase;	/* seconds */
 	int			 sc_init_counter;
 	u_int64_t		 sc_counter;
 
 	/* authentication */
 #define CARP_HMAC_PAD	64
 	unsigned char sc_key[CARP_KEY_LEN];	/* copied into sc_pad */
 	/* key ^ ipad while hashing; left as key ^ opad by carp_hmac_prepare() */
 	unsigned char sc_pad[CARP_HMAC_PAD];
 	SHA1_CTX sc_sha1;	/* inner hash primed by carp_hmac_prepare() */
 
 	struct callout		 sc_ad_tmo;	/* advertisement timeout */
 	struct callout		 sc_md_tmo;	/* master down timeout */
 	/* master down timeout (presumably the INET6 one -- confirm) */
 	struct callout 		 sc_md6_tmo;
 	
 	LIST_ENTRY(carp_softc)	 sc_next;	/* entry on carpif_list */
 };
 #define	SC2IFP(sc)	((sc)->sc_ifp)
 
 int carp_suppress_preempt = 0;
 int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 };	/* XXX for now */
 SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
     &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
 SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
     &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
 SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
     &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
 SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
     &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
     &carp_suppress_preempt, 0, "Preemption is suppressed");
 
 struct carpstats carpstats;
 SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
     &carpstats, carpstats,
     "CARP statistics (struct carpstats, netinet/ip_carp.h)");
 
 /*
  * CARP state hung off a parent interface's if_carp pointer (see SC2CIF()).
  */
 struct carp_if {
 	TAILQ_HEAD(, carp_softc) vhif_vrs;	/* list of carp_softc */
 	int vhif_nvrs;	/* presumably the number of entries -- confirm */
 
 	struct ifnet 	*vhif_ifp;
 	struct mtx	 vhif_mtx;	/* taken by CARP_LOCK()/CARP_SCLOCK() */
 };
 
 /* Get carp_if from softc. Valid after carp_set_addr{,6}. */
 #define	SC2CIF(sc)		((struct carp_if *)(sc)->sc_carpdev->if_carp)
 
 /* lock per carp_if queue */
 #define	CARP_LOCK_INIT(cif)	mtx_init(&(cif)->vhif_mtx, "carp_if", 	\
 	NULL, MTX_DEF)
 #define	CARP_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->vhif_mtx)
 #define	CARP_LOCK_ASSERT(cif)	mtx_assert(&(cif)->vhif_mtx, MA_OWNED)
 #define	CARP_LOCK(cif)		mtx_lock(&(cif)->vhif_mtx)
 #define	CARP_UNLOCK(cif)	mtx_unlock(&(cif)->vhif_mtx)
 
 #define	CARP_SCLOCK(sc)		mtx_lock(&SC2CIF(sc)->vhif_mtx)
 #define	CARP_SCUNLOCK(sc)	mtx_unlock(&SC2CIF(sc)->vhif_mtx)
 #define	CARP_SCLOCK_ASSERT(sc)	mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED)
 
 /*
  * log(9) at LOG_INFO, but only while the net.inet.carp.log sysctl
  * (carp_opts[CARPCTL_LOG]) is greater than 0.
  */
 #define	CARP_LOG(...)	do {				\
 	if (carp_opts[CARPCTL_LOG] > 0)			\
 		log(LOG_INFO, __VA_ARGS__);		\
 } while (0)
 
 /*
  * log(9) at LOG_DEBUG, but only while the net.inet.carp.log sysctl
  * (carp_opts[CARPCTL_LOG]) is greater than 1.
  */
 #define	CARP_DEBUG(...)	do {				\
 	if (carp_opts[CARPCTL_LOG] > 1)			\
 		log(LOG_DEBUG, __VA_ARGS__);		\
 } while (0)
 
 static void	carp_hmac_prepare(struct carp_softc *);
 static void	carp_hmac_generate(struct carp_softc *, u_int32_t *,
 		    unsigned char *);
 static int	carp_hmac_verify(struct carp_softc *, u_int32_t *,
 		    unsigned char *);
 static void	carp_setroute(struct carp_softc *, int);
 static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
 static int 	carp_clone_create(struct if_clone *, int, caddr_t);
 static void 	carp_clone_destroy(struct ifnet *);
 static void	carpdetach(struct carp_softc *, int);
 static int	carp_prepare_ad(struct mbuf *, struct carp_softc *,
 		    struct carp_header *);
 static void	carp_send_ad_all(void);
 static void	carp_send_ad(void *);
 static void	carp_send_ad_locked(struct carp_softc *);
 static void	carp_send_arp(struct carp_softc *);
 static void	carp_master_down(void *);
 static void	carp_master_down_locked(struct carp_softc *);
 static int	carp_ioctl(struct ifnet *, u_long, caddr_t);
 static int	carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
 		    struct rtentry *);
 static void	carp_start(struct ifnet *);
 static void	carp_setrun(struct carp_softc *, sa_family_t);
 static void	carp_set_state(struct carp_softc *, int);
 static int	carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
 enum	{ CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
 
 static void	carp_multicast_cleanup(struct carp_softc *);
 static int	carp_set_addr(struct carp_softc *, struct sockaddr_in *);
 static int	carp_del_addr(struct carp_softc *, struct sockaddr_in *);
 static void	carp_carpdev_state_locked(struct carp_if *);
 static void	carp_sc_state_locked(struct carp_softc *);
 #ifdef INET6
 static void	carp_send_na(struct carp_softc *);
 static int	carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
 static int	carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
 static void	carp_multicast6_cleanup(struct carp_softc *);
 #endif
 
 static LIST_HEAD(, carp_softc) carpif_list;
 static struct mtx carp_mtx;
 IFC_SIMPLE_DECLARE(carp, 0);
 
 static eventhandler_tag if_detach_event_tag;
 
 /*
  * Checksum the first len bytes of mbuf chain m with in_cksum().
  */
 static __inline u_int16_t
 carp_cksum(struct mbuf *m, int len)
 {
 	u_int16_t sum;
 
 	sum = in_cksum(m, len);
 	return (sum);
 }
 
 /*
  * Precompute the key-dependent part of the HMAC for this softc.
  *
  * sc_pad is loaded with the key XOR 0x36 (the inner pad), and sc_sha1 is
  * primed with that pad, the protocol version, the advertisement type, the
  * low 8 bits of the vhid and every IPv4/IPv6 address configured on the
  * carp interface (IPv6 addresses with their scope cleared).  sc_pad is
  * then converted to the key XOR 0x5c (the outer pad), which is what
  * carp_hmac_generate() uses for the outer hash.
  *
  * The carp_if lock is taken only when the softc has a parent interface
  * (sc_carpdev set), because SC2CIF() dereferences sc_carpdev.
  */
 static void
 carp_hmac_prepare(struct carp_softc *sc)
 {
 	u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
 	u_int8_t vhid = sc->sc_vhid & 0xff;
 	struct ifaddr *ifa;
 	int i;
 #ifdef INET6
 	struct in6_addr in6;
 #endif
 
 	if (sc->sc_carpdev)
 		CARP_SCLOCK(sc);
 
 	/* XXX: possible race here */
 
 	/* compute ipad from key */
 	bzero(sc->sc_pad, sizeof(sc->sc_pad));
 	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36;
 
 	/* precompute first part of inner hash */
 	SHA1Init(&sc->sc_sha1);
 	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
 	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
 	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
 #ifdef INET
 	TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			SHA1Update(&sc->sc_sha1,
 			    (void *)&ifatoia(ifa)->ia_addr.sin_addr.s_addr,
 			    sizeof(struct in_addr));
 	}
 #endif /* INET */
 #ifdef INET6
 	TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			/* hash a copy so the stored address keeps its scope */
 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
 			in6_clearscope(&in6);
 			SHA1Update(&sc->sc_sha1, (void *)&in6, sizeof(in6));
 		}
 	}
 #endif /* INET6 */
 
 	/* convert ipad to opad */
 	/* (key ^ 0x36) ^ (0x36 ^ 0x5c) == key ^ 0x5c */
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
 
 	if (sc->sc_carpdev)
 		CARP_SCUNLOCK(sc);
 }
 
 /*
  * Produce the 20-byte HMAC for the given replay counter into md.
  *
  * Works on a private copy of the hash state stored in sc->sc_sha1, so the
  * softc's copy is left untouched.  sc->sc_pad is used as the key block of
  * the outer hash.
  */
 static void
 carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2],
     unsigned char md[20])
 {
 	SHA1_CTX ctx;
 
 	/* inner hash: resume from the saved state and append the counter */
 	bcopy(&sc->sc_sha1, &ctx, sizeof(ctx));
 	SHA1Update(&ctx, (void *)counter, sizeof(sc->sc_counter));
 	SHA1Final(md, &ctx);
 
 	/* outer hash: pad block followed by the inner digest */
 	SHA1Init(&ctx);
 	SHA1Update(&ctx, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&ctx, md, 20);
 	SHA1Final(md, &ctx);
 }
 
 /*
  * Check a received HMAC (md) against the one computed locally for the
  * same counter.  Must be called with the carp_if lock held.
  *
  * Returns 0 when the digests match and non-zero otherwise, the same
  * zero/non-zero convention as the bcmp() call this replaces.
  */
 static int
 carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2],
     unsigned char md[20])
 {
 	unsigned char md2[20];
 	unsigned char diff = 0;
 	size_t i;
 
 	CARP_SCLOCK_ASSERT(sc);
 
 	carp_hmac_generate(sc, counter, md2);
 
 	/*
 	 * md comes from the network.  Fold the XOR of every byte pair into
 	 * diff instead of calling bcmp(), so the time taken does not depend
 	 * on where the first mismatching byte is.
 	 */
 	for (i = 0; i < sizeof(md2); i++)
 		diff |= md[i] ^ md2[i];
 
 	return (diff != 0);
 }
 
 /*
  * Apply a routing command (RTM_ADD or RTM_DELETE) for every address on
  * the carp interface.
  *
  * IPv4 addresses are only handled while the softc has a parent interface:
  * a host route is added when this is the one master for the address
  * (count == 1) and deleted when no master is left (count == 0).  IPv6
  * addresses get their loopback route added on RTM_ADD and removed for
  * any other command.
  */
 static void
 carp_setroute(struct carp_softc *sc, int cmd)
 {
 	struct ifaddr *ifa;
 	int s;
 
 	if (sc->sc_carpdev)
 		CARP_SCLOCK_ASSERT(sc);
 
 	s = splnet();
 	TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
 		switch (ifa->ifa_addr->sa_family) {
 		case AF_INET: {
 			int nmaster;
 
 			if (sc->sc_carpdev == NULL)
 				break;
 			nmaster = carp_addrcount(SC2CIF(sc), ifatoia(ifa),
 			    CARP_COUNT_MASTER);
 			if ((cmd == RTM_ADD && nmaster == 1) ||
 			    (cmd == RTM_DELETE && nmaster == 0))
 				rtinit(ifa, cmd, RTF_UP | RTF_HOST);
 			break;
 		}
 #ifdef INET6
 		case AF_INET6:
 			if (cmd == RTM_ADD)
 				in6_ifaddloop(ifa);
 			else
 				in6_ifremloop(ifa);
 			break;
 #endif /* INET6 */
 		default:
 			break;
 		}
 	}
 	splx(s);
 }
 
 /*
  * if_clone create handler for carp interfaces.
  *
  * Allocates a zeroed softc and its ifnet, fills in the defaults, sets up
  * the IPv4 multicast membership array and the three callouts, attaches
  * the interface and bpf, and finally links the softc onto carpif_list
  * under carp_mtx.
  *
  * Returns 0 on success, or ENOSPC (after freeing the softc) when
  * if_alloc() fails.  The params argument is not used.
  */
 static int
 carp_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 
 	struct carp_softc *sc;
 	struct ifnet *ifp;
 
 	MALLOC(sc, struct carp_softc *, sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
 	ifp = SC2IFP(sc) = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		FREE(sc, M_CARP);
 		return (ENOSPC);
 	}
 	
 	sc->sc_flags_backup = 0;
 	sc->sc_suppress = 0;
 	sc->sc_advbase = CARP_DFLTINTV;
 	sc->sc_vhid = -1;	/* required setting */
 	sc->sc_advskew = 0;
 	sc->sc_init_counter = 1;
 	sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */
 #ifdef INET6
 	sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
 #endif
 	/* membership array is released again in carp_clone_destroy() */
 	sc->sc_imo.imo_membership = (struct in_multi **)malloc(
 	    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
 	    M_WAITOK);
 	sc->sc_imo.imo_mfilters = NULL;
 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
 	sc->sc_imo.imo_multicast_vif = -1;
 
-	callout_init(&sc->sc_ad_tmo, NET_CALLOUT_MPSAFE);
-	callout_init(&sc->sc_md_tmo, NET_CALLOUT_MPSAFE);
-	callout_init(&sc->sc_md6_tmo, NET_CALLOUT_MPSAFE);
+	callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE);
+	callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE);
+	callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE);
 	
 	ifp->if_softc = sc;
 	if_initname(ifp, CARP_IFNAME, unit);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_flags = IFF_LOOPBACK;
 	ifp->if_ioctl = carp_ioctl;
 	ifp->if_output = carp_looutput;
 	ifp->if_start = carp_start;
 	/* allocated as IFT_ETHER above, but presented as IFT_CARP */
 	ifp->if_type = IFT_CARP;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_hdrlen = 0;
 	if_attach(ifp);
 	bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t));
 	mtx_lock(&carp_mtx);
 	LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
 	mtx_unlock(&carp_mtx);
 	return (0);
 }
 
 static void
 carp_clone_destroy(struct ifnet *ifp)
 {
 	struct carp_softc *sc = ifp->if_softc;
 
 	if (sc->sc_carpdev)
 		CARP_SCLOCK(sc);
 	carpdetach(sc, 1);	/* Returns unlocked. */
 
 	mtx_lock(&carp_mtx);
 	LIST_REMOVE(sc, sc_next);
 	mtx_unlock(&carp_mtx);
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free_type(ifp, IFT_ETHER);
 	free(sc->sc_imo.imo_membership, M_CARP);
 	free(sc, M_CARP);
 }
 
 /*
  * This function can be called on CARP interface destroy path,
  * and in case of the removal of the underlying interface as
  * well. We differentiate these two cases. In the latter case
  * we do not cleanup our multicast memberships, since they
  * are already freed. Also, in the latter case we do not
  * release the lock on return, because the function will be
  * called once more, for another CARP instance on the same
  * interface.
  */
 static void
 carpdetach(struct carp_softc *sc, int unlock)
 {
 	struct carp_if *cif;
 
 	callout_stop(&sc->sc_ad_tmo);
 	callout_stop(&sc->sc_md_tmo);
 	callout_stop(&sc->sc_md6_tmo);
 
 	if (sc->sc_suppress)
 		carp_suppress_preempt--;
 	sc->sc_suppress = 0;
 
 	if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
 		carp_suppress_preempt--;
 	sc->sc_sendad_errors = 0;
 
 	carp_set_state(sc, INIT);
 	SC2IFP(sc)->if_flags &= ~IFF_UP;
 	carp_setrun(sc, 0);
 	if (unlock)
 		carp_multicast_cleanup(sc);
 #ifdef INET6
 	carp_multicast6_cleanup(sc);
 #endif
 
 	if (sc->sc_carpdev != NULL) {
 		cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 		CARP_LOCK_ASSERT(cif);
 		TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
 		if (!--cif->vhif_nvrs) {
 			ifpromisc(sc->sc_carpdev, 0);
 			sc->sc_carpdev->if_carp = NULL;
 			CARP_LOCK_DESTROY(cif);
 			FREE(cif, M_IFADDR);
 		} else if (unlock)
 			CARP_UNLOCK(cif);
 		sc->sc_carpdev = NULL;
 	}
 }
 
 /*
  * Event handler run when an interface goes away: detach every CARP
  * instance that uses ifp as its parent.
  */
 static void
 carp_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct carp_if *cif;
 	struct carp_softc *cur, *following;
 
 	cif = (struct carp_if *)ifp->if_carp;
 	if (cif == NULL)
 		return;
 
 	/*
 	 * XXX: the lock is never released here; carpdetach() destroys it
 	 * (and frees cif) when the last instance has been removed.
 	 */
 	CARP_LOCK(cif);
 	cur = TAILQ_FIRST(&cif->vhif_vrs);
 	while (cur != NULL) {
 		/* fetch the successor first, carpdetach() unlinks cur */
 		following = TAILQ_NEXT(cur, sc_list);
 		carpdetach(cur, 0);
 		cur = following;
 	}
 }
 
 /*
  * process input packet (IPv4).
  * we have rearranged checks order compared to the rfc,
  * but it seems more efficient this way or not possible otherwise.
  *
  * Validates, in order: the carp "allow" knob, that the receiving
  * interface has CARP instances, the TTL, the packet length and the CARP
  * checksum; a packet that passes is handed to carp_input_c().  On every
  * failure the mbuf is freed (or already consumed by m_pullup()) and the
  * matching statistics counter is bumped.
  */
 void
 carp_input(struct mbuf *m, int hlen)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct carp_header *ch;
 	int iplen, len;
 
 	carpstats.carps_ipackets++;
 
 	if (!carp_opts[CARPCTL_ALLOW]) {
 		m_freem(m);
 		return;
 	}
 
 	/* check if received on a valid carp interface */
 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
 		carpstats.carps_badif++;
 		CARP_LOG("carp_input: packet received on non-carp "
 		    "interface: %s\n",
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return;
 	}
 
 	/* verify that the IP TTL is 255.  */
 	if (ip->ip_ttl != CARP_DFLTTL) {
 		carpstats.carps_badttl++;
 		CARP_LOG("carp_input: received ttl %d != 255 on %s\n",
 		    ip->ip_ttl,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return;
 	}
 
 	iplen = ip->ip_hl << 2;
 
 	/* the packet must hold at least the IP header and a CARP header */
 	if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
 		carpstats.carps_badlen++;
 		CARP_LOG("carp_input: received len %zd < "
 		    "sizeof(struct carp_header)\n",
 		    m->m_len - sizeof(struct ip));
 		m_freem(m);
 		return;
 	}
 	len = iplen + sizeof(*ch);
 
 	/*
 	 * Make both headers contiguous, but only when the first mbuf does
 	 * not already contain them.  m_pullup() frees the chain on failure.
 	 */
 	if (m->m_len < len) {
 		if ((m = m_pullup(m, len)) == NULL) {
 			carpstats.carps_hdrops++;
 			CARP_LOG("carp_input: pullup failed\n");
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/*
 	 * verify the CARP checksum; m_data is temporarily advanced past
 	 * the IP header so that the checksum covers the CARP header only.
 	 */
 	m->m_data += iplen;
 	if (carp_cksum(m, len - iplen)) {
 		carpstats.carps_badsum++;
 		CARP_LOG("carp_input: checksum failed on %s\n",
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return;
 	}
 	m->m_data -= iplen;
 
 	carp_input_c(m, ch, AF_INET);
 }
 
 #ifdef INET6
 /*
  * IPv6 protocol input routine for CARP.  Performs the same sequence of
  * checks as the IPv4 path (allow knob, receiving interface, hop limit,
  * header presence, checksum) and passes valid packets to carp_input_c().
  * Always returns IPPROTO_DONE; the mbuf is consumed on every path.
  */
 int
 carp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 	struct ip6_hdr *ip6;
 	struct carp_header *ch;
 	u_int len;
 
 	m = *mp;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	carpstats.carps_ipackets6++;
 
 	if (!carp_opts[CARPCTL_ALLOW])
 		goto drop;
 
 	/* check if received on a valid carp interface */
 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
 		carpstats.carps_badif++;
 		CARP_LOG("carp6_input: packet received on non-carp "
 		    "interface: %s\n",
 		    m->m_pkthdr.rcvif->if_xname);
 		goto drop;
 	}
 
 	/* verify that the IP TTL is 255 */
 	if (ip6->ip6_hlim != CARP_DFLTTL) {
 		carpstats.carps_badttl++;
 		CARP_LOG("carp6_input: received ttl %d != 255 on %s\n",
 		    ip6->ip6_hlim,
 		    m->m_pkthdr.rcvif->if_xname);
 		goto drop;
 	}
 
 	/*
 	 * verify that we have a complete carp packet; when the header
 	 * cannot be obtained we return without freeing m ourselves.
 	 */
 	len = m->m_len;
 	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
 	if (ch == NULL) {
 		carpstats.carps_badlen++;
 		CARP_LOG("carp6_input: packet size %u too small\n", len);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify the CARP checksum with m_data moved to the CARP header */
 	m->m_data += *offp;
 	if (carp_cksum(m, sizeof(*ch))) {
 		carpstats.carps_badsum++;
 		CARP_LOG("carp6_input: checksum failed, on %s\n",
 		    m->m_pkthdr.rcvif->if_xname);
 		goto drop;
 	}
 	m->m_data -= *offp;
 
 	carp_input_c(m, ch, AF_INET6);
 	return (IPPROTO_DONE);
 
 drop:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 #endif /* INET6 */
 
 /*
  * Address-family independent part of advertisement processing.
  *
  * Looks up the instance matching the VHID on the receiving interface,
  * taps the packet to BPF, checks version and HMAC, records the sender's
  * counter and then runs the state machine: a MASTER steps down to BACKUP
  * for an equally or more frequent advertiser, a BACKUP either takes over
  * from a slower master or re-arms its master-down timer.
  * The mbuf is always freed before returning.
  */
 static void
 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct carp_softc *sc;
 	u_int64_t tmp_counter;
 	struct timeval sc_tv, ch_tv;
 
 	/* verify that the VHID is valid on the receiving interface */
 	CARP_LOCK(ifp->if_carp);
 	TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list)
 		if (sc->sc_vhid == ch->carp_vhid)
 			break;
 
 	if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) &&
 	    (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
 		carpstats.carps_badvhid++;
 		CARP_UNLOCK(ifp->if_carp);
 		m_freem(m);
 		return;
 	}
 
 	getmicrotime(&SC2IFP(sc)->if_lastchange);
 	SC2IFP(sc)->if_ipackets++;
 	SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
 
 	if (bpf_peers_present(SC2IFP(sc)->if_bpf)) {
 		uint32_t af1 = af;
 
 		/*
 		 * BPF wants net byte order.  Only an IPv4 packet begins
 		 * with a struct ip; applying the fixup to an IPv6 packet
 		 * would scribble over its header.
 		 */
 		if (af == AF_INET) {
 			struct ip *ip = mtod(m, struct ip *);
 
 			ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
 			ip->ip_off = htons(ip->ip_off);
 		}
 		bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m);
 	}
 
 	/* verify the CARP version. */
 	if (ch->carp_version != CARP_VERSION) {
 		carpstats.carps_badver++;
 		SC2IFP(sc)->if_ierrors++;
 		CARP_UNLOCK(ifp->if_carp);
 		CARP_LOG("%s: invalid version %d\n",
 		    SC2IFP(sc)->if_xname,
 		    ch->carp_version);
 		m_freem(m);
 		return;
 	}
 
 	/* verify the hash */
 	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
 		carpstats.carps_badauth++;
 		SC2IFP(sc)->if_ierrors++;
 		CARP_UNLOCK(ifp->if_carp);
 		CARP_LOG("%s: incorrect hash\n", SC2IFP(sc)->if_xname);
 		m_freem(m);
 		return;
 	}
 
 	/* reassemble the 64-bit counter from its two network-order halves */
 	tmp_counter = ntohl(ch->carp_counter[0]);
 	tmp_counter = tmp_counter<<32;
 	tmp_counter += ntohl(ch->carp_counter[1]);
 
 	/* XXX Replay protection goes here */
 
 	sc->sc_init_counter = 0;
 	sc->sc_counter = tmp_counter;
 
 	/*
 	 * Express our own and the sender's advertisement interval as
 	 * timevals (advbase seconds plus advskew/256 of a second) so they
 	 * can be compared.  While preemption is suppressed we pretend to
 	 * have a skew of at least 240.
 	 */
 	sc_tv.tv_sec = sc->sc_advbase;
 	if (carp_suppress_preempt && sc->sc_advskew <  240)
 		sc_tv.tv_usec = 240 * 1000000 / 256;
 	else
 		sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 	ch_tv.tv_sec = ch->carp_advbase;
 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
 
 	switch (sc->sc_state) {
 	case INIT:
 		break;
 	case MASTER:
 		/*
 		 * If we receive an advertisement from a master who's going to
 		 * be more frequent than us, go into BACKUP state.
 		 */
 		if (timevalcmp(&sc_tv, &ch_tv, >) ||
 		    timevalcmp(&sc_tv, &ch_tv, ==)) {
 			callout_stop(&sc->sc_ad_tmo);
 			CARP_DEBUG("%s: MASTER -> BACKUP "
 			   "(more frequent advertisement received)\n",
 			   SC2IFP(sc)->if_xname);
 			carp_set_state(sc, BACKUP);
 			carp_setrun(sc, 0);
 			carp_setroute(sc, RTM_DELETE);
 		}
 		break;
 	case BACKUP:
 		/*
 		 * If we're pre-empting masters who advertise slower than us,
 		 * and this one claims to be slower, treat him as down.
 		 */
 		if (carp_opts[CARPCTL_PREEMPT] &&
 		    timevalcmp(&sc_tv, &ch_tv, <)) {
 			CARP_DEBUG("%s: BACKUP -> MASTER "
 			    "(preempting a slower master)\n",
 			    SC2IFP(sc)->if_xname);
 			carp_master_down_locked(sc);
 			break;
 		}
 
 		/*
 		 *  If the master is going to advertise at such a low frequency
 		 *  that he's guaranteed to time out, we'd might as well just
 		 *  treat him as timed out now.
 		 */
 		sc_tv.tv_sec = sc->sc_advbase * 3;
 		if (timevalcmp(&sc_tv, &ch_tv, <)) {
 			CARP_DEBUG("%s: BACKUP -> MASTER "
 			    "(master timed out)\n",
 			    SC2IFP(sc)->if_xname);
 			carp_master_down_locked(sc);
 			break;
 		}
 
 		/*
 		 * Otherwise, we reset the counter and wait for the next
 		 * advertisement.
 		 */
 		carp_setrun(sc, af);
 		break;
 	}
 
 	CARP_UNLOCK(ifp->if_carp);
 
 	m_freem(m);
 	return;
 }
 
 static int
 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
 {
 	struct m_tag *mtag;
 	struct ifnet *ifp = SC2IFP(sc);
 
 	if (sc->sc_init_counter) {
 		/* this could also be seconds since unix epoch */
 		sc->sc_counter = arc4random();
 		sc->sc_counter = sc->sc_counter << 32;
 		sc->sc_counter += arc4random();
 	} else
 		sc->sc_counter++;
 
 	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
 	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
 
 	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
 
 	/* Tag packet for carp_output */
 	mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT);
 	if (mtag == NULL) {
 		m_freem(m);
 		SC2IFP(sc)->if_oerrors++;
 		return (ENOMEM);
 	}
 	bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
 	m_tag_prepend(m, mtag);
 
 	return (0);
 }
 
 /*
  * Send an advertisement from every attached instance that is up,
  * running and currently MASTER.  The list is walked under carp_mtx.
  */
 static void
 carp_send_ad_all(void)
 {
 	struct carp_softc *sc;
 
 	mtx_lock(&carp_mtx);
 	LIST_FOREACH(sc, &carpif_list, sc_next) {
 		/* instances without a parent have no lock to take */
 		if (sc->sc_carpdev != NULL) {
 			CARP_SCLOCK(sc);
 			if ((SC2IFP(sc)->if_flags & IFF_UP) &&
 			    (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) &&
 			    sc->sc_state == MASTER) {
 				carp_send_ad_locked(sc);
 			}
 			CARP_SCUNLOCK(sc);
 		}
 	}
 	mtx_unlock(&carp_mtx);
 }
 
 /*
  * Callout entry point for the advertisement timer: v is the softc.
  * Takes the instance lock around carp_send_ad_locked().
  */
 static void
 carp_send_ad(void *v)
 {
 	struct carp_softc *sc;
 
 	sc = v;
 	CARP_SCLOCK(sc);
 	carp_send_ad_locked(sc);
 	CARP_SCUNLOCK(sc);
 }
 
 /*
  * Build and transmit one advertisement per configured address family
  * (IPv4 when sc_ia is set, IPv6 when sc_ia6 is set) and re-arm the
  * advertisement timer.  The instance lock must be held; it is dropped
  * and re-taken around carp_send_ad_all().
  *
  * Send failures are counted in sc_sendad_errors; on reaching
  * CARP_SENDAD_MAX_ERRORS the global carp_suppress_preempt is raised, and
  * it is lowered again after CARP_SENDAD_MIN_SUCCESS successful sends.
  */
 static void
 carp_send_ad_locked(struct carp_softc *sc)
 {
 	struct carp_header ch;
 	struct timeval tv;
 	struct carp_header *ch_ptr;
 	struct mbuf *m;
 	int len, advbase, advskew;
 
 	CARP_SCLOCK_ASSERT(sc);
 
 	/* bow out if we've lost our UPness or RUNNINGuiness */
 	if (!((SC2IFP(sc)->if_flags & IFF_UP) &&
 	    (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
 		/*
 		 * Advertise 255/255; tv stays unset here, and every use of
 		 * tv below is skipped when both values are 255.
 		 */
 		advbase = 255;
 		advskew = 255;
 	} else {
 		advbase = sc->sc_advbase;
 		/* while preemption is suppressed, advertise a skew of at least 240 */
 		if (!carp_suppress_preempt || sc->sc_advskew > 240)
 			advskew = sc->sc_advskew;
 		else
 			advskew = 240;
 		tv.tv_sec = advbase;
 		tv.tv_usec = advskew * 1000000 / 256;
 	}
 
 	/* header template shared by the v4 and v6 packets */
 	ch.carp_version = CARP_VERSION;
 	ch.carp_type = CARP_ADVERTISEMENT;
 	ch.carp_vhid = sc->sc_vhid;
 	ch.carp_advbase = advbase;
 	ch.carp_advskew = advskew;
 	ch.carp_authlen = 7;	/* XXX DEFINE */
 	ch.carp_pad1 = 0;	/* must be zero */
 	ch.carp_cksum = 0;
 
 #ifdef INET
 	if (sc->sc_ia) {
 		struct ip *ip;
 
 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
 		if (m == NULL) {
 			SC2IFP(sc)->if_oerrors++;
 			carpstats.carps_onomem++;
 			/* XXX maybe less ? */
 			if (advbase != 255 || advskew != 255)
 				callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 				    carp_send_ad, sc);
 			return;
 		}
 		len = sizeof(*ip) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		MH_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		ip->ip_len = len;
 		ip->ip_id = ip_newid();
 		ip->ip_off = IP_DF;
 		ip->ip_ttl = CARP_DFLTTL;
 		ip->ip_p = IPPROTO_CARP;
 		ip->ip_sum = 0;
 		ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr;
 		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
 
 		/* the CARP header follows the IP header directly */
 		ch_ptr = (struct carp_header *)(&ip[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		/* on failure carp_prepare_ad() has already freed m */
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			return;
 
 		/* checksum the CARP header only: step m_data over the IP header */
 		m->m_data += sizeof(*ip);
 		ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
 		m->m_data -= sizeof(*ip);
 
 		getmicrotime(&SC2IFP(sc)->if_lastchange);
 		SC2IFP(sc)->if_opackets++;
 		SC2IFP(sc)->if_obytes += len;
 		carpstats.carps_opackets++;
 
 		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
 			SC2IFP(sc)->if_oerrors++;
 			if (sc->sc_sendad_errors < INT_MAX)
 				sc->sc_sendad_errors++;
 			if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
 				carp_suppress_preempt++;
 				/*
 				 * First suppressor: re-advertise from all
 				 * instances.  Our lock is dropped because
 				 * carp_send_ad_all() locks each instance.
 				 */
 				if (carp_suppress_preempt == 1) {
 					CARP_SCUNLOCK(sc);
 					carp_send_ad_all();
 					CARP_SCLOCK(sc);
 				}
 			}
 			sc->sc_sendad_success = 0;
 		} else {
 			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
 				if (++sc->sc_sendad_success >=
 				    CARP_SENDAD_MIN_SUCCESS) {
 					carp_suppress_preempt--;
 					sc->sc_sendad_errors = 0;
 				}
 			} else
 				sc->sc_sendad_errors = 0;
 		}
 	}
 #endif /* INET */
 #ifdef INET6
 	if (sc->sc_ia6) {
 		struct ip6_hdr *ip6;
 
 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
 		if (m == NULL) {
 			SC2IFP(sc)->if_oerrors++;
 			carpstats.carps_onomem++;
 			/* XXX maybe less ? */
 			if (advbase != 255 || advskew != 255)
 				callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 				    carp_send_ad, sc);
 			return;
 		}
 		len = sizeof(*ip6) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		MH_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip6 = mtod(m, struct ip6_hdr *);
 		bzero(ip6, sizeof(*ip6));
 		ip6->ip6_vfc |= IPV6_VERSION;
 		ip6->ip6_hlim = CARP_DFLTTL;
 		ip6->ip6_nxt = IPPROTO_CARP;
 		bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
 		    sizeof(struct in6_addr));
 		/* set the multicast destination */
 
 		/* ff02::12, scoped to the parent interface */
 		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
 		ip6->ip6_dst.s6_addr8[15] = 0x12;
 		/* NOTE(review): this path returns without re-arming sc_ad_tmo */
 		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
 			SC2IFP(sc)->if_oerrors++;
 			m_freem(m);
 			CARP_LOG("%s: in6_setscope failed\n", __func__);
 			return;
 		}
 
 		ch_ptr = (struct carp_header *)(&ip6[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		/* on failure carp_prepare_ad() has already freed m */
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			return;
 
 		/* checksum the CARP header only: step m_data over the IPv6 header */
 		m->m_data += sizeof(*ip6);
 		ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
 		m->m_data -= sizeof(*ip6);
 
 		getmicrotime(&SC2IFP(sc)->if_lastchange);
 		SC2IFP(sc)->if_opackets++;
 		SC2IFP(sc)->if_obytes += len;
 		carpstats.carps_opackets6++;
 
 		/* same error/success accounting as the IPv4 branch above */
 		if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
 			SC2IFP(sc)->if_oerrors++;
 			if (sc->sc_sendad_errors < INT_MAX)
 				sc->sc_sendad_errors++;
 			if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
 				carp_suppress_preempt++;
 				if (carp_suppress_preempt == 1) {
 					CARP_SCUNLOCK(sc);
 					carp_send_ad_all();
 					CARP_SCLOCK(sc);
 				}
 			}
 			sc->sc_sendad_success = 0;
 		} else {
 			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
 				if (++sc->sc_sendad_success >=
 				    CARP_SENDAD_MIN_SUCCESS) {
 					carp_suppress_preempt--;
 					sc->sc_sendad_errors = 0;
 				}
 			} else
 				sc->sc_sendad_errors = 0;
 		}
 	}
 #endif /* INET6 */
 
 	/* schedule the next advertisement unless we advertised 255/255 */
 	if (advbase != 255 || advskew != 255)
 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 		    carp_send_ad, sc);
 
 }
 
 /*
  * Announce, via arp_ifinit2() on the parent interface, the virtual
  * router MAC address for each IPv4 address associated with the virtual
  * router.  A short delay is inserted after each announcement.
  */
 static void
 carp_send_arp(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
 		/* only IPv4 addresses are announced here */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			arp_ifinit2(sc->sc_carpdev, ifa,
 			    IF_LLADDR(sc->sc_ifp));
 
 			DELAY(1000);	/* XXX */
 		}
 	}
 }
 
 #ifdef INET6
 /*
  * IPv6 counterpart of carp_send_arp(): send a neighbor advertisement
  * with the override flag to the link-local all-nodes group for every
  * IPv6 address on the carp interface, with a short delay after each.
  */
 static void
 carp_send_na(struct carp_softc *sc)
 {
 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 	struct ifaddr *ifa;
 	struct in6_addr *in6;
 
 	TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
 		/* only IPv6 addresses are announced here */
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
 			nd6_na_output(sc->sc_carpdev, &mcast, in6,
 			    ND_NA_FLAG_OVERRIDE, 1, NULL);
 			DELAY(1000);	/* XXX */
 		}
 	}
 }
 #endif /* INET6 */
 
 /*
  * Count how many times the IPv4 address of ia is configured on the
  * instances attached to cif.  With type CARP_COUNT_RUNNING only
  * instances that are up and running are considered, with
  * CARP_COUNT_MASTER only those in MASTER state.  cif must be locked.
  */
 static int
 carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
 {
 	struct carp_softc *vh;
 	struct ifaddr *ifa;
 	int total = 0;
 
 	CARP_LOCK_ASSERT(cif);
 
 	TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 		int running, master;
 
 		running = (type == CARP_COUNT_RUNNING &&
 		    (SC2IFP(vh)->if_flags & IFF_UP) &&
 		    (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING));
 		master = (type == CARP_COUNT_MASTER &&
 		    vh->sc_state == MASTER);
 		if (!running && !master)
 			continue;
 
 		TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			if (ia->ia_addr.sin_addr.s_addr ==
 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr)
 				total++;
 		}
 	}
 	return (total);
 }
 
 /*
  * Decide whether we should answer an ARP request for the address ia.
  * v is the carp_if of the receiving interface and isaddr the requester's
  * address.  Returns 1 and stores the virtual MAC address in *enaddr when
  * a MASTER instance is responsible, 0 otherwise.
  */
 int
 carp_iamatch(void *v, struct in_ifaddr *ia,
     struct in_addr *isaddr, u_int8_t **enaddr)
 {
 	struct carp_if *cif = v;
 	struct carp_softc *vh;
 	struct ifaddr *ifa;
 	int index, count = 0;
 
 	CARP_LOCK(cif);
 
 	if (!carp_opts[CARPCTL_ARPBALANCE]) {
 		/* no balancing: the running master owning ia answers */
 		TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 			if ((SC2IFP(vh)->if_flags & IFF_UP) &&
 			    (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
 			    ia->ia_ifp == SC2IFP(vh) &&
 			    vh->sc_state == MASTER) {
 				*enaddr = IF_LLADDR(vh->sc_ifp);
 				CARP_UNLOCK(cif);
 				return (1);
 			}
 		}
 		CARP_UNLOCK(cif);
 		return (0);
 	}
 
 	/*
 	 * XXX proof of concept implementation.
 	 * We use the source ip to decide which virtual host should
 	 * handle the request. If we're master of that virtual host,
 	 * then we respond, otherwise, just drop the arp packet on
 	 * the floor.
 	 */
 	count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING);
 	if (count == 0) {
 		/* should never reach this */
 		CARP_UNLOCK(cif);
 		return (0);
 	}
 
 	/* this should be a hash, like pf_hash() */
 	index = ntohl(isaddr->s_addr) % count;
 	count = 0;
 
 	TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 		if (!(SC2IFP(vh)->if_flags & IFF_UP) ||
 		    !(SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING))
 			continue;
 
 		TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
 			if (ifa->ifa_addr->sa_family != AF_INET ||
 			    ia->ia_addr.sin_addr.s_addr !=
 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr)
 				continue;
 
 			/* the index-th matching address decides */
 			if (count == index) {
 				int is_master = (vh->sc_state == MASTER);
 
 				if (is_master)
 					*enaddr = IF_LLADDR(vh->sc_ifp);
 				CARP_UNLOCK(cif);
 				return (is_master);
 			}
 			count++;
 		}
 	}
 
 	CARP_UNLOCK(cif);
 	return (0);
 }
 
 #ifdef INET6
 /*
  * Find the carp address matching the IPv6 target taddr.  v is the
  * carp_if of the receiving interface.  Returns the matching ifaddr when
  * it belongs to an instance that is up, running and MASTER, otherwise
  * NULL.
  */
 struct ifaddr *
 carp_iamatch6(void *v, struct in6_addr *taddr)
 {
 	struct carp_if *cif = v;
 	struct carp_softc *vh;
 	struct ifaddr *ifa;
 
 	CARP_LOCK(cif);
 	TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 		TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
 			/*
 			 * The address list is shared by all families; only
 			 * AF_INET6 entries may be viewed as in6_ifaddr.
 			 */
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			if (IN6_ARE_ADDR_EQUAL(taddr,
 			    &ifatoia6(ifa)->ia_addr.sin6_addr) &&
 			    (SC2IFP(vh)->if_flags & IFF_UP) &&
 			    (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
 			    vh->sc_state == MASTER) {
 				CARP_UNLOCK(cif);
 				return (ifa);
 			}
 		}
 	}
 	CARP_UNLOCK(cif);
 
 	return (NULL);
 }
 
 /*
  * Look up the virtual MAC address to use for the IPv6 address taddr.
  * v is the carp_if of the interface.  When an up and running instance
  * carries taddr, the mbuf m is tagged with that instance's ifnet pointer
  * (best effort: a failed tag allocation is ignored) and the instance's
  * link-level address is returned; otherwise NULL is returned.
  */
 void *
 carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr)
 {
 	struct m_tag *mtag;
 	struct carp_if *cif = v;
 	struct carp_softc *sc;
 	struct ifaddr *ifa;
 
 	CARP_LOCK(cif);
 	TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
 		TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
 			/*
 			 * The address list is shared by all families; only
 			 * AF_INET6 entries may be viewed as in6_ifaddr.
 			 */
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			if (IN6_ARE_ADDR_EQUAL(taddr,
 			    &ifatoia6(ifa)->ia_addr.sin6_addr) &&
 			    (SC2IFP(sc)->if_flags & IFF_UP) &&
 			    (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) {
 				struct ifnet *ifp = SC2IFP(sc);
 				mtag = m_tag_get(PACKET_TAG_CARP,
 				    sizeof(struct ifnet *), M_NOWAIT);
 				if (mtag == NULL) {
 					/* better a bit than nothing */
 					CARP_UNLOCK(cif);
 					return (IF_LLADDR(sc->sc_ifp));
 				}
 				/* the tag payload is the ifnet pointer itself */
 				bcopy(&ifp, (caddr_t)(mtag + 1),
 				    sizeof(struct ifnet *));
 				m_tag_prepend(m, mtag);
 
 				CARP_UNLOCK(cif);
 				return (IF_LLADDR(sc->sc_ifp));
 			}
 		}
 	}
 	CARP_UNLOCK(cif);
 
 	return (NULL);
 }
 #endif
 
 struct ifnet *
 carp_forus(void *v, void *dhost)
 {
 	struct carp_if *cif = v;
 	struct carp_softc *vh;
 	u_int8_t *ena = dhost;
 
 	if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
 		return (NULL);
 
 	CARP_LOCK(cif);
 	TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list)
 		if ((SC2IFP(vh)->if_flags & IFF_UP) &&
 		    (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
 		    vh->sc_state == MASTER &&
 		    !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) {
 		    	CARP_UNLOCK(cif);
 			return (SC2IFP(vh));
 		}
 
     	CARP_UNLOCK(cif);
 	return (NULL);
 }
 
 /*
  * Callout entry point for the master-down timers: v is the softc.
  * Takes the instance lock around carp_master_down_locked().
  */
 static void
 carp_master_down(void *v)
 {
 	struct carp_softc *sc;
 
 	sc = v;
 	CARP_SCLOCK(sc);
 	carp_master_down_locked(sc);
 	CARP_SCUNLOCK(sc);
 }
 
 /*
  * React to the master being considered down.  Only a BACKUP does
  * anything: it becomes MASTER, advertises itself, announces its
  * addresses, restarts its timers and installs its routes.
  * In INIT state the event is merely reported.
  */
 static void
 carp_master_down_locked(struct carp_softc *sc)
 {
 	if (sc->sc_carpdev)
 		CARP_SCLOCK_ASSERT(sc);
 
 	if (sc->sc_state == INIT) {
 		printf("%s: master_down event in INIT state\n",
 		    SC2IFP(sc)->if_xname);
 	} else if (sc->sc_state == BACKUP) {
 		carp_set_state(sc, MASTER);
 		carp_send_ad_locked(sc);
 		carp_send_arp(sc);
 #ifdef INET6
 		carp_send_na(sc);
 #endif /* INET6 */
 		carp_setrun(sc, 0);
 		carp_setroute(sc, RTM_ADD);
 	}
 	/* already MASTER: nothing to do */
 }
 
 /*
  * Re-evaluate whether the instance may run and (re)arm the timers that
  * belong to its current state.
  *
  * When in backup state, af indicates whether to reset the master down timer
  * for v4 or v6. If it's set to zero, the timers of every address family
  * that has addresses configured are reset.
  */
 static void
 carp_setrun(struct carp_softc *sc, sa_family_t af)
 {
 	struct timeval tv;
 	int usable;
 
 	/* without a parent interface we can only sit in INIT */
 	if (sc->sc_carpdev == NULL) {
 		SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		carp_set_state(sc, INIT);
 		return;
 	}
 	CARP_SCLOCK_ASSERT(sc);
 
 	/* running needs: interface up, a valid vhid, at least one address */
 	usable = (SC2IFP(sc)->if_flags & IFF_UP) &&
 	    sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6);
 	if (!usable) {
 		SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		carp_setroute(sc, RTM_DELETE);
 		return;
 	}
 	SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
 
 	switch (sc->sc_state) {
 	case MASTER:
 		/* next advertisement after advbase + advskew/256 seconds */
 		tv.tv_sec = sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 		    carp_send_ad, sc);
 		break;
 	case BACKUP:
 		/* master is declared down after 3 * advbase + advskew/256 */
 		callout_stop(&sc->sc_ad_tmo);
 		tv.tv_sec = 3 * sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif /* INET6 */
 		default:
 			if (sc->sc_naddrs)
 				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 			if (sc->sc_naddrs6)
 				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 			break;
 		}
 		break;
 	case INIT:
 		if (!carp_opts[CARPCTL_PREEMPT] || carp_suppress_preempt) {
 			CARP_DEBUG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname);
 			carp_set_state(sc, BACKUP);
 			carp_setroute(sc, RTM_DELETE);
 			/* recurse once to arm the BACKUP timers */
 			carp_setrun(sc, 0);
 		} else {
 			carp_send_ad_locked(sc);
 			carp_send_arp(sc);
 #ifdef INET6
 			carp_send_na(sc);
 #endif /* INET6 */
 			CARP_DEBUG("%s: INIT -> MASTER (preempting)\n",
 			    SC2IFP(sc)->if_xname);
 			carp_set_state(sc, MASTER);
 			carp_setroute(sc, RTM_ADD);
 		}
 		break;
 	}
 }
 
 /*
  * Leave every IPv4 multicast group recorded in the instance's
  * ip_moptions and reset the membership count and outgoing interface.
  */
 static void
 carp_multicast_cleanup(struct carp_softc *sc)
 {
 	struct ip_moptions *imo = &sc->sc_imo;
 	u_int16_t n;
 
 	/* Clean up our own multicast memberships, last slot first */
 	for (n = imo->imo_num_memberships; n > 0; n--) {
 		if (imo->imo_membership[n - 1] == NULL)
 			continue;
 		in_delmulti(imo->imo_membership[n - 1]);
 		imo->imo_membership[n - 1] = NULL;
 	}
 	KASSERT(imo->imo_mfilters == NULL,
 	   ("%s: imo_mfilters != NULL", __func__));
 	imo->imo_num_memberships = 0;
 	imo->imo_multicast_ifp = NULL;
 }
 
 #ifdef INET6
 /*
  * Leave every IPv6 multicast group on the instance's membership list
  * and forget the outgoing multicast interface.
  */
 static void
 carp_multicast6_cleanup(struct carp_softc *sc)
 {
 	struct ip6_moptions *im6o = &sc->sc_im6o;
 	struct in6_multi_mship *imm;
 
 	/* pop entries off the head until the list is empty */
 	while ((imm = LIST_FIRST(&im6o->im6o_memberships)) != NULL) {
 		LIST_REMOVE(imm, i6mm_chain);
 		in6_leavegroup(imm);
 	}
 	im6o->im6o_multicast_ifp = NULL;
 }
 #endif
 
 /*
  * Configure an IPv4 address on a carp instance.
  *
  * An all-zero address only re-evaluates the run state.  Otherwise the
  * first multicast-capable interface (other than the carp interface
  * itself) with a subnet containing the address becomes the parent; the
  * CARP multicast group is joined on it for the first address, the
  * per-parent carp_if is created on demand, and the instance is inserted
  * into the parent's list.
  *
  * Returns 0 on success, EADDRNOTAVAIL when no suitable parent exists,
  * ENOBUFS when the group cannot be joined, EINVAL when another instance
  * on the parent already uses the same vhid, or the error from
  * ifpromisc().  On failure only the group membership created by this
  * very call is undone; a membership held for previously configured
  * addresses is left alone.
  */
 static int
 carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
 {
 	struct ifnet *ifp;
 	struct carp_if *cif;
 	struct in_ifaddr *ia, *ia_if;
 	struct ip_moptions *imo = &sc->sc_imo;
 	struct in_addr addr;
 	/* host byte order, to be compared against ia_subnet below */
 	u_long iaddr = ntohl(sin->sin_addr.s_addr);
 	int own, error;
 	int joined = 0;		/* did this call join the CARP group? */
 
 	if (sin->sin_addr.s_addr == 0) {
 		if (!(SC2IFP(sc)->if_flags & IFF_UP))
 			carp_set_state(sc, INIT);
 		if (sc->sc_naddrs)
 			SC2IFP(sc)->if_flags |= IFF_UP;
 		carp_setrun(sc, 0);
 		return (0);
 	}
 
 	/* we have to do it by hands to check we won't match on us */
 	ia_if = NULL; own = 0;
 	TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
 		/* and, yeah, we need a multicast-capable iface too */
 		if (ia->ia_ifp != SC2IFP(sc) &&
 		    (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
 		    (iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
 			if (!ia_if)
 				ia_if = ia;
 			/* the parent itself carries this very address */
 			if (sin->sin_addr.s_addr ==
 			    ia->ia_addr.sin_addr.s_addr)
 				own++;
 		}
 	}
 
 	if (!ia_if)
 		return (EADDRNOTAVAIL);
 
 	ia = ia_if;
 	ifp = ia->ia_ifp;
 
 	/* all addresses of one instance must live on the same parent */
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
 	    (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp))
 		return (EADDRNOTAVAIL);
 
 	if (imo->imo_num_memberships == 0) {
 		addr.s_addr = htonl(INADDR_CARP_GROUP);
 		if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == NULL)
 			return (ENOBUFS);
 		imo->imo_num_memberships++;
 		imo->imo_multicast_ifp = ifp;
 		imo->imo_multicast_ttl = CARP_DFLTTL;
 		imo->imo_multicast_loop = 0;
 		joined = 1;
 	}
 
 	if (!ifp->if_carp) {
 
 		MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP,
 		    M_WAITOK|M_ZERO);
 		if (!cif) {
 			error = ENOBUFS;
 			goto cleanup;
 		}
 		if ((error = ifpromisc(ifp, 1))) {
 			FREE(cif, M_CARP);
 			goto cleanup;
 		}
 		
 		CARP_LOCK_INIT(cif);
 		CARP_LOCK(cif);
 		cif->vhif_ifp = ifp;
 		TAILQ_INIT(&cif->vhif_vrs);
 		ifp->if_carp = cif;
 
 	} else {
 		struct carp_softc *vr;
 
 		cif = (struct carp_if *)ifp->if_carp;
 		CARP_LOCK(cif);
 		/* a vhid may be used by one instance per parent only */
 		TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
 			if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
 				CARP_UNLOCK(cif);
 				error = EINVAL;
 				goto cleanup;
 			}
 	}
 	sc->sc_ia = ia;
 	sc->sc_carpdev = ifp;
 
 	{ /* XXX prevent endless loop if already in queue */
 	struct carp_softc *vr, *after = NULL;
 	int myself = 0;
 	cif = (struct carp_if *)ifp->if_carp;
 
 	/* XXX: cif should not change, right? So we still hold the lock */
 	CARP_LOCK_ASSERT(cif);
 
 	TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
 		if (vr == sc)
 			myself = 1;
 		if (vr->sc_vhid < sc->sc_vhid)
 			after = vr;
 	}
 
 	if (!myself) {
 		/* We're trying to keep things in order */
 		if (after == NULL) {
 			TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
 		} else {
 			TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
 		}
 		cif->vhif_nvrs++;
 	}
 	}
 
 	sc->sc_naddrs++;
 	SC2IFP(sc)->if_flags |= IFF_UP;
 	if (own)
 		sc->sc_advskew = 0;
 	carp_sc_state_locked(sc);
 	carp_setrun(sc, 0);
 
 	CARP_UNLOCK(cif);
 
 	return (0);
 
 cleanup:
 	/*
 	 * Undo only what this call did.  A membership that was joined for
 	 * an earlier address must survive a failure to add another one.
 	 */
 	if (joined) {
 		in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
 		imo->imo_membership[imo->imo_num_memberships] = NULL;
 		imo->imo_multicast_ifp = NULL;
 	}
 	return (error);
 }
 
 static int
 carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
 {
 	int error = 0;
 
 	if (!--sc->sc_naddrs) {
 		struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 		struct ip_moptions *imo = &sc->sc_imo;
 
 		CARP_LOCK(cif);
 		callout_stop(&sc->sc_ad_tmo);
 		SC2IFP(sc)->if_flags &= ~IFF_UP;
 		SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		sc->sc_vhid = -1;
 		in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
 		imo->imo_multicast_ifp = NULL;
 		TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
 		if (!--cif->vhif_nvrs) {
 			sc->sc_carpdev->if_carp = NULL;
 			CARP_LOCK_DESTROY(cif);
 			FREE(cif, M_IFADDR);
 		} else {
 			CARP_UNLOCK(cif);
 		}
 	}
 
 	return (error);
 }
 
 #ifdef INET6
 static int
 carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
 {
 	struct ifnet *ifp;
 	struct carp_if *cif;
 	struct in6_ifaddr *ia, *ia_if;
 	struct ip6_moptions *im6o = &sc->sc_im6o;
 	struct in6_multi_mship *imm;
 	struct in6_addr in6;
 	int own, error;
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 		if (!(SC2IFP(sc)->if_flags & IFF_UP))
 			carp_set_state(sc, INIT);
 		if (sc->sc_naddrs6)
 			SC2IFP(sc)->if_flags |= IFF_UP;
 		carp_setrun(sc, 0);
 		return (0);
 	}
 
 	/* we have to do it by hands to check we won't match on us */
 	ia_if = NULL; own = 0;
 	for (ia = in6_ifaddr; ia; ia = ia->ia_next) {
 		int i;
 
 		for (i = 0; i < 4; i++) {
 			if ((sin6->sin6_addr.s6_addr32[i] &
 			    ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
 			    (ia->ia_addr.sin6_addr.s6_addr32[i] &
 			    ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
 				break;
 		}
 		/* and, yeah, we need a multicast-capable iface too */
 		if (ia->ia_ifp != SC2IFP(sc) &&
 		    (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
 		    (i == 4)) {
 			if (!ia_if)
 				ia_if = ia;
 			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
 			    &ia->ia_addr.sin6_addr))
 				own++;
 		}
 	}
 
 	if (!ia_if)
 		return (EADDRNOTAVAIL);
 	ia = ia_if;
 	ifp = ia->ia_ifp;
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
 	    (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp))
 		return (EADDRNOTAVAIL);
 
 	if (!sc->sc_naddrs6) {
 		im6o->im6o_multicast_ifp = ifp;
 
 		/* join CARP multicast address */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr8[15] = 0x12;
 		if (in6_setscope(&in6, ifp, NULL) != 0)
 			goto cleanup;
 		if ((imm = in6_joingroup(ifp, &in6, &error, 0)) == NULL)
 			goto cleanup;
 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
 
 		/* join solicited multicast address */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr32[1] = 0;
 		in6.s6_addr32[2] = htonl(1);
 		in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
 		in6.s6_addr8[12] = 0xff;
 		if (in6_setscope(&in6, ifp, NULL) != 0)
 			goto cleanup;
 		if ((imm = in6_joingroup(ifp, &in6, &error, 0)) == NULL)
 			goto cleanup;
 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
 	}
 
 	if (!ifp->if_carp) {
 		MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP,
 		    M_WAITOK|M_ZERO);
 		if (!cif) {
 			error = ENOBUFS;
 			goto cleanup;
 		}
 		if ((error = ifpromisc(ifp, 1))) {
 			FREE(cif, M_CARP);
 			goto cleanup;
 		}
 
 		CARP_LOCK_INIT(cif);
 		CARP_LOCK(cif);
 		cif->vhif_ifp = ifp;
 		TAILQ_INIT(&cif->vhif_vrs);
 		ifp->if_carp = cif;
 
 	} else {
 		struct carp_softc *vr;
 
 		cif = (struct carp_if *)ifp->if_carp;
 		CARP_LOCK(cif);
 		TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
 			if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
 				CARP_UNLOCK(cif);
 				error = EINVAL;
 				goto cleanup;
 			}
 	}
 	sc->sc_ia6 = ia;
 	sc->sc_carpdev = ifp;
 
 	{ /* XXX prevent endless loop if already in queue */
 	struct carp_softc *vr, *after = NULL;
 	int myself = 0;
 	cif = (struct carp_if *)ifp->if_carp;
 	CARP_LOCK_ASSERT(cif);
 
 	TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
 		if (vr == sc)
 			myself = 1;
 		if (vr->sc_vhid < sc->sc_vhid)
 			after = vr;
 	}
 
 	if (!myself) {
 		/* We're trying to keep things in order */
 		if (after == NULL) {
 			TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
 		} else {
 			TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
 		}
 		cif->vhif_nvrs++;
 	}
 	}
 
 	sc->sc_naddrs6++;
 	SC2IFP(sc)->if_flags |= IFF_UP;
 	if (own)
 		sc->sc_advskew = 0;
 	carp_sc_state_locked(sc);
 	carp_setrun(sc, 0);
 
 	CARP_UNLOCK(cif);
 
 	return (0);
 
 cleanup:
 	/* clean up multicast memberships */
 	if (!sc->sc_naddrs6) {
 		while (!LIST_EMPTY(&im6o->im6o_memberships)) {
 			imm = LIST_FIRST(&im6o->im6o_memberships);
 			LIST_REMOVE(imm, i6mm_chain);
 			in6_leavegroup(imm);
 		}
 	}
 	return (error);
 }
 
 static int
 carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
 {
 	int error = 0;
 
 	if (!--sc->sc_naddrs6) {
 		struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 		struct ip6_moptions *im6o = &sc->sc_im6o;
 
 		CARP_LOCK(cif);
 		callout_stop(&sc->sc_ad_tmo);
 		SC2IFP(sc)->if_flags &= ~IFF_UP;
 		SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		sc->sc_vhid = -1;
 		while (!LIST_EMPTY(&im6o->im6o_memberships)) {
 			struct in6_multi_mship *imm =
 			    LIST_FIRST(&im6o->im6o_memberships);
 
 			LIST_REMOVE(imm, i6mm_chain);
 			in6_leavegroup(imm);
 		}
 		im6o->im6o_multicast_ifp = NULL;
 		TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
 		if (!--cif->vhif_nvrs) {
 			CARP_LOCK_DESTROY(cif);
 			sc->sc_carpdev->if_carp = NULL;
 			FREE(cif, M_IFADDR);
 		} else
 			CARP_UNLOCK(cif);
 	}
 
 	return (error);
 }
 #endif /* INET6 */
 
 /*
  * ioctl handler for carp interfaces.
  *
  * SIOCSIFADDR and SIOCAIFADDR pass the address to carp_set_addr() or
  * carp_set_addr6(), SIOCDIFADDR to carp_del_addr() or carp_del_addr6().
  * SIOCSIFFLAGS reacts to IFF_UP being set or cleared.  SIOCSVH and
  * SIOCGVH set or report the carpreq parameters (state, vhid, advbase,
  * advskew, key).  Any other command fails with EINVAL.
  *
  * The softc lock is taken only for SIOCSIFFLAGS and SIOCSVH, and only
  * when a carp device is attached; "locked" records that so the lock is
  * released before returning.  carp_hmac_prepare() runs after every
  * command, whether or not it succeeded.
  */
 static int
 carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
 {
 	struct carp_softc *sc = ifp->if_softc, *vr;
 	struct carpreq carpr;
 	struct ifaddr *ifa;
 	struct ifreq *ifr;
 	struct ifaliasreq *ifra;
 	int locked = 0, error = 0;
 
 	/* the argument is viewed through whichever type the command uses */
 	ifa = (struct ifaddr *)addr;
 	ifra = (struct ifaliasreq *)addr;
 	ifr = (struct ifreq *)addr;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			SC2IFP(sc)->if_flags |= IFF_UP;
 			bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
 			    sizeof(struct sockaddr));
 			error = carp_set_addr(sc, satosin(ifa->ifa_addr));
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			SC2IFP(sc)->if_flags |= IFF_UP;
 			error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
 			break;
 #endif /* INET6 */
 		default:
 			error = EAFNOSUPPORT;
 			break;
 		}
 		break;
 
 	case SIOCAIFADDR:
 		/*
 		 * NOTE(review): the family is read through the ifaddr view
 		 * while the address comes from the ifaliasreq view - confirm
 		 * what the caller actually passes for this command.
 		 */
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			SC2IFP(sc)->if_flags |= IFF_UP;
 			bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
 			    sizeof(struct sockaddr));
 			error = carp_set_addr(sc, satosin(&ifra->ifra_addr));
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			SC2IFP(sc)->if_flags |= IFF_UP;
 			error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr));
 			break;
 #endif /* INET6 */
 		default:
 			error = EAFNOSUPPORT;
 			break;
 		}
 		break;
 
 	case SIOCDIFADDR:
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			error = carp_del_addr(sc, satosin(&ifra->ifra_addr));
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr));
 			break;
 #endif /* INET6 */
 		default:
 			error = EAFNOSUPPORT;
 			break;
 		}
 		break;
 
 	case SIOCSIFFLAGS:
 		if (sc->sc_carpdev) {
 			locked = 1;
 			CARP_SCLOCK(sc);
 		}
 		if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) {
 			/*
 			 * Going down: stop all timers; a master sends one
 			 * more advertisement before dropping to INIT.
 			 */
  			callout_stop(&sc->sc_ad_tmo);
  			callout_stop(&sc->sc_md_tmo);
  			callout_stop(&sc->sc_md6_tmo);
 			if (sc->sc_state == MASTER)
 				carp_send_ad_locked(sc);
 			carp_set_state(sc, INIT);
 			carp_setrun(sc, 0);
 		} else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) {
 			/* coming up from INIT */
 			SC2IFP(sc)->if_flags |= IFF_UP;
 			carp_setrun(sc, 0);
 		}
 		break;
 
 	case SIOCSVH:
 		error = priv_check(curthread, PRIV_NETINET_CARP);
 		if (error)
 			break;
 		if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
 			break;
 		/*
 		 * "error" doubles as a counter here: it starts at 1 and is
 		 * decremented for each group of settings accepted below.  If
 		 * it is still positive at the end, nothing was set and the
 		 * request fails with EINVAL.
 		 */
 		error = 1;
 		if (sc->sc_carpdev) {
 			locked = 1;
 			CARP_SCLOCK(sc);
 		}
 		/* an explicit state change is honoured only outside INIT */
 		if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
 			switch (carpr.carpr_state) {
 			case BACKUP:
 				callout_stop(&sc->sc_ad_tmo);
 				carp_set_state(sc, BACKUP);
 				carp_setrun(sc, 0);
 				carp_setroute(sc, RTM_DELETE);
 				break;
 			case MASTER:
 				carp_master_down_locked(sc);
 				break;
 			default:
 				break;
 			}
 		}
 		if (carpr.carpr_vhid > 0) {
 			if (carpr.carpr_vhid > 255) {
 				error = EINVAL;
 				break;
 			}
 			if (sc->sc_carpdev) {
 				struct carp_if *cif;
 				cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 				/* the vhid must be unique on the carp device */
 				TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
 					if (vr != sc &&
 					    vr->sc_vhid == carpr.carpr_vhid) {
 						error = EEXIST;
 						break;
 					}
 				/* the break above only left the list walk */
 				if (error == EEXIST)
 					break;
 			}
 			sc->sc_vhid = carpr.carpr_vhid;
 			/* link-level address 00:00:5e:00:01:<vhid> */
 			IF_LLADDR(sc->sc_ifp)[0] = 0;
 			IF_LLADDR(sc->sc_ifp)[1] = 0;
 			IF_LLADDR(sc->sc_ifp)[2] = 0x5e;
 			IF_LLADDR(sc->sc_ifp)[3] = 0;
 			IF_LLADDR(sc->sc_ifp)[4] = 1;
 			IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid;
 			error--;
 		}
 		if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) {
 			if (carpr.carpr_advskew >= 255) {
 				error = EINVAL;
 				break;
 			}
 			if (carpr.carpr_advbase > 255) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advbase = carpr.carpr_advbase;
 			sc->sc_advskew = carpr.carpr_advskew;
 			error--;
 		}
 		/* the key is copied even when nothing else was accepted */
 		bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
 		if (error > 0)
 			error = EINVAL;
 		else {
 			error = 0;
 			carp_setrun(sc, 0);
 		}
 		break;
 
 	case SIOCGVH:
 		/* XXX: lockless read */
 		bzero(&carpr, sizeof(carpr));
 		carpr.carpr_state = sc->sc_state;
 		carpr.carpr_vhid = sc->sc_vhid;
 		carpr.carpr_advbase = sc->sc_advbase;
 		carpr.carpr_advskew = sc->sc_advskew;
 		/*
 		 * The privilege check only decides whether the key is
 		 * included; its result is then replaced by copyout()'s.
 		 */
 		error = priv_check(curthread, PRIV_NETINET_CARP);
 		if (error == 0)
 			bcopy(sc->sc_key, carpr.carpr_key,
 			    sizeof(carpr.carpr_key));
 		error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
 		break;
 
 	default:
 		error = EINVAL;
 	}
 
 	if (locked)
 		CARP_SCUNLOCK(sc);
 
 	carp_hmac_prepare(sc);
 
 	return (error);
 }
 
 /*
  * XXX: this is looutput. We should eventually use it from there.
  */
 static int
 carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
     struct rtentry *rt)
 {
 	u_int32_t af;
 
 	M_ASSERTPKTHDR(m); /* check if we have the packet header */
 
 	if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 		m_freem(m);
 		return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
 			rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
 	}
 
 	ifp->if_opackets++;
 	ifp->if_obytes += m->m_pkthdr.len;
 
 	/* BPF writes need to be handled specially. */
 	if (dst->sa_family == AF_UNSPEC) {
 		bcopy(dst->sa_data, &af, sizeof(af));
 		dst->sa_family = af;
 	}
 
 #if 1	/* XXX */
 	switch (dst->sa_family) {
 	case AF_INET:
 	case AF_INET6:
 	case AF_IPX:
 	case AF_APPLETALK:
 		break;
 	default:
 		printf("carp_looutput: af=%d unexpected\n", dst->sa_family);
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 #endif
 	return(if_simloop(ifp, m, dst->sa_family, 0));
 }
 
 /*
  * Start output on carp interface. This function should never be called.
  */
 static void
 carp_start(struct ifnet *ifp)
 {
 	/* Not expected to run; a DEBUG kernel logs the call, nothing more. */
 #if defined(DEBUG)
 	printf("%s: start called\n", ifp->if_xname);
 #endif
 }
 
 int
 carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
     struct rtentry *rt)
 {
 	struct m_tag *mtag;
 	struct carp_softc *sc;
 	struct ifnet *carp_ifp;
 
 	if (!sa)
 		return (0);
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		break;
 #endif /* INET6 */
 	default:
 		return (0);
 	}
 
 	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
 	if (mtag == NULL)
 		return (0);
 
 	bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *));
 	sc = carp_ifp->if_softc;
 
 	/* Set the source MAC address to Virtual Router MAC Address */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN: {
 			struct ether_header *eh;
 
 			eh = mtod(m, struct ether_header *);
 			eh->ether_shost[0] = 0;
 			eh->ether_shost[1] = 0;
 			eh->ether_shost[2] = 0x5e;
 			eh->ether_shost[3] = 0;
 			eh->ether_shost[4] = 1;
 			eh->ether_shost[5] = sc->sc_vhid;
 		}
 		break;
 	case IFT_FDDI: {
 			struct fddi_header *fh;
 
 			fh = mtod(m, struct fddi_header *);
 			fh->fddi_shost[0] = 0;
 			fh->fddi_shost[1] = 0;
 			fh->fddi_shost[2] = 0x5e;
 			fh->fddi_shost[3] = 0;
 			fh->fddi_shost[4] = 1;
 			fh->fddi_shost[5] = sc->sc_vhid;
 		}
 		break;
 	case IFT_ISO88025: {
  			struct iso88025_header *th;
  			th = mtod(m, struct iso88025_header *);
 			th->iso88025_shost[0] = 3;
 			th->iso88025_shost[1] = 0;
 			th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
 			th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
 			th->iso88025_shost[4] = 0;
 			th->iso88025_shost[5] = 0;
 		}
 		break;
 	default:
 		printf("%s: carp is not supported for this interface type\n",
 		    ifp->if_xname);
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 static void
 carp_set_state(struct carp_softc *sc, int state)
 {
 
 	if (sc->sc_carpdev)
 		CARP_SCLOCK_ASSERT(sc);
 
 	if (sc->sc_state == state)
 		return;
 
 	sc->sc_state = state;
 	switch (state) {
 	case BACKUP:
 		SC2IFP(sc)->if_link_state = LINK_STATE_DOWN;
 		break;
 	case MASTER:
 		SC2IFP(sc)->if_link_state = LINK_STATE_UP;
 		break;
 	default:
 		SC2IFP(sc)->if_link_state = LINK_STATE_UNKNOWN;
 		break;
 	}
 	rt_ifmsg(SC2IFP(sc));
 }
 
 /*
  * Locking wrapper around carp_carpdev_state_locked(); v is the carp_if
  * to process.
  */
 void
 carp_carpdev_state(void *v)
 {
 	struct carp_if *cif;
 
 	cif = (struct carp_if *)v;
 	CARP_LOCK(cif);
 	carp_carpdev_state_locked(cif);
 	CARP_UNLOCK(cif);
 }
 
 static void
 carp_carpdev_state_locked(struct carp_if *cif)
 {
 	struct carp_softc *sc;
 
 	TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
 		carp_sc_state_locked(sc);
 }
 
 static void
 carp_sc_state_locked(struct carp_softc *sc)
 {
 	CARP_SCLOCK_ASSERT(sc);
 
 	if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    !(sc->sc_carpdev->if_flags & IFF_UP)) {
 		sc->sc_flags_backup = SC2IFP(sc)->if_flags;
 		SC2IFP(sc)->if_flags &= ~IFF_UP;
 		SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		callout_stop(&sc->sc_ad_tmo);
 		callout_stop(&sc->sc_md_tmo);
 		callout_stop(&sc->sc_md6_tmo);
 		carp_set_state(sc, INIT);
 		carp_setrun(sc, 0);
 		if (!sc->sc_suppress) {
 			carp_suppress_preempt++;
 			if (carp_suppress_preempt == 1) {
 				CARP_SCUNLOCK(sc);
 				carp_send_ad_all();
 				CARP_SCLOCK(sc);
 			}
 		}
 		sc->sc_suppress = 1;
 	} else {
 		SC2IFP(sc)->if_flags |= sc->sc_flags_backup;
 		carp_set_state(sc, INIT);
 		carp_setrun(sc, 0);
 		if (sc->sc_suppress)
 			carp_suppress_preempt--;
 		sc->sc_suppress = 0;
 	}
 
 	return;
 }
 
 /*
  * Module event handler.  MOD_LOAD registers the interface departure
  * handler, initializes carp_mtx and the carpif list and attaches the
  * cloner; MOD_UNLOAD undoes this.  Returns ENOMEM if the event handler
  * cannot be registered and EINVAL for any other event type.
  */
 static int
 carp_modevent(module_t mod, int type, void *data)
 {
 	if (type == MOD_LOAD) {
 		if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 		    carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 		if (if_detach_event_tag == NULL)
 			return (ENOMEM);
 		mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
 		LIST_INIT(&carpif_list);
 		if_clone_attach(&carp_cloner);
 	} else if (type == MOD_UNLOAD) {
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
 		if_clone_detach(&carp_cloner);
 		mtx_destroy(&carp_mtx);
 	} else {
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Module data handed to DECLARE_MODULE(): the module name "carp", the
  * event handler carp_modevent() and a zero third member.
  */
 static moduledata_t carp_mod = {
 	"carp",
 	carp_modevent,
 	0
 };
 
 /* Register the module at SI_SUB_PSEUDO with SI_ORDER_ANY. */
 DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet/ip_dummynet.c
===================================================================
--- head/sys/netinet/ip_dummynet.c	(revision 171636)
+++ head/sys/netinet/ip_dummynet.c	(revision 171637)
@@ -1,2206 +1,2206 @@
 /*-
  * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
  * Portions Copyright (c) 2000 Akamba Corp.
  * All rights reserved
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #define	DUMMYNET_DEBUG
 
 #include "opt_inet6.h"
 
 /*
  * This module implements IP dummynet, a bandwidth limiter/delay emulator
  * used in conjunction with the ipfw package.
  * Description of the data structures used is in ip_dummynet.h
  * Here you mainly find the following blocks of code:
  *  + variable declarations;
  *  + heap management functions;
  *  + scheduler and dummynet functions;
  *  + configuration and initialization.
  *
  * NOTA BENE: critical sections are protected by the "dummynet lock".
  *
  * Most important Changes:
  *
  * 011004: KLDable
  * 010124: Fixed WF2Q behaviour
  * 010122: Fixed spl protection.
  * 000601: WF2Q support
  * 000106: large rewrite, use heaps to handle very many pipes.
  * 980513:	initial release
  *
  * include files marked with XXX are probably not needed
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_dummynet.h>
 #include <netinet/ip_var.h>
 
 #include <netinet/if_ether.h> /* for struct arpcom */
 
 #include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
 #include <netinet6/ip6_var.h>
 
 /*
  * We keep a private variable for the simulation time, but we could
  * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
  */
 static dn_key curr_time = 0 ; /* current simulation time */
 
 static int dn_hash_size = 64 ;	/* default hash size */
 
 /* statistics on number of queue searches and search steps */
 static long searches, search_steps ;
 static int pipe_expire = 1 ;   /* expire queue if empty */
 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
 
 static int red_lookup_depth = 256;	/* RED - default lookup table depth */
 static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
 static int red_max_pkt_size = 1500;     /* RED - default max packet size */
 
 static struct timeval prev_t, t;
 static long tick_last;			/* Last tick duration (usec). */
 static long tick_delta;			/* Last vs standard tick diff (usec). */
 static long tick_delta_sum;		/* Accumulated tick difference (usec).*/
 static long tick_adjustment;		/* Tick adjustments done. */
 static long tick_lost;			/* Lost(coalesced) ticks number. */
 /* Adjusted vs non-adjusted curr_time difference (ticks). */
 static long tick_diff;
 
 /*
  * Three heaps contain queues and pipes that the scheduler handles:
  *
  * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
  *
  * wfq_ready_heap contains the pipes associated with WF2Q flows
  *
  * extract_heap contains pipes associated with delay lines.
  *
  */
 
 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
 
 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
 
 static int	heap_init(struct dn_heap *h, int size);
 static int	heap_insert (struct dn_heap *h, dn_key key1, void *p);
 static void	heap_extract(struct dn_heap *h, void *obj);
 static void	transmit_event(struct dn_pipe *pipe, struct mbuf **head,
 		    struct mbuf **tail);
 static void	ready_event(struct dn_flow_queue *q, struct mbuf **head,
 		    struct mbuf **tail);
 static void	ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
 		    struct mbuf **tail);
 
 #define	HASHSIZE	16
 #define	HASH(num)	((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
 static struct dn_pipe_head	pipehash[HASHSIZE];	/* all pipes */
 static struct dn_flow_set_head	flowsethash[HASHSIZE];	/* all flowsets */
 
 static struct callout dn_timeout;
 
 extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 #ifdef SYSCTL_NODE
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
     CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
     CTLFLAG_RD, &curr_time, 0, "Current tick");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
     CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
     CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
     CTLFLAG_RD, &searches, 0, "Number of queue searches");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
     CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
     CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
     CTLFLAG_RW, &dn_max_ratio, 0,
     "Max ratio between dynamic queues and buckets");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
     CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
     CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
     CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
     CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
     CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
     CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
     CTLFLAG_RD, &tick_diff, 0,
     "Adjusted vs non-adjusted curr_time difference (ticks).");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
     CTLFLAG_RD, &tick_lost, 0,
     "Number of ticks coalesced by dummynet taskqueue.");
 #endif
 
 #ifdef DUMMYNET_DEBUG
 /* Run-time switch for DPRINTF(); the default 0 keeps it silent. */
 int	dummynet_debug = 0;
 #ifdef SYSCTL_NODE
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
 	    0, "control debugging printfs");
 #endif
 /*
  * DPRINTF((fmt, ...)) - printf the arguments when dummynet_debug is set.
  * X is a complete parenthesized printf argument list.  The body is wrapped
  * in do { } while (0) so the macro behaves as a single statement and an
  * "else" following it binds to the caller's "if", not to the one in here.
  */
 #define	DPRINTF(X)	do { if (dummynet_debug) printf X; } while (0)
 #else
 #define	DPRINTF(X)	do { } while (0)
 #endif
 
 static struct task	dn_task;
 static struct taskqueue	*dn_tq = NULL;
 static void dummynet_task(void *, int);
 
 /*
  * The "dummynet lock": a default (MTX_DEF) mutex, together with the
  * macros used to initialize, destroy, take, release and assert it.
  */
 static struct mtx dummynet_mtx;
 #define	DUMMYNET_LOCK_INIT() \
 	mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
 #define	DUMMYNET_LOCK_DESTROY()	mtx_destroy(&dummynet_mtx)
 #define	DUMMYNET_LOCK()		mtx_lock(&dummynet_mtx)
 #define	DUMMYNET_UNLOCK()	mtx_unlock(&dummynet_mtx)
 /* Asserts that the mutex is owned and also runs NET_ASSERT_GIANT(). */
 #define	DUMMYNET_LOCK_ASSERT()	do {				\
 	mtx_assert(&dummynet_mtx, MA_OWNED);			\
 	NET_ASSERT_GIANT();					\
 } while (0)
 
 static int config_pipe(struct dn_pipe *p);
 static int ip_dn_ctl(struct sockopt *sopt);
 
 static void dummynet(void *);
 static void dummynet_flush(void);
 static void dummynet_send(struct mbuf *);
 void dummynet_drain(void);
 static ip_dn_io_t dummynet_io;
 static void dn_rule_delete(void *);
 
 /*
  * Heap management functions.
  *
  * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
  * Some macros help finding parent/children so we can optimize them.
  *
  * heap_init() is called to expand the heap when needed.
  * Increment size in blocks of 16 entries.
  * XXX failure to allocate a new element is a pretty bad failure
  * as we basically stall a whole queue forever!!
  * Returns 1 on error, 0 on success
  */
 /* Index arithmetic for a binary heap stored in an array, root at 0. */
 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
 #define HEAP_LEFT(x) ( 2*(x) + 1 )
 #define HEAP_IS_LEFT(x) ( (x) & 1 )
 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
 /*
  * Exchange a and b using buffer as scratch space.  Wrapped in
  * do { } while (0) with parenthesized arguments so that it is a single
  * statement and can be followed by "else".
  */
 #define	HEAP_SWAP(a, b, buffer) \
     do { (buffer) = (a) ; (a) = (b) ; (b) = (buffer) ; } while (0)
 /* Rounding mask used by heap_init(): sizes grow to a multiple of 16. */
 #define HEAP_INCREMENT	15
 
 /*
  * Grow the entry array of heap h so that it holds at least new_size
  * entries.  The size is rounded up with HEAP_INCREMENT and existing
  * entries are copied over.  A request that does not exceed the current
  * size is reported and treated as success.
  * Returns 0 on success, 1 if the allocation failed.
  */
 static int
 heap_init(struct dn_heap *h, int new_size)
 {
     struct dn_heap_entry *grown;
 
     if (new_size <= h->size) {
 	printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
 		h->size, new_size);
 	return 0 ;
     }
 
     new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
     grown = malloc(new_size * sizeof(*grown), M_DUMMYNET, M_NOWAIT);
     if (grown != NULL) {
 	if (h->size > 0) {
 	    /* carry the old entries over, then drop the old array */
 	    bcopy(h->p, grown, h->size * sizeof(*grown) );
 	    free(h->p, M_DUMMYNET);
 	}
 	h->p = grown ;
 	h->size = new_size ;
 	return 0 ;
     }
 
     printf("dummynet: %s, resize %d failed\n", __func__, new_size );
     return 1 ; /* error */
 }
 
 /*
  * Insert element in heap. Normally, p != NULL, we insert p in
  * a new position and bubble up. If p == NULL, then the element is
  * already in place, and key is the position where to start the
  * bubble-up.
  * Returns 1 on failure (cannot allocate new heap entry)
  *
  * If offset > 0 the position (index, int) of the element in the heap is
  * also stored in the element itself at the given offset in bytes.
  */
 /*
  * SET_OFFSET stores the index "node" inside the object held at that heap
  * position, at byte offset heap->offset; it does nothing if the heap has
  * no offset.  Written as do { } while (0) with parenthesized arguments so
  * that it is a single statement which needs (and tolerates) exactly one
  * trailing semicolon.
  */
 #define SET_OFFSET(heap, node) \
     do { \
 	if ((heap)->offset > 0) \
 	    *((int *)((char *)((heap)->p[node].object) + (heap)->offset)) = \
 		(node) ; \
     } while (0)
 /*
  * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
  */
 #define RESET_OFFSET(heap, node) \
     do { \
 	if ((heap)->offset > 0) \
 	    *((int *)((char *)((heap)->p[node].object) + (heap)->offset)) = \
 		-1 ; \
     } while (0)
 /*
  * Add object p with key key1 to heap h and restore heap order.  With
  * p == NULL nothing is added: key1 is taken as the index of an entry
  * already in the array and only the bubble-up from there is done.
  * Returns 0 on success, 1 if the heap could not be enlarged.
  */
 static int
 heap_insert(struct dn_heap *h, dn_key key1, void *p)
 {
     int pos, up ;
 
     if (p != NULL) {	/* append at the end, growing the array if full */
 	pos = h->elements ;
 	if (pos == h->size && heap_init(h, h->elements+1) )
 	    return 1 ; /* failure... */
 	h->p[pos].object = p ;
 	h->p[pos].key = key1 ;
 	h->elements++ ;
     } else {		/* entry already in place, key1 is its index */
 	pos = key1 ;
     }
 
     /* bubble up until the father's key is smaller */
     for ( ; pos > 0 ; pos = up ) {
 	struct dn_heap_entry scratch ;
 
 	up = HEAP_FATHER(pos) ;
 	if (DN_KEY_LT( h->p[up].key, h->p[pos].key ) )
 	    break ; /* found right position */
 	HEAP_SWAP(h->p[pos], h->p[up], scratch) ;
 	SET_OFFSET(h, pos);
     }
     SET_OFFSET(h, pos);
     return 0 ;
 }
 
 /*
  * remove top element from heap, or obj if obj != NULL
  */
 /*
  * Remove an entry from heap h: the top one when obj is NULL, otherwise
  * the entry whose index is stored inside obj at h->offset.  Extracting a
  * specific object from a heap without an offset, or finding an index out
  * of range, panics.  An empty heap only produces a warning.
  */
 static void
 heap_extract(struct dn_heap *h, void *obj)
 {
     int hole, kid ;
     int last = h->elements - 1 ;
 
     if (last < 0) {
 	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
 	return ;
     }
 
     if (obj == NULL) {
 	hole = 0 ; /* default: move up smallest child */
     } else { /* extract specific element, index is at offset */
 	if (h->offset <= 0)
 	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
 	hole = *((int *)((char *)obj + h->offset)) ;
 	if (hole < 0 || hole >= h->elements) {
 	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
 		hole, h->elements);
 	    panic("dummynet: heap_extract");
 	}
     }
     RESET_OFFSET(h, hole);
 
     /* walk down, pulling the smaller child up into the hole each time */
     for (kid = HEAP_LEFT(hole) ; kid <= last ; kid = HEAP_LEFT(kid)) {
 	if (kid != last && DN_KEY_LT(h->p[kid+1].key, h->p[kid].key) )
 	    kid = kid+1 ;		/* take right child, otherwise left */
 	h->p[hole] = h->p[kid] ;
 	SET_OFFSET(h, hole);
 	hole = kid ;
     }
     h->elements-- ;
     if (hole != last) {
 	/*
 	 * Fill hole with last entry and bubble up, reusing the insert code
 	 */
 	h->p[hole] = h->p[last] ;
 	heap_insert(h, hole, NULL); /* this one cannot fail */
     }
 }
 
 #if 0
 /*
  * change object position and update references
  * XXX this one is never used!
  *
  * Compiled out.  Gives "object" the key new_key and moves its entry up
  * or down the heap accordingly; the entry is located through the index
  * stored in the object at h->offset, so the heap must have an offset.
  */
 static void
 heap_move(struct dn_heap *h, dn_key new_key, void *object)
 {
     int temp;
     int i ;
     int max = h->elements-1 ;
     struct dn_heap_entry buf ;
 
     if (h->offset <= 0)
 	panic("cannot move items on this heap");
 
     /* current index of the object, as recorded by SET_OFFSET */
     i = *((int *)((char *)object + h->offset));
     if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
 	h->p[i].key = new_key ;
 	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
 		 i = temp ) { /* bubble up */
 	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
 	    SET_OFFSET(h, i);
 	}
     } else {		/* must move down */
 	h->p[i].key = new_key ;
 	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
 	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
 		temp++ ; /* select child with min key */
 	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
 		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
 		SET_OFFSET(h, i);
 	    } else
 		break ;
 	    i = temp ;
 	}
     }
     /* record the final position in the moved object */
     SET_OFFSET(h, i);
 }
 #endif /* heap_move, unused */
 
 /*
  * heapify() will reorganize data inside an array to maintain the
  * heap property. It is needed when we delete a bunch of entries.
  */
 /*
  * Rebuild heap order by bubbling up every entry in turn, first to last.
  */
 static void
 heapify(struct dn_heap *h)
 {
     int idx = 0 ;
 
     while (idx < h->elements) {
 	heap_insert(h, idx , NULL) ;
 	idx++ ;
     }
 }
 
 /*
  * cleanup the heap and free data structure
  */
 /*
  * Release the entry array of heap h, if one was allocated, and zero the
  * whole descriptor.
  */
 static void
 heap_free(struct dn_heap *h)
 {
     if (h->size > 0) {
 	free(h->p, M_DUMMYNET);
     }
     bzero(h, sizeof(*h) );
 }
 
 /*
  * --- end of heap management functions ---
  */
 
 /*
  * Return the mbuf tag holding the dummynet state.  As an optimization
  * this is assumed to be the first tag on the list.  If this turns out
  * wrong we'll need to search the list.
  */
 /*
  * Return the dummynet tag of mbuf m.  The first tag on the packet is
  * asserted to be the dummynet one; the dn_pkt_tag data follows the m_tag
  * header directly.
  */
 static struct dn_pkt_tag *
 dn_tag_get(struct mbuf *m)
 {
     struct m_tag *first;
 
     first = m_tag_first(m);
     KASSERT(first != NULL &&
 	    first->m_tag_cookie == MTAG_ABI_COMPAT &&
 	    first->m_tag_id == PACKET_TAG_DUMMYNET,
 	    ("packet on dummynet queue w/o dummynet tag!"));
     return (struct dn_pkt_tag *)&first[1];
 }
 
 /*
  * Scheduler functions:
  *
  * transmit_event() is called when the delay-line needs to enter
  * the scheduler, either because of existing pkts getting ready,
  * or new packets entering the queue. The event handled is the delivery
  * time of the packet.
  *
  * ready_event() does something similar with fixed-rate queues, and the
  * event handled is the finish time of the head pkt.
  *
  * wfq_ready_event() does something similar with WF2Q queues, and the
  * event handled is the start time of the head pkt.
  *
  * In all cases, we make sure that the data structures are consistent
  * before passing pkts out, because this might trigger recursive
  * invocations of the procedures.
  */
 /*
  * Move every packet at the front of the pipe's delay line whose
  * output_time has been reached onto the list described by *head/*tail,
  * then, if packets remain, schedule the pipe in extract_heap at the
  * output_time of the new first packet.
  */
 static void
 transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
 {
 	struct mbuf *m;
 	struct dn_pkt_tag *pkt;
 
 	DUMMYNET_LOCK_ASSERT();
 
 	for (m = pipe->head; m != NULL; m = pipe->head) {
 		pkt = dn_tag_get(m);
 		if (!DN_KEY_LEQ(pkt->output_time, curr_time))
 			break;
 
 		/* unlink from the pipe and append to the output list */
 		pipe->head = m->m_nextpkt;
 		if (*tail == NULL)
 			*head = m;
 		else
 			(*tail)->m_nextpkt = m;
 		*tail = m;
 	}
 	if (*tail != NULL)
 		(*tail)->m_nextpkt = NULL;
 
 	/* If there are leftover packets, put into the heap for next event. */
 	m = pipe->head;
 	if (m != NULL) {
 		pkt = dn_tag_get(m);
 		/*
 		 * XXX: Should check errors on heap_insert, by draining the
 		 * whole pipe p and hoping in the future we are more successful.
 		 */
 		heap_insert(&extract_heap, pkt->output_time, pipe);
 	}
 }
 
 /*
  * the following macro computes how many ticks we have to wait
  * before being able to transmit a packet. The credit is taken from
  * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
  */
 /*
  * Ticks to wait: (packet bits * hz - credit), divided by the bandwidth
  * and rounded up.  Every argument is parenthesized so that expressions
  * such as "&pipe" can be passed.  The trailing semicolon is kept on
  * purpose for compatibility with existing users, so the macro can only
  * appear as the last thing in a statement.
  */
 #define SET_TICKS(_m, q, p)	\
     (((_m)->m_pkthdr.len*8*hz - (q)->numbytes + (p)->bandwidth - 1 ) / \
 	    (p)->bandwidth) ;
 
 /*
  * extract pkt from queue, compute output time (could be now)
  * and put into delay line (p_queue)
  */
 /*
  * Take pkt, the first packet of flow queue q, off the queue, stamp it
  * with output time curr_time + p->delay and append it to the delay line
  * of pipe p.  len is subtracted from the queue's byte count.
  */
 static void
 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
     int len)
 {
     struct dn_pkt_tag *tag;
 
     tag = dn_tag_get(pkt);
 
     /* unlink from the flow queue and fix its accounting */
     q->head = pkt->m_nextpkt ;
     q->len_bytes -= len ;
     q->len-- ;
 
     tag->output_time = curr_time + p->delay ;
 
     /* append to the pipe's delay line */
     if (p->head != NULL)
 	p->tail->m_nextpkt = pkt;
     else
 	p->head = pkt;
     p->tail = pkt;
     pkt->m_nextpkt = NULL;
 }
 
 /*
  * ready_event() is invoked every time the queue must enter the
  * scheduler, either because the first packet arrives, or because
  * a previously scheduled event fired.
  * On invocation, drain as many pkts as possible (could be 0) and then
  * if there are leftover packets reinsert the pkt in the scheduler.
  */
 /*
  * Serve flow queue q: credit it with the bandwidth accumulated since it
  * was last scheduled, move as many packets as the credit allows into the
  * delay line of its pipe, and either reschedule the queue in ready_heap
  * or record that it became empty.  If the delay line was empty on entry,
  * transmit_event() is run on the pipe right away.
  */
 static void
 ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
 {
     struct mbuf *m;
     struct dn_pipe *p;
     int line_was_empty ;
 
     p = q->fs->pipe ;
 
     DUMMYNET_LOCK_ASSERT();
 
     if (p == NULL) {
 	printf("dummynet: ready_event- pipe is gone\n");
 	return ;
     }
     line_was_empty = (p->head == NULL) ;
 
     /*
      * Fixed-rate queue: add the credit earned since the last scheduling,
      * then drain packets while the credit covers them.  With
      * bandwidth == 0 (no limit) each packet costs 0, so the whole queue
      * is drained.
      */
     q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
     for (m = q->head ; m != NULL ; m = q->head) {
 	int len = m->m_pkthdr.len;
 	int cost ;
 
 	if (p->bandwidth)
 	    cost = len*8*hz ;
 	else
 	    cost = 0 ;
 	if (cost > q->numbytes )
 	    break ;
 	q->numbytes -= cost ;
 	move_pkt(m, q, p, len);
     }
 
     m = q->head ;
     if (m == NULL) {	/* RED needs to know when the queue becomes empty */
 	q->q_time = curr_time;
 	q->numbytes = 0;
     } else {
 	/*
 	 * Packets are left, which implies bandwidth != 0: remember when
 	 * we were scheduled and come back once the head packet is due.
 	 * XXX should check errors on heap_insert, and drain the whole
 	 * queue on error hoping next time we are luckier.
 	 */
 	dn_key ticks_left = SET_TICKS(m, q, p); /* ticks i have to wait */
 
 	q->sched_time = curr_time ;
 	heap_insert(&ready_heap, curr_time + ticks_left, (void *)q );
     }
 
     /*
      * If the delay line was empty call transmit_event() now.
      * Otherwise, the scheduler will take care of it.
      */
     if (line_was_empty)
 	transmit_event(p, head, tail);
 }
 
 /*
  * Called when we can transmit packets on WF2Q queues. Take pkts out of
  * the queues at their start time, and enqueue into the delay line.
  * Packets are drained until p->numbytes < 0. As long as
  * len_scaled >= p->numbytes, the packet goes into the delay line
  * with a deadline p->delay. For the last packet, if p->numbytes<0,
  * there is an additional delay.
  *
  * p		pipe whose scheduler_heap / not_eligible_heap are serviced.
  * head, tail	handed on unchanged to transmit_event(), which is invoked
  *		only if p's delay line was empty on entry.
  *
  * The dummynet lock must be held (DUMMYNET_LOCK_ASSERT).
  */
 static void
 ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
 {
     int p_was_empty = (p->head == NULL) ;
     struct dn_heap *sch = &(p->scheduler_heap);
     struct dn_heap *neh = &(p->not_eligible_heap) ;
 
     DUMMYNET_LOCK_ASSERT();
 
     /* An empty if_name means the pipe is not clocked by an interface. */
     if (p->if_name[0] == 0) /* tx clock is simulated */
 	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
     else { /* tx clock is for real, the ifq must be empty or this is a NOP */
 	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
 	    return ;
 	else {
 	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
 		p->pipe_nr, p->if_name));
 	}
     }
 
     /*
      * While we have backlogged traffic AND credit, we need to do
      * something on the queue.
      */
     while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
 	if (sch->elements > 0) { /* have some eligible pkts to send out */
 	    struct dn_flow_queue *q = sch->p[0].object ;
 	    struct mbuf *pkt = q->head;
 	    struct dn_flow_set *fs = q->fs;
 	    u_int64_t len = pkt->m_pkthdr.len;
 	    /* Cost of the packet in credit units; 0 when bandwidth is 0. */
 	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;
 
 	    heap_extract(sch, NULL); /* remove queue from heap */
 	    p->numbytes -= len_scaled ;
 	    move_pkt(pkt, q, p, len);
 
 	    p->V += (len<<MY_M) / p->sum ; /* update V */
 	    q->S = q->F ; /* update start time */
 	    if (q->len == 0) { /* Flow not backlogged any more */
 		fs->backlogged-- ;
 		heap_insert(&(p->idle_heap), q->F, q);
 	    } else { /* still backlogged */
 		/*
 		 * update F and position in backlogged queue, then
 		 * put flow in not_eligible_heap (we will fix this later).
 		 */
 		len = (q->head)->m_pkthdr.len;
 		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
 		/*
 		 * NOTE(review): a flow with S <= V goes to neh here and is
 		 * moved to sch by the loop further down; a flow with S > V
 		 * is inserted straight into sch. That looks inverted with
 		 * respect to the "S_i <= V for all flows in sch" invariant
 		 * stated below -- confirm before relying on it.
 		 */
 		if (DN_KEY_LEQ(q->S, p->V))
 		    heap_insert(neh, q->S, q);
 		else
 		    heap_insert(sch, q->F, q);
 	    }
 	}
 	/*
 	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
 	 * have by definition S_i <= V so if sch is not empty, V is surely
 	 * the max and we must not update it. Conversely, if sch is empty
 	 * we only need to look at neh.
 	 */
 	if (sch->elements == 0 && neh->elements > 0)
 	    p->V = MAX64 ( p->V, neh->p[0].key );
 	/* move from neh to sch any packets that have become eligible */
 	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
 	    struct dn_flow_queue *q = neh->p[0].object ;
 	    heap_extract(neh, NULL);
 	    heap_insert(sch, q->F, q);
 	}
 
 	/* With a real tx clock, at most one iteration is done per call. */
 	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
 	    p->numbytes = -1 ; /* mark not ready for I/O */
 	    break ;
 	}
     }
     if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
 	    && p->idle_heap.elements > 0) {
 	/*
 	 * no traffic and no events scheduled. We can get rid of idle-heap.
 	 */
 	int i ;
 
 	for (i = 0 ; i < p->idle_heap.elements ; i++) {
 	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;
 
 	    /* S == F + 1 marks the timestamps of the flow as invalid. */
 	    q->F = 0 ;
 	    q->S = q->F + 1 ;
 	}
 	p->sum = 0 ;
 	p->V = 0 ;
 	p->idle_heap.elements = 0 ;
     }
     /*
      * If we are getting clocks from dummynet (not a real interface) and
      * If we are under credit, schedule the next ready event.
      * Also fix the delivery time of the last packet.
      */
     if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
 	dn_key t=0 ; /* number of ticks i have to wait */
 
 	/* Ticks needed to bring numbytes back to >= 0, rounded up. */
 	if (p->bandwidth > 0)
 	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
 	dn_tag_get(p->tail)->output_time += t ;
 	p->sched_time = curr_time ;
 	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
 	/* XXX should check errors on heap_insert, and drain the whole
 	 * queue on error hoping next time we are luckier.
 	 */
     }
     /*
      * If the delay line was empty call transmit_event() now.
      * Otherwise, the scheduler will take care of it.
      */
     if (p_was_empty)
 	transmit_event(p, head, tail);
 }
 
 /*
  * This is called one tick after the previous run (dummynet_task() re-arms
  * the dn_timeout callout with this function and a delay of 1). It does no
  * packet processing itself: it only enqueues dn_task on the dn_tq
  * taskqueue to schedule the next run.
  * The argument is unused.
  */
 static void
 dummynet(void * __unused unused)
 {
 
 	taskqueue_enqueue(dn_tq, &dn_task);
 }
 
 /*
  * The main dummynet processing function.
  *
  * One run: update the tick statistics and advance curr_time, fire every
  * event in the three global heaps whose key has been reached, expire at
  * most one idle flow queue per pipe, send the packets collected on the
  * local head/tail list and finally re-arm the callout for the next tick.
  *
  * context	not used by this function.
  * pending	used as the number of ticks coalesced into this run
  *		(pending - 1 is added to tick_lost).
  *
  * t, prev_t and the tick_* counters are not declared in this function;
  * presumably they are file-scope variables -- verify against the rest of
  * the file.
  */
 static void
 dummynet_task(void *context, int pending)
 {
 	struct mbuf *head = NULL, *tail = NULL;
 	struct dn_pipe *pipe;
 	struct dn_heap *heaps[3];
 	struct dn_heap *h;
 	void *p;	/* generic parameter to handler */
 	int i;
 
 	NET_LOCK_GIANT();
 	DUMMYNET_LOCK();
 
 	heaps[0] = &ready_heap;			/* fixed-rate queues */
 	heaps[1] = &wfq_ready_heap;		/* wfq queues */
 	heaps[2] = &extract_heap;		/* delay line */
 
  	/* Update number of lost(coalesced) ticks. */
  	tick_lost += pending - 1;
  
  	getmicrouptime(&t);
  	/* Last tick duration (usec). */
  	tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
  	    (t.tv_usec - prev_t.tv_usec);
  	/* Last tick vs standard tick difference (usec). */
  	tick_delta = (tick_last * hz - 1000000) / hz;
  	/* Accumulated tick difference (usec). */
  	tick_delta_sum += tick_delta;
  
  	prev_t = t;
  
  	/*
  	 * Adjust curr_time if accumulated tick difference greater than
  	 * 'standard' tick. Since curr_time should be monotonically increasing,
  	 * we do positive adjustment as required and throttle curr_time in
  	 * case of negative adjustment.
  	 */
   	curr_time++;
  	if (tick_delta_sum - tick >= 0) {
  		/* We are behind by at least one tick: jump forward. */
  		int diff = tick_delta_sum / tick;
  
  		curr_time += diff;
  		tick_diff += diff;
  		tick_delta_sum %= tick;
  		tick_adjustment++;
  	} else if (tick_delta_sum + tick <= 0) {
  		/* We are ahead: undo the increment above for this run. */
  		curr_time--;
  		tick_diff--;
  		tick_delta_sum += tick;
  		tick_adjustment++;
  	}
 
 	/* Fire all due events, heap by heap in the order set up above. */
 	for (i = 0; i < 3; i++) {
 		h = heaps[i];
 		while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
 			/*
 			 * NOTE(review): a plain '>' test inside a loop
 			 * guarded by DN_KEY_LEQ on the same operands;
 			 * whether it can ever be true depends on how
 			 * DN_KEY_LEQ is defined (not visible here).
 			 */
 			if (h->p[0].key > curr_time)
 				printf("dummynet: warning, "
 				    "heap %d is %d ticks late\n",
 				    i, (int)(curr_time - h->p[0].key));
 			/* store a copy before heap_extract */
 			p = h->p[0].object;
 			/* need to extract before processing */
 			heap_extract(h, NULL);
 			if (i == 0)
 				ready_event(p, &head, &tail);
 			else if (i == 1) {
 				/* This local shadows the outer 'pipe'. */
 				struct dn_pipe *pipe = p;
 				if (pipe->if_name[0] != '\0')
 					printf("dummynet: bad ready_event_wfq "
 					    "for pipe %s\n", pipe->if_name);
 				else
 					ready_event_wfq(p, &head, &tail);
 			} else
 				transmit_event(p, &head, &tail);
 		}
 	}
 
 	/* Sweep pipes trying to expire idle flow_queues. */
 	for (i = 0; i < HASHSIZE; i++)
 		SLIST_FOREACH(pipe, &pipehash[i], next)
 			/* Only the top of each idle_heap is examined. */
 			if (pipe->idle_heap.elements > 0 &&
 			    DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
 				struct dn_flow_queue *q =
 				    pipe->idle_heap.p[0].object;
 
 				heap_extract(&(pipe->idle_heap), NULL);
 				/* Mark timestamp as invalid. */
 				q->S = q->F + 1;
 				pipe->sum -= q->fs->weight;
 			}
 
 	DUMMYNET_UNLOCK();
 
 	/* Packets are sent only after the dummynet lock is released. */
 	if (head != NULL)
 		dummynet_send(head);
 
 	/* Re-arm the callout so that dummynet() runs again in one tick. */
 	callout_reset(&dn_timeout, 1, dummynet, NULL);
 
 	NET_UNLOCK_GIANT();
 }
 
 /*
  * Hand every packet of the list starting at m (linked through m_nextpkt)
  * to its destination. Each packet is unlinked from the list first, then
  * dispatched according to the dn_dir field of its dummynet tag.
  *
  * Every packet is consumed: it is either passed to the handler selected
  * by dn_dir or freed here. In particular a DN_TO_IFB_FWD packet is freed
  * when the bridge hook is not available, instead of being leaked.
  */
 static void
 dummynet_send(struct mbuf *m)
 {
 	struct dn_pkt_tag *pkt;
 	struct mbuf *n;
 	struct ip *ip;
 
 	for (; m != NULL; m = n) {
 		/* Save the successor: m may be gone after the dispatch. */
 		n = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		pkt = dn_tag_get(m);
 		switch (pkt->dn_dir) {
 		case DN_TO_IP_OUT:
 			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
 			break;
 		case DN_TO_IP_IN:
 			/* Convert ip_len and ip_off with htons() first. */
 			ip = mtod(m, struct ip *);
 			ip->ip_len = htons(ip->ip_len);
 			ip->ip_off = htons(ip->ip_off);
 			netisr_dispatch(NETISR_IP, m);
 			break;
 #ifdef INET6
 		case DN_TO_IP6_IN:
 			netisr_dispatch(NETISR_IPV6, m);
 			break;
 
 		case DN_TO_IP6_OUT:
 			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
 			break;
 #endif
 		case DN_TO_IFB_FWD:
 			if (bridge_dn_p != NULL)
 				((*bridge_dn_p)(m, pkt->ifp));
 			else {
 				printf("dummynet: if_bridge not loaded\n");
 				/* Nobody takes the packet: free it. */
 				m_freem(m);
 			}
 			break;
 		case DN_TO_ETH_DEMUX:
 			/*
 			 * The Ethernet code assumes the Ethernet header is
 			 * contiguous in the first mbuf header.
 			 * Insure this is true.
 			 * On failure m is NULL here, so there is nothing
 			 * left for us to free (see m_pullup in mbuf(9)).
 			 */
 			if (m->m_len < ETHER_HDR_LEN &&
 			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
 				printf("dummynet/ether: pullup failed, "
 				    "dropping packet\n");
 				break;
 			}
 			ether_demux(m->m_pkthdr.rcvif, m);
 			break;
 		case DN_TO_ETH_OUT:
 			ether_output_frame(pkt->ifp, m);
 			break;
 		default:
 			printf("dummynet: bad switch %d!\n", pkt->dn_dir);
 			m_freem(m);
 			break;
 		}
 	}
 }
 
 /*
  * Free every idle flow queue of fs, regardless of any expiry setting;
  * used when we are short of queues. A queue is idle when it holds no
  * packets and its timestamps are marked invalid (S == F + 1).
  * The scan is skipped (returning 0) if it already ran for the current
  * value of time_uptime.
  * Returns the number of queues freed.
  */
 static int
 expire_queues(struct dn_flow_set *fs)
 {
     struct dn_flow_queue **link, *cur;
     int slot;
     int before = fs->rq_elements;
 
     if (fs->last_expired == time_uptime)
 	return 0;
     fs->last_expired = time_uptime;
 
     /* Slot rq_size is the overflow bucket, hence "<=". */
     for (slot = 0; slot <= fs->rq_size; slot++) {
 	link = &fs->rq[slot];
 	while ((cur = *link) != NULL) {
 	    if (cur->head != NULL || cur->S != cur->F + 1) {
 		/* Still in use, step over it. */
 		link = &cur->next;
 		continue;
 	    }
 	    /* Idle entry: unlink it from the chain and release it. */
 	    *link = cur->next;
 	    fs->rq_elements--;
 	    free(cur, M_DUMMYNET);
 	}
     }
     return before - fs->rq_elements;
 }
 
 /*
  * Allocate a new flow queue for fs and link it at the head of slot i.
  * When the flow set already holds more than rq_size * dn_max_ratio
  * queues and expire_queues() could not free any, slot rq_size (the
  * overflow slot) is used instead: its existing queue is returned if
  * there is one, otherwise the new queue is created there.
  * Returns NULL if the allocation fails.
  */
 static struct dn_flow_queue *
 create_queue(struct dn_flow_set *fs, int i)
 {
     struct dn_flow_queue *nq;
 
     if (fs->rq_elements > fs->rq_size * dn_max_ratio) {
 	if (expire_queues(fs) == 0) {
 	    /* No way to get room: fall back to the overflow queue. */
 	    i = fs->rq_size;
 	    if (fs->rq[i] != NULL)
 		return fs->rq[i];
 	}
     }
 
     nq = malloc(sizeof(*nq), M_DUMMYNET, M_NOWAIT | M_ZERO);
     if (nq == NULL) {
 	printf("dummynet: sorry, cannot allocate queue for new flow\n");
 	return NULL;
     }
 
     nq->fs = fs;
     nq->hash_slot = i;
     /* Hack: S == F + 1 marks the timestamps as invalid. */
     nq->S = nq->F + 1;
 
     /* Push onto the head of the slot's chain. */
     nq->next = fs->rq[i];
     fs->rq[i] = nq;
     fs->rq_elements++;
     return nq;
 }
 
 /*
  * Given a flow_set and a flow id, find the matching queue after
  * appropriate masking. The queue is moved to front of its hash chain
  * so that further searches take less time. If no queue matches, a new
  * one is obtained from create_queue() and tagged with the (masked) id.
  *
  * fs	flow set to search.
  * id	flow id of the packet. When fs has DN_HAVE_FLOW_MASK set, *id is
  *	modified in place: it is ANDed with the flow mask and its flags
  *	are cleared.
  *
  * While walking a chain, idle entries (no packets, S == F + 1) are
  * freed if pipe_expire is set.
  *
  * Returns the queue, or NULL if a new queue was needed and could not be
  * created.
  */
 static struct dn_flow_queue *
 find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
 {
     int i = 0 ; /* we need i and q for new allocations */
     struct dn_flow_queue *q, *prev;
     int is_v6 = IS_IP6_FLOW_ID(id);
 
     if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
 	q = fs->rq[0] ;
     else {
 	/* first, do the masking, then hash */
 	id->dst_port &= fs->flow_mask.dst_port ;
 	id->src_port &= fs->flow_mask.src_port ;
 	id->proto &= fs->flow_mask.proto ;
 	id->flags = 0 ; /* we don't care about this one */
 	if (is_v6) {
 	    APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
 	    APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
 	    id->flow_id6 &= fs->flow_mask.flow_id6;
 
 	    i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
 		((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
 		((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
 		((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
 
 		((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
 		((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
 		((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
 		((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
 
 		((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
 		((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
 		((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
 		((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
 
 		/*
 		 * Fold in the upper half of each source word. This must
 		 * be a right shift (as in the IPv4 branch below): with a
 		 * left shift by 16 the low 16 bits are always zero and
 		 * the term would contribute nothing to the hash.
 		 */
 		((id->src_ip6.__u6_addr.__u6_addr32[0] >> 16) & 0xffff)^
 		((id->src_ip6.__u6_addr.__u6_addr32[1] >> 16) & 0xffff)^
 		((id->src_ip6.__u6_addr.__u6_addr32[2] >> 16) & 0xffff)^
 		((id->src_ip6.__u6_addr.__u6_addr32[3] >> 16) & 0xffff)^
 
 		(id->dst_port << 1) ^ (id->src_port) ^
 		(id->proto ) ^
 		(id->flow_id6);
 	} else {
 	    id->dst_ip &= fs->flow_mask.dst_ip ;
 	    id->src_ip &= fs->flow_mask.src_ip ;
 
 	    i = ( (id->dst_ip) & 0xffff ) ^
 		( (id->dst_ip >> 15) & 0xffff ) ^
 		( (id->src_ip << 1) & 0xffff ) ^
 		( (id->src_ip >> 16 ) & 0xffff ) ^
 		(id->dst_port << 1) ^ (id->src_port) ^
 		(id->proto );
 	}
 	i = i % fs->rq_size ;
 	/* finally, scan the current list for a match */
 	searches++ ;
 	for (prev=NULL, q = fs->rq[i] ; q ; ) {
 	    search_steps++;
 	    if (is_v6 &&
 		    IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&  
 		    IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&  
 		    id->dst_port == q->id.dst_port &&
 		    id->src_port == q->id.src_port &&
 		    id->proto == q->id.proto &&
 		    id->flags == q->id.flags &&
 		    id->flow_id6 == q->id.flow_id6)
 		break ; /* found */
 
 	    if (!is_v6 && id->dst_ip == q->id.dst_ip &&
 		    id->src_ip == q->id.src_ip &&
 		    id->dst_port == q->id.dst_port &&
 		    id->src_port == q->id.src_port &&
 		    id->proto == q->id.proto &&
 		    id->flags == q->id.flags)
 		break ; /* found */
 
 	    /* No match. Check if we can expire the entry */
 	    if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
 		/* entry is idle and not in any heap, expire it */
 		struct dn_flow_queue *old_q = q ;
 
 		if (prev != NULL)
 		    prev->next = q = q->next ;
 		else
 		    fs->rq[i] = q = q->next ;
 		fs->rq_elements-- ;
 		free(old_q, M_DUMMYNET);
 		continue ;
 	    }
 	    prev = q ;
 	    q = q->next ;
 	}
 	if (q && prev != NULL) { /* found and not in front */
 	    prev->next = q->next ;
 	    q->next = fs->rq[i] ;
 	    fs->rq[i] = q ;
 	}
     }
     if (q == NULL) { /* no match, need to allocate a new entry */
 	q = create_queue(fs, i);
 	if (q != NULL)
 	    q->id = *id ;
     }
     return q ;
 }
 
 /*
  * Decide whether RED (or Gentle-RED) drops a packet arriving on queue q.
  *
  * fs	flow set holding the RED parameters (thresholds, weights, lookup
  *	table, flags).
  * q	flow queue; its avg, count and random fields are updated.
  * len	length of the arriving packet; only used when the queue size is
  *	measured in bytes (DN_QSIZE_IS_BYTES).
  *
  * Returns 1 if the packet must be dropped, 0 if it is accepted.
  */
 static int
 red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
 {
 	/*
 	 * RED algorithm
 	 *
 	 * RED calculates the average queue size (avg) using a low-pass filter
 	 * with an exponential weighted (w_q) moving average:
 	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
 	 * where q_size is the queue length (measured in bytes or packets).
 	 *
 	 * If q_size == 0, we compute the idle time for the link, and set
 	 *	avg = (1 - w_q)^(idle/s)
 	 * where s is the time needed for transmitting a medium-sized packet.
 	 *
 	 * Now, if avg < min_th the packet is enqueued.
 	 * If avg > max_th the packet is dropped. Otherwise, the packet is
 	 * dropped with probability P function of avg.
 	 */
 
 	int64_t p_b = 0;
 
 	/* Queue in bytes or packets? */
 	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
 	    q->len_bytes : q->len;
 
 	DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
 
 	/* Average queue size estimation. */
 	if (q_size != 0) {
 		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
 		int diff = SCALE(q_size) - q->avg;
 		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
 
 		q->avg += (int)v;
 	} else {
 		/*
 		 * Queue is empty, find for how long the queue has been
 		 * empty and use a lookup table for computing
 		 * (1 - w_q)^(idle_time/s) where s is the time to send a
 		 * (small) packet.
 		 * XXX check wraps...
 		 */
 		if (q->avg) {
 			/* Idle time expressed in lookup-table steps. */
 			u_int t = (curr_time - q->q_time) / fs->lookup_step;
 
 			/* Beyond the table the average decays to 0. */
 			q->avg = (t < fs->lookup_depth) ?
 			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
 		}
 	}
 	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
 
 	/* Should i drop? */
 	if (q->avg < fs->min_th) {
 		/* count == -1: the next packet reaching the probabilistic
 		 * test below increments it to 0 and draws a new random. */
 		q->count = -1;
 		return (0);	/* accept packet */
 	}
 	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
 		if (fs->flags_fs & DN_IS_GENTLE_RED) {
 			/*
 			 * According to Gentle-RED, if avg is greater than
 			 * max_th the packet is dropped with a probability
 			 *	 p_b = c_3 * avg - c_4
 			 * where c_3 = (1 - max_p) / max_th
 			 *       c_4 = 1 - 2 * max_p
 			 */
 			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
 			    fs->c_4;
 		} else {
 			q->count = -1;
 			DPRINTF(("dummynet: - drop"));
 			return (1);
 		}
 	} else if (q->avg > fs->min_th) {
 		/*
 		 * We compute p_b using the linear dropping function
 		 *	 p_b = c_1 * avg - c_2
 		 * where c_1 = max_p / (max_th - min_th)
 		 * 	 c_2 = max_p * min_th / (max_th - min_th)
 		 */
 		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
 	}
 
 	/* In byte mode, scale the probability by len / max_pkt_size. */
 	if (fs->flags_fs & DN_QSIZE_IS_BYTES)
 		p_b = (p_b * len) / fs->max_pkt_size;
 	if (++q->count == 0)
 		q->random = random() & 0xffff;
 	else {
 		/*
 		 * q->count counts packets arrived since last drop, so a greater
 		 * value of q->count means a greater packet drop probability.
 		 */
 		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
 			q->count = 0;
 			DPRINTF(("dummynet: - red drop"));
 			/* After a drop we calculate a new random value. */
 			q->random = random() & 0xffff;
 			return (1);	/* drop */
 		}
 	}
 	/* End of RED algorithm. */
 
 	return (0);	/* accept */
 }
 
 static __inline struct dn_flow_set *
 locate_flowset(int fs_nr)
 {
 	struct dn_flow_set *fs;
 
 	SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
 		if (fs->fs_nr == fs_nr)
 			return (fs);
 
 	return (NULL);
 }
 
 static __inline struct dn_pipe *
 locate_pipe(int pipe_nr)
 {
 	struct dn_pipe *pipe;
 
 	SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
 		if (pipe->pipe_nr == pipe_nr)
 			return (pipe);
 
 	return (NULL);
 }
 
 /*
  * dummynet hook for packets. Below 'pipe' is a pipe or a queue
  * depending on whether WF2Q or fixed bw is used.
  *
  * m		the mbuf with the packet; it must be a single packet
  *		(m_nextpkt == NULL is asserted).
  * dir		where shall we send the packet after dummynet; stored in
  *		the packet tag and later switched on by dummynet_send().
  * fwa		ipfw arguments. Used here: fwa->rule (matching rule, whose
  *		action decides between pipe and queue), fwa->cookie (number
  *		of the pipe or queue the packet is destined for), fwa->f_id
  *		(flow id used to select the flow queue) and fwa->oif
  *		(interface recorded in the packet tag).
  *
  * Returns 0 if the packet was queued, or if it was dropped on a flow set
  * that has DN_NOERROR set; ENOBUFS if it was dropped otherwise. The mbuf
  * is consumed in every case (queued or freed).
  */
 static int
 dummynet_io(struct mbuf *m, int dir, struct ip_fw_args *fwa)
 {
     struct mbuf *head = NULL, *tail = NULL;
     struct dn_pkt_tag *pkt;
     struct m_tag *mtag;
     struct dn_flow_set *fs = NULL;
     struct dn_pipe *pipe ;
     u_int64_t len = m->m_pkthdr.len ;
     struct dn_flow_queue *q = NULL ;
     int is_pipe;
     ipfw_insn *cmd = ACTION_PTR(fwa->rule);
 
     KASSERT(m->m_nextpkt == NULL,
 	("dummynet_io: mbuf queue passed to dummynet"));
 
     /* Skip optional O_LOG, O_ALTQ and O_TAG opcodes before the action. */
     if (cmd->opcode == O_LOG)
 	cmd += F_LEN(cmd);
     if (cmd->opcode == O_ALTQ)
 	cmd += F_LEN(cmd);
     if (cmd->opcode == O_TAG)
 	cmd += F_LEN(cmd);
     is_pipe = (cmd->opcode == O_PIPE);
 
     DUMMYNET_LOCK();
     /*
      * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
      *
      * XXXGL: probably the pipe->fs and fs->pipe logic here
      * below can be simplified.
      */
     if (is_pipe) {
 	pipe = locate_pipe(fwa->cookie);
 	if (pipe != NULL)
 		fs = &(pipe->fs);
     } else
 	fs = locate_flowset(fwa->cookie);
 
     if (fs == NULL)
 	goto dropit;	/* This queue/pipe does not exist! */
     pipe = fs->pipe;
     if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */
 	pipe = locate_pipe(fs->parent_nr);
 	if (pipe != NULL)
 	    fs->pipe = pipe;	/* cache the parent pipe in the flow set */
 	else {
 	    printf("dummynet: no pipe %d for queue %d, drop pkt\n",
 		fs->parent_nr, fs->fs_nr);
 	    goto dropit ;
 	}
     }
     q = find_queue(fs, &(fwa->f_id));
     if ( q == NULL )
 	goto dropit ;		/* cannot allocate queue		*/
     /*
      * update statistics, then check reasons to drop pkt
      */
     q->tot_bytes += len ;
     q->tot_pkts++ ;
     if ( fs->plr && random() < fs->plr )
 	goto dropit ;		/* random pkt drop			*/
     if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
     	if (q->len_bytes > fs->qsize)
 	    goto dropit ;	/* queue size overflow			*/
     } else {
 	if (q->len >= fs->qsize)
 	    goto dropit ;	/* queue count overflow			*/
     }
     if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
 	goto dropit ;
 
     /* XXX expensive to zero, see if we can remove it*/
     mtag = m_tag_get(PACKET_TAG_DUMMYNET,
 		sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO);
     if ( mtag == NULL )
 	goto dropit ;		/* cannot allocate packet header	*/
     m_tag_prepend(m, mtag);	/* attach to mbuf chain */
 
     /* The dn_pkt_tag payload is located right after the m_tag header. */
     pkt = (struct dn_pkt_tag *)(mtag+1);
     /* ok, i can handle the pkt now... */
     /* build and enqueue packet + parameters */
     pkt->rule = fwa->rule ;
     pkt->dn_dir = dir ;
 
     pkt->ifp = fwa->oif;
 
     /* Append the packet to the tail of the flow queue. */
     if (q->head == NULL)
 	q->head = m;
     else
 	q->tail->m_nextpkt = m;
     q->tail = m;
     q->len++;
     q->len_bytes += len ;
 
     if ( q->head != m )		/* flow was not idle, we are done */
 	goto done;
     /*
      * If we reach this point the flow was previously idle, so we need
      * to schedule it. This involves different actions for fixed-rate or
      * WF2Q queues.
      */
     if (is_pipe) {
 	/*
 	 * Fixed-rate queue: just insert into the ready_heap.
 	 */
 	dn_key t = 0 ;
 	if (pipe->bandwidth)
 	    t = SET_TICKS(m, q, pipe);
 	q->sched_time = curr_time ;
 	if (t == 0)	/* must process it now */
 	    ready_event(q, &head, &tail);
 	else
 	    heap_insert(&ready_heap, curr_time + t , q );
     } else {
 	/*
 	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
 	 * set S to the virtual time V for the controlling pipe, and update
 	 * the sum of weights for the pipe; otherwise, remove flow from
 	 * idle_heap and set S to max(F,V).
 	 * Second, compute finish time F = S + len/weight.
 	 * Third, if pipe was idle, update V=max(S, V).
 	 * Fourth, count one more backlogged flow.
 	 */
 	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
 	    q->S = pipe->V ;
 	    pipe->sum += fs->weight ; /* add weight of new queue */
 	} else {
 	    heap_extract(&(pipe->idle_heap), q);
 	    q->S = MAX64(q->F, pipe->V ) ;
 	}
 	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
 
 	if (pipe->not_eligible_heap.elements == 0 &&
 		pipe->scheduler_heap.elements == 0)
 	    pipe->V = MAX64 ( q->S, pipe->V );
 	fs->backlogged++ ;
 	/*
 	 * Look at eligibility. A flow is not eligibile if S>V (when
 	 * this happens, it means that there is some other flow already
 	 * scheduled for the same pipe, so the scheduler_heap cannot be
 	 * empty). If the flow is not eligible we just store it in the
 	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
 	 * and possibly invoke ready_event_wfq() right now if there is
 	 * leftover credit.
 	 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
 	 * and for all flows in not_eligible_heap (NEH), S_i > V .
 	 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
 	 * we only need to look into NEH.
 	 */
 	if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
 	    if (pipe->scheduler_heap.elements == 0)
 		printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
 	    heap_insert(&(pipe->not_eligible_heap), q->S, q);
 	} else {
 	    heap_insert(&(pipe->scheduler_heap), q->F, q);
 	    if (pipe->numbytes >= 0) { /* pipe is idle */
 		if (pipe->scheduler_heap.elements != 1)
 		    printf("dummynet: OUCH! pipe should have been idle!\n");
 		DPRINTF(("dummynet: waking up pipe %d at %d\n",
 			pipe->pipe_nr, (int)(q->F >> MY_M)));
 		pipe->sched_time = curr_time ;
 		ready_event_wfq(pipe, &head, &tail);
 	    }
 	}
     }
 done:
     DUMMYNET_UNLOCK();
     /* Packets that became ready are sent after dropping the lock. */
     if (head != NULL)
 	dummynet_send(head);
     return 0;
 
 dropit:
     /* q is still NULL if we failed before a flow queue was found. */
     if (q)
 	q->drops++ ;
     DUMMYNET_UNLOCK();
     m_freem(m);
     return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
 }
 
 /*
  * DN_FREE_PKT(_m): dispose of a packet held by dummynet. As written it
  * only frees the mbuf with m_freem().
  *
  * Historical note kept from the original comment: "Below, the rt_unref
  * is only needed when (pkt->dn_dir == DN_TO_IP_OUT). Doing this would
  * probably save us the initial bzero of dn_pkt".
  * NOTE(review): the macro no longer contains any rt_unref, so that
  * remark looks stale -- confirm and drop it if so.
  */
 #define	DN_FREE_PKT(_m) do {				\
 	m_freem(_m);					\
 } while (0)
 
 /*
  * Drop every packet and every flow queue held by flow set fs and reset
  * its queue count.
  * If all is non-zero, the RED lookup table and the hash array are
  * released too, and so is the descriptor itself unless it is the flow
  * set embedded in its own pipe.
  * For the one in dn_pipe MUST also cleanup ready_heap...
  * The dummynet lock must be held.
  */
 static void
 purge_flow_set(struct dn_flow_set *fs, int all)
 {
 	struct dn_flow_queue *cur, *following;
 	struct mbuf *pkt, *pkt_next;
 	int slot;
 
 	DUMMYNET_LOCK_ASSERT();
 
 	/* Slot rq_size (the last one visited) is part of the array too. */
 	for (slot = 0; slot <= fs->rq_size; slot++) {
 		cur = fs->rq[slot];
 		while (cur != NULL) {
 			/* Free the packets still waiting in this queue. */
 			for (pkt = cur->head; pkt != NULL; pkt = pkt_next) {
 				pkt_next = pkt->m_nextpkt;
 				DN_FREE_PKT(pkt);
 			}
 			following = cur->next;
 			free(cur, M_DUMMYNET);
 			cur = following;
 		}
 		fs->rq[slot] = NULL;
 	}
 	fs->rq_elements = 0;
 
 	if (!all)
 		return;
 
 	/* RED - free lookup table. */
 	if (fs->w_q_lookup != NULL)
 		free(fs->w_q_lookup, M_DUMMYNET);
 	if (fs->rq != NULL)
 		free(fs->rq, M_DUMMYNET);
 	/* If this fs is not part of a pipe, free it. */
 	if (fs->pipe == NULL || fs != &(fs->pipe->fs))
 		free(fs, M_DUMMYNET);
 }
 
 /*
  * Release everything a pipe (not a flow_set) holds before the pipe is
  * deleted: its embedded flow set, the packets waiting in its delay line
  * and its three heaps. The pipe structure itself is not freed here.
  */
 static void
 purge_pipe(struct dn_pipe *pipe)
 {
     struct mbuf *cur, *following;
 
     purge_flow_set(&(pipe->fs), 1);
 
     /* Drain the delay line. */
     for (cur = pipe->head; cur != NULL; cur = following) {
 	following = cur->m_nextpkt;
 	DN_FREE_PKT(cur);
     }
 
     heap_free(&(pipe->scheduler_heap));
     heap_free(&(pipe->not_eligible_heap));
     heap_free(&(pipe->idle_heap));
 }
 
 /*
  * Delete all pipes and heaps returning memory. Must also
  * remove references from all ipfw rules to all pipes.
  */
 static void
 dummynet_flush(void)
 {
 	struct dn_pipe *pipe, *pipe1;
 	struct dn_flow_set *fs, *fs1;
 	int i;
 
 	DUMMYNET_LOCK();
 	/* Free heaps so we don't have unwanted events. */
 	heap_free(&ready_heap);
 	heap_free(&wfq_ready_heap);
 	heap_free(&extract_heap);
 
 	/*
 	 * Now purge all queued pkts and delete all pipes.
 	 *
 	 * XXXGL: can we merge the for(;;) cycles into one or not?
 	 */
 	for (i = 0; i < HASHSIZE; i++)
 		SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
 			SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
 			purge_flow_set(fs, 1);
 		}
 	for (i = 0; i < HASHSIZE; i++)
 		SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
 			SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
 			purge_pipe(pipe);
 			free(pipe, M_DUMMYNET);
 		}
 	DUMMYNET_UNLOCK();
 }
 
 extern struct ip_fw *ip_fw_default_rule ;
 static void
 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
 {
     int i ;
     struct dn_flow_queue *q ;
     struct mbuf *m ;
 
     for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
 	for (q = fs->rq[i] ; q ; q = q->next )
 	    for (m = q->head ; m ; m = m->m_nextpkt ) {
 		struct dn_pkt_tag *pkt = dn_tag_get(m) ;
 		if (pkt->rule == r)
 		    pkt->rule = ip_fw_default_rule ;
 	    }
 }
 /*
  * Called when firewall rule r is deleted: scan every packet dummynet is
  * holding (flow sets, the flow set of each pipe and each pipe's delay
  * line) and repoint packets that matched r to ip_fw_default_rule.
  * Takes and releases the dummynet lock itself.
  */
 void
 dn_rule_delete(void *r)
 {
     struct dn_flow_set *set;
     struct dn_pkt_tag *tag;
     struct dn_pipe *pp;
     struct mbuf *pkt;
     int bucket;
 
     DUMMYNET_LOCK();
 
     /*
      * If the rule references a queue (dn_flow_set), then scan
      * the flow set, otherwise scan pipes. Should do either, but doing
      * both does not harm.
      */
     for (bucket = 0; bucket < HASHSIZE; bucket++) {
 	SLIST_FOREACH(set, &flowsethash[bucket], next) {
 		dn_rule_delete_fs(set, r);
 	}
     }
 
     for (bucket = 0; bucket < HASHSIZE; bucket++) {
 	SLIST_FOREACH(pp, &pipehash[bucket], next) {
 		set = &(pp->fs);
 		dn_rule_delete_fs(set, r);
 		/* Packets already in the pipe's delay line. */
 		for (pkt = pp->head; pkt != NULL; pkt = pkt->m_nextpkt) {
 			tag = dn_tag_get(pkt);
 			if (tag->rule != r)
 				continue;
 			tag->rule = ip_fw_default_rule;
 		}
 	}
     }
 
     DUMMYNET_UNLOCK();
 }
 
 /*
  * setup RED parameters
  *
  * p	flow set carrying the requested RED parameters.
  * x	flow set being configured. x->flags_fs must already be set, since
  *	it selects Gentle-RED.
  *
  * Returns 0 on success, EINVAL if the thresholds would cause a division
  * by zero or if red_lookup_depth is 0, ENOSPC if the lookup table cannot
  * be allocated.
  *
  * x is owned by the caller and is never freed here: the caller in this
  * file passes a flow set embedded in a pipe (which is not a separate
  * allocation), ignores the return value and keeps using x afterwards.
  * When the lookup table cannot be (re)built, x->w_q_lookup is left NULL
  * with x->lookup_depth == 0, so the table is never indexed.
  */
 static int
 config_red(struct dn_flow_set *p, struct dn_flow_set *x)
 {
 	int i;
 
 	/* Reject thresholds that would make the divisions below trap. */
 	if (p->max_th == p->min_th)
 		return (EINVAL);
 	if ((x->flags_fs & DN_IS_GENTLE_RED) && p->max_th == 0)
 		return (EINVAL);
 
 	x->w_q = p->w_q;
 	x->min_th = SCALE(p->min_th);
 	x->max_th = SCALE(p->max_th);
 	x->max_p = p->max_p;
 
 	/* Coefficients of the linear drop function p_b = c_1 * avg - c_2. */
 	x->c_1 = p->max_p / (p->max_th - p->min_th);
 	x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
 
 	if (x->flags_fs & DN_IS_GENTLE_RED) {
 		/* Gentle-RED: p_b = c_3 * avg - c_4 above max_th. */
 		x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
 		x->c_4 = SCALE(1) - 2 * p->max_p;
 	}
 
 	/* If the lookup table already exist, free and create it again. */
 	if (x->w_q_lookup) {
 		free(x->w_q_lookup, M_DUMMYNET);
 		x->w_q_lookup = NULL;
 	}
 	if (red_lookup_depth == 0) {
 		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth "
 		    "must be > 0\n");
 		x->lookup_depth = 0;	/* no table: never index it */
 		return (EINVAL);
 	}
 	x->lookup_depth = red_lookup_depth;
 	x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
 	    M_DUMMYNET, M_NOWAIT);
 	if (x->w_q_lookup == NULL) {
 		printf("dummynet: sorry, cannot allocate red lookup table\n");
 		x->lookup_depth = 0;	/* no table: never index it */
 		return(ENOSPC);
 	}
 
 	/* Fill the lookup table with (1 - w_q)^x */
 	x->lookup_step = p->lookup_step;
 	x->lookup_weight = p->lookup_weight;
 	x->w_q_lookup[0] = SCALE(1) - x->w_q;
 
 	for (i = 1; i < x->lookup_depth; i++)
 		x->w_q_lookup[i] =
 		    SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
 
 	/* Fall back to defaults when the global packet sizes are not positive. */
 	if (red_avg_pkt_size < 1)
 		red_avg_pkt_size = 512;
 	x->avg_pkt_size = red_avg_pkt_size;
 	if (red_max_pkt_size < 1)
 		red_max_pkt_size = 1500;
 	x->max_pkt_size = red_max_pkt_size;
 	return (0);
 }
 
 /*
  * Size and allocate the flow-queue hash array of flow set x.
  * With a flow mask, the number of slots is taken from pfs->rq_size
  * (dn_hash_size when that is 0) and clamped to [4, DN_MAX_HASH_SIZE];
  * without one a single slot is enough. One extra slot is always
  * allocated on top of rq_size.
  * Returns 0 on success, ENOMEM if the array cannot be allocated.
  */
 static int
 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
 {
     int slots = 1;	/* one is enough for null mask */
 
     if (x->flags_fs & DN_HAVE_FLOW_MASK) {
 	/* allocate some slots */
 	slots = pfs->rq_size;
 	if (slots == 0)
 	    slots = dn_hash_size;
 	if (slots < 4)
 	    slots = 4;
 	else if (slots > DN_MAX_HASH_SIZE)
 	    slots = DN_MAX_HASH_SIZE;
     }
     x->rq_size = slots;
 
     x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
 	    M_DUMMYNET, M_NOWAIT | M_ZERO);
     if (x->rq == NULL) {
 	printf("dummynet: sorry, cannot allocate queue\n");
 	return (ENOMEM);
     }
     x->rq_elements = 0;
     return 0;
 }
 
 /*
  * Copy the user-supplied flow set parameters from src into x and
  * sanitize the queue size: in byte mode it is capped at 1 MB; in slot
  * mode a size of 0 or above 100 is replaced by 50.
  * If RED is enabled, its parameters are set up as well.
  */
 static void
 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
 {
 	x->flags_fs = src->flags_fs;
 	x->qsize = src->qsize;
 	x->plr = src->plr;
 	x->flow_mask = src->flow_mask;
 
 	if ((x->flags_fs & DN_QSIZE_IS_BYTES) != 0) {
 		if (x->qsize > 1024 * 1024)
 			x->qsize = 1024 * 1024;
 	} else if (x->qsize == 0 || x->qsize > 100) {
 		x->qsize = 50;
 	}
 
 	/* Configuring RED. */
 	if ((x->flags_fs & DN_IS_RED) != 0)
 		config_red(src, x);	/* XXX should check errors */
 }
 
 /*
  * Setup pipe or queue parameters.
  *
  * 'p' is the descriptor handed in by the caller.  Exactly one of
  * p->pipe_nr and p->fs.fs_nr must be non-zero: the former selects a
  * pipe, the latter a flow_set (queue).  The target is created when it
  * cannot be located, otherwise it is reconfigured in place.
  *
  * Note that p->delay is rewritten in place (ms -> ticks) before the
  * arguments are validated.
  *
  * Returns 0 on success, EINVAL for an inconsistent request, ENOMEM when
  * a new descriptor cannot be allocated, or the error from alloc_hash().
  * The dummynet lock is taken and released inside this function.
  */
 static int
 config_pipe(struct dn_pipe *p)
 {
 	struct dn_flow_set *pfs = &(p->fs);
 	struct dn_flow_queue *q;
 	int i, error;
 
 	/*
 	 * The config program passes parameters as follows:
 	 * bw = bits/second (0 means no limits),
 	 * delay = ms, must be translated into ticks.
 	 * qsize = slots/bytes
 	 */
 	p->delay = (p->delay * hz) / 1000;
 	/* We need either a pipe number or a flow_set number. */
 	if (p->pipe_nr == 0 && pfs->fs_nr == 0)
 		return (EINVAL);
 	if (p->pipe_nr != 0 && pfs->fs_nr != 0)
 		return (EINVAL);
 	if (p->pipe_nr != 0) {			/* this is a pipe */
 		struct dn_pipe *pipe;
 
 		DUMMYNET_LOCK();
 		pipe = locate_pipe(p->pipe_nr);	/* locate pipe */
 
 		if (pipe == NULL) {		/* new pipe */
 			pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
 			    M_NOWAIT | M_ZERO);
 			if (pipe == NULL) {
 				DUMMYNET_UNLOCK();
 				printf("dummynet: no memory for new pipe\n");
 				return (ENOMEM);
 			}
 			pipe->pipe_nr = p->pipe_nr;
 			/* The embedded flow_set points back at its pipe. */
 			pipe->fs.pipe = pipe;
 			/*
 			 * idle_heap is the only one from which
 			 * we extract from the middle.
 			 */
 			pipe->idle_heap.size = pipe->idle_heap.elements = 0;
 			pipe->idle_heap.offset =
 			    offsetof(struct dn_flow_queue, heap_pos);
 		} else
 			/* Flush accumulated credit for all queues. */
 			for (i = 0; i <= pipe->fs.rq_size; i++)
 				for (q = pipe->fs.rq[i]; q; q = q->next)
 					q->numbytes = 0;
 
 		/* Parameters applied to new and existing pipes alike. */
 		pipe->bandwidth = p->bandwidth;
 		pipe->numbytes = 0;		/* just in case... */
 		bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
 		pipe->ifp = NULL;		/* reset interface ptr */
 		pipe->delay = p->delay;
 		set_fs_parms(&(pipe->fs), pfs);
 
 		/*
 		 * A NULL queue table identifies a pipe that was just
 		 * allocated above (M_ZERO); only then is the table created
 		 * and the pipe linked into pipehash.
 		 */
 		if (pipe->fs.rq == NULL) {	/* a new pipe */
 			error = alloc_hash(&(pipe->fs), pfs);
 			if (error) {
 				DUMMYNET_UNLOCK();
 				/*
 				 * NOTE(review): if set_fs_parms() configured
 				 * RED, the lookup table allocated there does
 				 * not appear to be released on this path --
 				 * confirm against config_red().
 				 */
 				free(pipe, M_DUMMYNET);
 				return (error);
 			}
 			SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
 			    pipe, next);
 		}
 		DUMMYNET_UNLOCK();
 	} else {				/* config queue */
 		struct dn_flow_set *fs;
 
 		DUMMYNET_LOCK();
 		fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
 
 		if (fs == NULL) {		/* new */
 			if (pfs->parent_nr == 0) { /* need link to a pipe */
 				DUMMYNET_UNLOCK();
 				return (EINVAL);
 			}
 			fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
 			    M_NOWAIT | M_ZERO);
 			if (fs == NULL) {
 				DUMMYNET_UNLOCK();
 				printf(
 				    "dummynet: no memory for new flow_set\n");
 				return (ENOMEM);
 			}
 			fs->fs_nr = pfs->fs_nr;
 			fs->parent_nr = pfs->parent_nr;
 			/*
 			 * The weight is clamped to 1..100.  It is only set
 			 * here, i.e. when the flow_set is created.
 			 */
 			fs->weight = pfs->weight;
 			if (fs->weight == 0)
 				fs->weight = 1;
 			else if (fs->weight > 100)
 				fs->weight = 100;
 		} else {
 			/*
 			 * Change parent pipe not allowed;
 			 * must delete and recreate.
 			 */
 			if (pfs->parent_nr != 0 &&
 			    fs->parent_nr != pfs->parent_nr) {
 				DUMMYNET_UNLOCK();
 				return (EINVAL);
 			}
 		}
 
 		set_fs_parms(fs, pfs);
 
 		/* Same new-object test as in the pipe branch above. */
 		if (fs->rq == NULL) {		/* a new flow_set */
 			error = alloc_hash(fs, pfs);
 			if (error) {
 				DUMMYNET_UNLOCK();
 				free(fs, M_DUMMYNET);
 				return (error);
 			}
 			SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
 			    fs, next);
 		}
 		DUMMYNET_UNLOCK();
 	}
 	return (0);
 }
 
 /*
  * Helper function to remove from a heap queues which are linked to
  * a flow_set about to be deleted.
  */
 static void
 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
 {
     int i = 0, found = 0 ;
     for (; i < h->elements ;)
 	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
 	    h->elements-- ;
 	    h->p[i] = h->p[h->elements] ;
 	    found++ ;
 	} else
 	    i++ ;
     if (found)
 	heapify(h);
 }
 
 /*
  * Remove pipe 'p' from heap 'h'.  Only the first matching entry is
  * removed (a pipe can be there at most once); the heap is then
  * re-heapified.
  */
 static void
 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
 {
 	int idx;
 
 	for (idx = 0; idx < h->elements; idx++) {
 		if (h->p[idx].object != p)
 			continue;
 		/* found it: replace with the last entry and rebuild */
 		h->elements--;
 		h->p[idx] = h->p[h->elements];
 		heapify(h);
 		return;
 	}
 }
 
 /*
  * Drain all queues.  Called in case of severe mbuf shortage.
  * The caller must hold the dummynet lock.
  */
 void
 dummynet_drain(void)
 {
 	struct dn_flow_set *fs;
 	struct dn_pipe *pipe;
 	struct mbuf *m, *mnext;
 	int i;
 
 	DUMMYNET_LOCK_ASSERT();
 
 	heap_free(&ready_heap);
 	heap_free(&wfq_ready_heap);
 	heap_free(&extract_heap);
 
 	/* Purge every flow_set in the flow_set hash table. */
 	for (i = 0; i < HASHSIZE; i++) {
 		SLIST_FOREACH(fs, &flowsethash[i], next) {
 			purge_flow_set(fs, 0);
 		}
 	}
 
 	/*
 	 * Then, for every pipe, purge its embedded flow_set and free
 	 * the packets chained from pipe->head.
 	 */
 	for (i = 0; i < HASHSIZE; i++) {
 		SLIST_FOREACH(pipe, &pipehash[i], next) {
 			purge_flow_set(&(pipe->fs), 0);
 
 			for (m = pipe->head; m != NULL; m = mnext) {
 				mnext = m->m_nextpkt;
 				DN_FREE_PKT(m);
 			}
 			pipe->tail = NULL;
 			pipe->head = NULL;
 		}
 	}
 }
 
 /*
  * Fully delete a pipe or a queue, cleaning up associated info.
  *
  * Exactly one of p->pipe_nr and p->fs.fs_nr must be non-zero; it selects
  * whether a pipe or a flow_set is deleted.
  *
  * Returns 0 on success, EINVAL when both or neither number is given and
  * ENOENT when the object cannot be located.  The dummynet lock is taken
  * and released inside this function.
  */
 static int
 delete_pipe(struct dn_pipe *p)
 {
 
     if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
 	return EINVAL ;
     if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
 	return EINVAL ;
     if (p->pipe_nr != 0) { /* this is an old-style pipe */
 	struct dn_pipe *pipe;
 	struct dn_flow_set *fs;
 	int i;
 
 	DUMMYNET_LOCK();
 	pipe = locate_pipe(p->pipe_nr);	/* locate pipe */
 
 	if (pipe == NULL) {
 	    DUMMYNET_UNLOCK();
 	    return (ENOENT);	/* not found */
 	}
 
 	/* Unlink from list of pipes. */
 	SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
 
 	/* Remove all references to this pipe from flow_sets. */
 	for (i = 0; i < HASHSIZE; i++)
 	    SLIST_FOREACH(fs, &flowsethash[i], next)
 		if (fs->pipe == pipe) {
 			printf("dummynet: ++ ref to pipe %d from fs %d\n",
 			    p->pipe_nr, fs->fs_nr);
 			fs->pipe = NULL ;
 			purge_flow_set(fs, 0);
 		}
 	/* Queues of the pipe's own flow_set may sit in ready_heap. */
 	fs_remove_from_heap(&ready_heap, &(pipe->fs));
 	purge_pipe(pipe); /* remove all data associated to this pipe */
 	/* remove reference to here from extract_heap and wfq_ready_heap */
 	pipe_remove_from_heap(&extract_heap, pipe);
 	pipe_remove_from_heap(&wfq_ready_heap, pipe);
 	DUMMYNET_UNLOCK();
 
 	/* The descriptor itself is released after dropping the lock. */
 	free(pipe, M_DUMMYNET);
     } else { /* this is a WF2Q queue (dn_flow_set) */
 	struct dn_flow_set *fs;
 
 	DUMMYNET_LOCK();
 	fs = locate_flowset(p->fs.fs_nr); /* locate set */
 
 	if (fs == NULL) {
 	    DUMMYNET_UNLOCK();
 	    return (ENOENT); /* not found */
 	}
 
 	/* Unlink from list of flowsets. */
 	SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
 
 	if (fs->pipe != NULL) {
 	    /* Update total weight on parent pipe and cleanup parent heaps. */
 	    fs->pipe->sum -= fs->weight * fs->backlogged ;
 	    fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
 	    fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
 #if 1	/* XXX should i remove from idle_heap as well ? */
 	    fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
 #endif
 	}
 	/*
 	 * Unlike the calls above this passes 1 as second argument;
 	 * presumably that also releases the flow_set itself -- confirm
 	 * against purge_flow_set().
 	 */
 	purge_flow_set(fs, 1);
 	DUMMYNET_UNLOCK();
     }
     return 0 ;
 }
 
 /*
  * Helper used by DUMMYNET_GET: copy every queue descriptor of flow set
  * 'set' to the buffer at 'bp', clearing the pointers held in each copy.
  * Inconsistencies found on the way (wrong hash slot, wrong owner, wrong
  * element count) are only reported via printf.
  *
  * Returns the address just past the last descriptor written.  The
  * caller must hold the dummynet lock.
  */
 static char *
 dn_copy_set(struct dn_flow_set *set, char *bp)
 {
 	struct dn_flow_queue *dst = (struct dn_flow_queue *)bp;
 	struct dn_flow_queue *src;
 	int slot;
 	int ncopied = 0;
 
 	DUMMYNET_LOCK_ASSERT();
 
 	for (slot = 0; slot <= set->rq_size; slot++) {
 		for (src = set->rq[slot]; src != NULL; src = src->next) {
 			if (src->hash_slot != slot)
 				printf("dummynet: ++ at %d: wrong slot (have %d, "
 				    "should be %d)\n", ncopied, src->hash_slot, slot);
 			if (src->fs != set)
 				printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
 				    slot, src->fs, set);
 			ncopied++;
 			bcopy(src, dst, sizeof(*src));
 			/* cleanup pointers in the copy */
 			dst->next = NULL;
 			dst->head = dst->tail = NULL;
 			dst->fs = NULL;
 			dst++;
 		}
 	}
 	if (ncopied != set->rq_elements)
 		printf("dummynet: ++ wrong count, have %d should be %d\n",
 		    ncopied, set->rq_elements);
 	return ((char *)dst);
 }
 
 static size_t
 dn_calc_size(void)
 {
     struct dn_flow_set *fs;
     struct dn_pipe *pipe;
     size_t size = 0;
     int i;
 
     DUMMYNET_LOCK_ASSERT();
     /*
      * Compute size of data structures: list of pipes and flow_sets.
      */
     for (i = 0; i < HASHSIZE; i++) {
 	SLIST_FOREACH(pipe, &pipehash[i], next)
 		size += sizeof(*pipe) +
 		    pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
 	SLIST_FOREACH(fs, &flowsethash[i], next)
 		size += sizeof (*fs) +
 		    fs->rq_elements * sizeof(struct dn_flow_queue);
     }
     return size;
 }
 
 /*
  * Export the whole dummynet configuration through 'sopt': every pipe
  * followed by its queue descriptors, then every flow_set followed by
  * its queue descriptors.
  *
  * Returns ENOBUFS when no buffer of the right size could be obtained
  * within 10 attempts, otherwise the result of sooptcopyout().
  */
 static int
 dummynet_get(struct sockopt *sopt)
 {
     char *buf, *bp ; /* bp is the "copy-pointer" */
     size_t size ;
     struct dn_flow_set *fs;
     struct dn_pipe *pipe;
     int error=0, i ;
 
     /* XXX lock held too long */
     DUMMYNET_LOCK();
     /*
      * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
      *      cannot use this flag while holding a mutex.
      */
     /*
      * The size is sampled under the lock, the lock is dropped for the
      * allocation, and the size is checked again once the lock is held.
      * A mismatch discards the buffer and retries.
      */
     for (i = 0; i < 10; i++) {
 	size = dn_calc_size();
 	DUMMYNET_UNLOCK();
 	buf = malloc(size, M_TEMP, M_WAITOK);
 	DUMMYNET_LOCK();
 	if (size == dn_calc_size())
 		break;
 	free(buf, M_TEMP);
 	buf = NULL;
     }
     /* buf is NULL only if all 10 attempts saw the size change. */
     if (buf == NULL) {
 	DUMMYNET_UNLOCK();
 	return ENOBUFS ;
     }
     bp = buf;
     for (i = 0; i < HASHSIZE; i++)
 	SLIST_FOREACH(pipe, &pipehash[i], next) {
 		struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
 
 		/*
 		 * Copy pipe descriptor into *bp, convert delay back to ms,
 		 * then copy the flow_set descriptor(s) one at a time.
 		 * After each flow_set, copy the queue descriptor it owns.
 		 */
 		bcopy(pipe, bp, sizeof(*pipe));
 		pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
 		/*
 		 * XXX the following is a hack based on ->next being the
 		 * first field in dn_pipe and dn_flow_set. The correct
 		 * solution would be to move the dn_flow_set to the beginning
 		 * of struct dn_pipe.
 		 */
 		pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
 		/* Clean pointers. */
 		/*
 		 * NOTE(review): only the pointers listed here are cleared;
 		 * other pointer members copied by the bcopy() above (e.g.
 		 * ifp) are left as they are -- confirm whether that is
 		 * intended.
 		 */
 		pipe_bp->head = pipe_bp->tail = NULL;
 		pipe_bp->fs.next.sle_next = NULL;
 		pipe_bp->fs.pipe = NULL;
 		pipe_bp->fs.rq = NULL;
 
 		bp += sizeof(*pipe) ;
 		bp = dn_copy_set(&(pipe->fs), bp);
 	}
 
     for (i = 0; i < HASHSIZE; i++)
 	SLIST_FOREACH(fs, &flowsethash[i], next) {
 		struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
 
 		bcopy(fs, bp, sizeof(*fs));
 		/* XXX same hack as above */
 		fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
 		fs_bp->pipe = NULL;
 		fs_bp->rq = NULL;
 		bp += sizeof(*fs);
 		bp = dn_copy_set(fs, bp);
 	}
 
     DUMMYNET_UNLOCK();
 
     /* The copy to the caller is done without the lock held. */
     error = sooptcopyout(sopt, buf, size);
     free(buf, M_TEMP);
     return error ;
 }
 
 /*
  * Handler for the various dummynet socket options (get, flush, config, del)
  */
 static int
 ip_dn_ctl(struct sockopt *sopt)
 {
     int error = 0 ;
     struct dn_pipe *p, tmp_pipe;
 
     /* Disallow sets in really-really secure mode. */
     if (sopt->sopt_dir == SOPT_SET) {
 #if __FreeBSD_version >= 500034
 	error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
 	if (error)
 	    return (error);
 #else
 	if (securelevel >= 3)
 	    return (EPERM);
 #endif
     }
 
     switch (sopt->sopt_name) {
     default :
 	printf("dummynet: -- unknown option %d", sopt->sopt_name);
 	return EINVAL ;
 
     case IP_DUMMYNET_GET :
 	error = dummynet_get(sopt);
 	break ;
 
     case IP_DUMMYNET_FLUSH :
 	dummynet_flush() ;
 	break ;
 
     case IP_DUMMYNET_CONFIGURE :
 	p = &tmp_pipe ;
 	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
 	if (error)
 	    break ;
 	error = config_pipe(p);
 	break ;
 
     case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
 	p = &tmp_pipe ;
 	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
 	if (error)
 	    break ;
 
 	error = delete_pipe(p);
 	break ;
     }
     return error ;
 }
 
 /*
  * Initialise dummynet: the lock, the pipe and flow_set hash tables, the
  * three global heaps, the hooks ip_dn_ctl_ptr, ip_dn_io_ptr and
  * ip_dn_ruledel_ptr, the "dummynet" taskqueue and the dn_timeout
  * callout.
  */
 static void
 ip_dn_init(void)
 {
 	int i;
 
 	if (bootverbose)
 		printf("DUMMYNET with IPv6 initialized (040826)\n");
 
 	DUMMYNET_LOCK_INIT();
 
 	for (i = 0; i < HASHSIZE; i++) {
 		SLIST_INIT(&pipehash[i]);
 		SLIST_INIT(&flowsethash[i]);
 	}
 	/* All three global heaps start empty, with offset 0. */
 	ready_heap.size = ready_heap.elements = 0;
 	ready_heap.offset = 0;
 
 	wfq_ready_heap.size = wfq_ready_heap.elements = 0;
 	wfq_ready_heap.offset = 0;
 
 	extract_heap.size = extract_heap.elements = 0;
 	extract_heap.offset = 0;
 
 	/* Install the entry points used by the rest of the stack. */
 	ip_dn_ctl_ptr = ip_dn_ctl;
 	ip_dn_io_ptr = dummynet_io;
 	ip_dn_ruledel_ptr = dn_rule_delete;
 
 	/*
 	 * NOTE(review): dn_tq is used right after
 	 * taskqueue_create_fast(..., M_NOWAIT, ...) without checking the
 	 * result -- confirm that this cannot fail here.
 	 */
 	TASK_INIT(&dn_task, 0, dummynet_task, NULL);
 	dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
 	    taskqueue_thread_enqueue, &dn_tq);
 	taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
 
 	/* Arm the periodic callout; the first run is 1 tick from now. */
-	callout_init(&dn_timeout, NET_CALLOUT_MPSAFE);
+	callout_init(&dn_timeout, CALLOUT_MPSAFE);
 	callout_reset(&dn_timeout, 1, dummynet, NULL);
 
 	/* Initialize curr_time adjustment mechanics. */
 	getmicrouptime(&prev_t);
 }
 
 #ifdef KLD_MODULE
 /*
  * Undo ip_dn_init(); only built when dummynet is a loadable module.
  * The hooks are cleared first, then the callout is stopped, the
  * taskqueue drained and freed, the configuration flushed and finally
  * the lock destroyed.
  */
 static void
 ip_dn_destroy(void)
 {
 	/* Detach the entry points installed by ip_dn_init(). */
 	ip_dn_ctl_ptr = NULL;
 	ip_dn_io_ptr = NULL;
 	ip_dn_ruledel_ptr = NULL;
 
 	/* The callout is stopped with the dummynet lock held. */
 	DUMMYNET_LOCK();
 	callout_stop(&dn_timeout);
 	DUMMYNET_UNLOCK();
 	taskqueue_drain(dn_tq, &dn_task);
 	taskqueue_free(dn_tq);
 
 	dummynet_flush();
 
 	DUMMYNET_LOCK_DESTROY();
 }
 #endif /* KLD_MODULE */
 
 /*
  * Module event handler.
  *
  * MOD_LOAD initialises dummynet unless it is already loaded (EEXIST).
  * MOD_UNLOAD tears it down when built as a KLD and is refused with
  * EINVAL otherwise.  Any other event yields EOPNOTSUPP.
  */
 static int
 dummynet_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD) {
 		if (DUMMYNET_LOADED) {
 			printf("DUMMYNET already loaded\n");
 			return (EEXIST);
 		}
 		ip_dn_init();
 		return (0);
 	}
 
 	if (type == MOD_UNLOAD) {
 #if defined(KLD_MODULE)
 		ip_dn_destroy();
 		return (0);
 #else
 		printf("dummynet statically compiled, cannot unload\n");
 		return (EINVAL);
 #endif
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module glue: the module is called "dummynet" and its events are
  * handled by dummynet_modevent().
  */
 static moduledata_t dummynet_mod = {
 	"dummynet",
 	dummynet_modevent,
 	NULL
 };
 DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
 /* dummynet declares a dependency on the ipfw module. */
 MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
 MODULE_VERSION(dummynet, 1);
Index: head/sys/netinet/ip_fw2.c
===================================================================
--- head/sys/netinet/ip_fw2.c	(revision 171636)
+++ head/sys/netinet/ip_fw2.c	(revision 171637)
@@ -1,5056 +1,5056 @@
 /*-
  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Debug wrappers: DEB(x) expands to nothing, so its argument is
  * compiled out; DDB(x) expands to its argument unchanged.
  */
 #define        DEB(x)
 #define        DDB(x) x
 
 /*
  * Implement IP packet firewall (new version)
  */
 
 #if !defined(KLD_MODULE)
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_ipdn.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #endif
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/jail.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <net/if.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/pf_mtag.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_divert.h>
 #include <netinet/ip_dummynet.h>
 #include <netinet/ip_carp.h>
 #include <netinet/pim.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/sctp.h>
 #ifdef IPFIREWALL_NAT
 #include <netinet/libalias/alias.h>
 #include <netinet/libalias/alias_local.h>
 #endif
 #include <netgraph/ng_ipfw.h>
 
 #include <altq/if_altq.h>
 
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #ifdef INET6
 #include <netinet6/scope6_var.h>
 #endif
 
 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * set_disable contains one bit per set value (0..31).
  * If the bit is set, all rules with the corresponding set
  * are disabled. Set RESVD_SET(31) is reserved for the default rule
  * and rules that are not deleted by the flush command,
  * and CANNOT be disabled.
  * Rules in set RESVD_SET can only be deleted explicitly.
  */
 static u_int32_t set_disable;
 
 static int fw_verbose;
 static int verbose_limit;
 
 static struct callout ipfw_timeout;
 static uma_zone_t ipfw_dyn_rule_zone;
 #define	IPFW_DEFAULT_RULE	65535
 
 /*
  * Data structure to cache our ucred related
  * information. This structure only gets used if
  * the user specified UID/GID based constraints in
  * a firewall rule.
  */
 struct ip_fw_ugid {
 	/* group ids, at most NGROUPS of them */
 	gid_t		fw_groups[NGROUPS];
 	/* presumably the number of valid entries in fw_groups -- confirm */
 	int		fw_ngroups;
 	/* user id */
 	uid_t		fw_uid;
 	/* presumably the jail (prison) id -- confirm against the users */
 	int		fw_prid;
 };
 
 /* Number of slots in the tables[] array of a chain. */
 #define	IPFW_TABLES_MAX		128
 /*
  * A rule chain together with its nat entries, lookup tables and the
  * read/write lock used by the IPFW_*LOCK macros below.
  */
 struct ip_fw_chain {
 	struct ip_fw	*rules;		/* list of rules */
 	struct ip_fw	*reap;		/* list of rules to reap */
 	LIST_HEAD(, cfg_nat) nat;       /* list of nat entries */
 	/* one radix tree head per table */
 	struct radix_node_head *tables[IPFW_TABLES_MAX];
 	/* initialised by IPFW_LOCK_INIT() as "IPFW static rules" */
 	struct rwlock	rwmtx;
 };
 /* Lock lifetime: create and destroy the chain's rwlock. */
 #define	IPFW_LOCK_INIT(_chain) \
 	rw_init(&(_chain)->rwmtx, "IPFW static rules")
 #define	IPFW_LOCK_DESTROY(_chain)	rw_destroy(&(_chain)->rwmtx)
 /* Assert that the chain is write-locked, plus NET_ASSERT_GIANT(). */
 #define	IPFW_WLOCK_ASSERT(_chain)	do {				\
 	rw_assert(&(_chain)->rwmtx, RA_WLOCKED);					\
 	NET_ASSERT_GIANT();						\
 } while (0)
 
 /* Acquire/release the chain lock for reading (R) or writing (W). */
 #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
 #define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
 #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
 #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
 
 /*
  * list of rules for layer 3
  */
 static struct ip_fw_chain layer3_chain;
 
 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
 MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
 
 /*
  * One entry of an ipfw lookup table.
  * NOTE(review): rn[] comes first, presumably so that the entry can be
  * linked into a radix tree directly -- confirm against the table code.
  */
 struct table_entry {
 	struct radix_node	rn[2];
 	struct sockaddr_in	addr, mask;	/* address and its mask */
 	u_int32_t		value;		/* value stored with the entry */
 };
 
 static int fw_debug = 1;
 static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
 
 extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
 
 #ifdef SYSCTL_NODE
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0,
     ipfw_chg_hook, "I", "Enable ipfw");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
     &autoinc_step, 0, "Rule number autincrement step");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
     CTLFLAG_RW | CTLFLAG_SECURE3,
     &fw_one_pass, 0,
     "Only do a single pass through ipfw when using dummynet(4)");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
     &fw_debug, 0, "Enable printing of debug ip_fw statements");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
     CTLFLAG_RW | CTLFLAG_SECURE3,
     &fw_verbose, 0, "Log matches to ipfw rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
 
 /*
  * Description of dynamic rules.
  *
  * Dynamic rules are stored in lists accessed through a hash table
  * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
  * be modified through the sysctl variable dyn_buckets which is
  * updated when the table becomes empty.
  *
  * XXX currently there is only one list, ipfw_dyn.
  *
  * When a packet is received, its address fields are first masked
  * with the mask defined for the rule, then hashed, then matched
  * against the entries in the corresponding list.
  * Dynamic rules can be used for different purposes:
  *  + stateful rules;
  *  + enforcing limits on the number of sessions;
  *  + in-kernel NAT (not implemented yet)
  *
  * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
  * measured in seconds and depending on the flags.
  *
  * The total number of dynamic rules is stored in dyn_count.
  * The max number of dynamic rules is dyn_max. When we reach
  * the maximum number of rules we do not create anymore. This is
  * done to avoid consuming too much memory, but also too much
  * time when searching on each packet (ideally, we should try instead
  * to put a limit on the length of the list on each bucket...).
  *
  * Each dynamic rule holds a pointer to the parent ipfw rule so
  * we know what action to perform. Dynamic rules are removed when
  * the parent rule is deleted. XXX we should make them survive.
  *
  * There are some limitations with dynamic rules -- we do not
  * obey the 'randomized match', and we do not do multiple
  * passes through the firewall. XXX check the latter!!!
  */
 /* Hash table of dynamic rules; NULL until it is allocated. */
 static ipfw_dyn_rule **ipfw_dyn_v = NULL;
 /* Requested (dyn_buckets) and current (curr_dyn_buckets) table size. */
 static u_int32_t dyn_buckets = 256; /* must be power of 2 */
 static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */
 
 static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
 /* Create, destroy, take, release and assert ownership of that mutex. */
 #define	IPFW_DYN_LOCK_INIT() \
 	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
 #define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
 #define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
 #define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
 #define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
 
 /*
  * Timeouts for various events in handling dynamic rules.
  */
 static u_int32_t dyn_ack_lifetime = 300;
 static u_int32_t dyn_syn_lifetime = 20;
 static u_int32_t dyn_fin_lifetime = 1;
 static u_int32_t dyn_rst_lifetime = 1;
 static u_int32_t dyn_udp_lifetime = 10;
 static u_int32_t dyn_short_lifetime = 5;
 
 /*
  * Keepalives are sent if dyn_keepalive is set. They are sent every
  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
  * seconds of lifetime of a rule.
  * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
  * than dyn_keepalive_period.
  */
 
 static u_int32_t dyn_keepalive_interval = 20;
 static u_int32_t dyn_keepalive_period = 5;
 static u_int32_t dyn_keepalive = 1;	/* do send keepalives */
 
 static u_int32_t static_count;	/* # of static rules */
 static u_int32_t static_len;	/* size in bytes of static rules */
 static u_int32_t dyn_count;		/* # of dynamic rules */
 static u_int32_t dyn_max = 4096;	/* max # of dynamic rules */
 
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
     &dyn_buckets, 0, "Number of dyn. buckets");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
     &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
     &dyn_count, 0, "Number of dyn. rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
     &dyn_max, 0, "Max number of dyn. rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
     &static_count, 0, "Number of static rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
 
 #ifdef INET6
 /*
  * IPv6 specific variables
  */
 SYSCTL_DECL(_net_inet6_ip6);
 
 static struct sysctl_ctx_list ip6_fw_sysctl_ctx;
 static struct sysctl_oid *ip6_fw_sysctl_tree;
 #endif /* INET6 */
 #endif /* SYSCTL_NODE */
 
 #ifdef IPFIREWALL_NAT
 MODULE_DEPEND(ipfw, libalias, 1, 1, 1);
 #endif
 static int fw_deny_unknown_exthdrs = 1;
 
 
 /*
  * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
  * Other macros just cast void * into the appropriate type
  */
 #define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 #define	ICMP(p)		((struct icmphdr *)(p))
 #define	ICMP6(p)	((struct icmp6_hdr *)(p))
 
 /*
  * Return 1 if the ICMP type of 'icmp' is selected in the bitmask held
  * in cmd->d[0] (bit N set means type N matches), 0 otherwise.
  *
  * The mask is 32 bits wide, so besides the ICMP_MAXTYPE limit the type
  * is also required to be below 32: shifting by the width of the type or
  * more is undefined behaviour in C.  The shift is done on an unsigned
  * value so that bit 31 is well defined too.
  */
 static __inline int
 icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
 {
 	int type = icmp->icmp_type;
 
 	if (type < 0 || type >= 32 || type > ICMP_MAXTYPE)
 		return (0);
 	return ((cmd->d[0] & ((u_int32_t)1 << type)) != 0);
 }
 
 #define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
     (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
 
 static int
 is_icmp_query(struct icmphdr *icmp)
 {
 	int type = icmp->icmp_type;
 
 	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
 }
 #undef TT
 
 /*
  * The following checks use two arrays of 8 or 16 bits to store the
  * bits that we want set or clear, respectively. They are in the
  * low and high half of cmd->arg1 or cmd->d[0].
  *
  * We scan options and store the bits we find set. We succeed if
  *
  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
  *
  * The code is sometimes optimized not to store additional variables.
  */
 
 /*
  * Check the 8 flag bits in 'bits' against cmd->arg1, whose low byte
  * lists the bits that must be set and whose high byte lists the bits
  * that must be clear.  Returns 1 when both conditions hold, else 0.
  */
 static int
 flags_match(ipfw_insn *cmd, u_int8_t bits)
 {
 	u_char want_set = cmd->arg1 & 0xff;
 	u_char want_clear = (cmd->arg1 >> 8) & 0xff;
 	u_int8_t absent = ~bits;	/* the bits that are NOT present */
 
 	/* some bits we want set were clear */
 	if ((want_set & absent) != 0)
 		return 0;
 	/* some bits we want clear were set */
 	if ((want_clear & absent) != want_clear)
 		return 0;
 	return 1;
 }
 
 /*
  * Walk the IP options of 'ip', note which of LSRR, SSRR, RR and TS are
  * present and test the result against 'cmd' with flags_match().
  * Returns 0 straight away on an option with an invalid or truncated
  * length.
  */
 static int
 ipopts_match(struct ip *ip, ipfw_insn *cmd)
 {
 	u_char *optp = (u_char *)(ip + 1);
 	int remaining = (ip->ip_hl << 2) - sizeof (struct ip);
 	int found = 0;
 	int len;
 
 	while (remaining > 0) {
 		int kind = optp[IPOPT_OPTVAL];
 
 		if (kind == IPOPT_EOL)
 			break;
 		if (kind == IPOPT_NOP) {
 			len = 1;
 		} else {
 			len = optp[IPOPT_OLEN];
 			if (len <= 0 || len > remaining)
 				return 0; /* invalid or truncated */
 		}
 
 		if (kind == IPOPT_LSRR)
 			found |= IP_FW_IPOPT_LSRR;
 		else if (kind == IPOPT_SSRR)
 			found |= IP_FW_IPOPT_SSRR;
 		else if (kind == IPOPT_RR)
 			found |= IP_FW_IPOPT_RR;
 		else if (kind == IPOPT_TS)
 			found |= IP_FW_IPOPT_TS;
 
 		remaining -= len;
 		optp += len;
 	}
 	return (flags_match(cmd, found));
 }
 
 /*
  * Walk the TCP options of 'tcp', note which of MSS, window scale,
  * SACK (permitted or data) and timestamp are present and test the
  * result against 'cmd' with flags_match().
  *
  * Parsing stops at end-of-list or at the first malformed option; the
  * options seen up to that point are still evaluated.
  */
 static int
 tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
 {
 	int optlen, bits = 0;
 	u_char *cp = (u_char *)(tcp + 1);
 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[0];
 
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			/* The length byte itself must be inside the options. */
 			if (x < 2)
 				break;
 			optlen = cp[1];
 			/*
 			 * A length below 2 cannot cover the kind and length
 			 * bytes, and one above x runs past the header: treat
 			 * both as malformed and do not count the option.
 			 */
 			if (optlen < 2 || optlen > x)
 				break;
 		}
 
 		switch (opt) {
 
 		default:
 			break;
 
 		case TCPOPT_MAXSEG:
 			bits |= IP_FW_TCPOPT_MSS;
 			break;
 
 		case TCPOPT_WINDOW:
 			bits |= IP_FW_TCPOPT_WINDOW;
 			break;
 
 		case TCPOPT_SACK_PERMITTED:
 		case TCPOPT_SACK:
 			bits |= IP_FW_TCPOPT_SACK;
 			break;
 
 		case TCPOPT_TIMESTAMP:
 			bits |= IP_FW_TCPOPT_TS;
 			break;
 
 		}
 	}
 	return (flags_match(cmd, bits));
 }
 
 /*
  * Match interface 'ifp' against 'cmd'.
  *
  * When cmd->name is not empty the interface name is compared, as a
  * glob pattern if cmd->p.glob is set and literally otherwise.  With an
  * empty name the AF_INET addresses of the interface are compared with
  * cmd->p.ip.  Returns 1 on a match, 0 otherwise (also when ifp is NULL).
  */
 static int
 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
 {
 	struct ifaddr *ia;
 
 	/* no iface with this packet, match fails */
 	if (ifp == NULL)
 		return 0;
 
 	if (cmd->name[0] != '\0') {
 		/* match by name */
 		if (cmd->p.glob)
 			return (fnmatch(cmd->name, ifp->if_xname, 0) == 0);
 		return (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0);
 	}
 
 	/* match by IP address */
 	/* XXX lock? */
 	TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
 		struct sockaddr_in *sin;
 
 		if (ia->ifa_addr->sa_family != AF_INET)
 			continue;
 		sin = (struct sockaddr_in *)(ia->ifa_addr);
 		if (cmd->p.ip.s_addr == sin->sin_addr.s_addr)
 			return 1;	/* match */
 	}
 	return 0;	/* no match, fail ... */
 }
 
 /*
  * The verify_path function checks if a route to the src exists and
  * if it is reachable via ifp (when provided).
  * 
  * The 'verrevpath' option checks that the interface that an IP packet
  * arrives on is the same interface that traffic destined for the
  * packet's source address would be routed out of.  The 'versrcreach'
  * option just checks that the source address is reachable via any route
  * (except default) in the routing table.  These two are a measure to block
  * forged packets.  This is also commonly known as "anti-spoofing" or Unicast
  * Reverse Path Forwarding (Unicast RPF) in Cisco-ese. The name of the knobs
  * is purposely reminiscent of the Cisco IOS command,
  *
  *   ip verify unicast reverse-path
  *   ip verify unicast source reachable-via any
  *
  * which implements the same functionality. But note that syntax is
  * misleading. The check may be performed on all IP packets whether unicast,
  * multicast, or broadcast.
  */
 /*
  * Look up a route to 'src' and return 1 if it is acceptable, else 0.
  *
  * With 'ifp' given, the route's interface address must belong to ifp.
  * Without it, the route must be neither the default route nor a
  * reject/blackhole route.
  */
 static int
 verify_path(struct in_addr src, struct ifnet *ifp)
 {
 	struct route ro;
 	struct sockaddr_in *dst;
 	int valid;
 
 	bzero(&ro, sizeof(ro));
 
 	dst = (struct sockaddr_in *)&(ro.ro_dst);
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr = src;
 	rtalloc_ign(&ro, RTF_CLONING);
 
 	if (ro.ro_rt == NULL)
 		return 0;
 
 	if (ifp != NULL) {
 		/*
 		 * Compare against rt->rt_ifa->ifa_ifp rather than
 		 * rt->rt_ifp, in order to pass packets injected back by
 		 * if_simloop(): with useloopback == 1 a routing entry
 		 * (via lo0) for our own address may exist, so routing
 		 * asymmetry has to be handled.
 		 */
 		valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
 	} else if (satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
 		/* no ifp provided and only the default route matched */
 		valid = 0;
 	} else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 		/* blackhole/reject route */
 		valid = 0;
 	} else {
 		/* found valid route */
 		valid = 1;
 	}
 
 	RTFREE(ro.ro_rt);
 	return valid;
 }
 
 #ifdef INET6
 /*
  * ipv6 specific rules here...
  */
 /*
  * Return 1 if ICMPv6 type 'type' is selected in the bitmap cmd->d[]
  * (32 types per word), 0 otherwise.
  *
  * Negative values are rejected so that neither the array index nor the
  * shift count can go negative, and the shift is done on an unsigned
  * value so that bit 31 of a word is well defined.
  */
 static __inline int
 icmp6type_match (int type, ipfw_insn_u32 *cmd)
 {
 	if (type < 0 || type > ICMP6_MAXTYPE)
 		return (0);
 	return ((cmd->d[type / 32] & ((u_int32_t)1 << (type % 32))) != 0);
 }
 
 /*
  * Return 1 if 'curr_flow' equals one of the values stored in cmd->d[],
  * 0 otherwise.
  *
  * NOTE(review): the bound is inclusive, so entries d[0] .. d[arg1]
  * (arg1 + 1 of them) are compared -- confirm that this agrees with how
  * arg1 is filled in.
  */
 static int
 flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
 {
 	int idx = 0;
 
 	while (idx <= cmd->o.arg1) {
 		if (curr_flow == cmd->d[idx])
 			return 1;
 		++idx;
 	}
 	return 0;
 }
 
 /* support for IP6_*_ME opcodes */
 static int
 search_ip6_addr_net (struct in6_addr * ip6_addr)
 {
 	struct ifnet *mdc;
 	struct ifaddr *mdc2;
 	struct in6_ifaddr *fdm;
 	struct in6_addr copia;
 
 	TAILQ_FOREACH(mdc, &ifnet, if_link)
 		TAILQ_FOREACH(mdc2, &mdc->if_addrlist, ifa_list) {
 			if (mdc2->ifa_addr->sa_family == AF_INET6) {
 				fdm = (struct in6_ifaddr *)mdc2;
 				copia = fdm->ia_addr.sin6_addr;
 				/* need for leaving scope_id in the sock_addr */
 				in6_clearscope(&copia);
 				if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia))
 					return 1;
 			}
 		}
 	return 0;
 }
 
 /*
  * IPv6 counterpart of verify_path(): return 1 if a usable route back to
  * 'src' exists (through 'ifp' when one is given), 0 otherwise.
  */
 static int
 verify_path6(struct in6_addr *src, struct ifnet *ifp)
 {
 	struct route_in6 ro;
 	struct sockaddr_in6 *dst;
 	int valid;
 
 	/* Look up a route back to the packet's source address. */
 	bzero(&ro, sizeof(ro));
 	dst = (struct sockaddr_in6 * )&(ro.ro_dst);
 	dst->sin6_family = AF_INET6;
 	dst->sin6_len = sizeof(*dst);
 	dst->sin6_addr = *src;
 	rtalloc_ign((struct route *)&ro, RTF_CLONING);
 
 	/* No route at all: nothing to release, the check fails. */
 	if (ro.ro_rt == NULL)
 		return 0;
 
 	if (ifp != NULL) {
 		/*
 		 * An interface was supplied: it must match the route.
 		 * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
 		 * to support the case of sending packets to an address of
 		 * our own (where the former interface is the first argument
 		 * of if_simloop() (=ifp), the latter is lo0).
 		 */
 		valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
 	} else if (IN6_IS_ADDR_UNSPECIFIED(
 	    &satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
 		/* no ifp provided and only the default route matched */
 		valid = 0;
 	} else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 		/* no ifp provided and this is a blackhole/reject route */
 		valid = 0;
 	} else {
 		/* found valid route */
 		valid = 1;
 	}
 
 	/* Every path that obtained a route releases it here. */
 	RTFREE(ro.ro_rt);
 	return valid;
 }
 /*
  * Hash an IPv6 flow id: XOR of the two last 32-bit words of the
  * destination and source addresses with both ports.  Source and
  * destination contribute symmetrically.
  */
 static __inline int
 hash_packet6(struct ipfw_flow_id *id)
 {
 	u_int32_t hash;
 
 	hash = id->dst_ip6.__u6_addr.__u6_addr32[2];
 	hash ^= id->dst_ip6.__u6_addr.__u6_addr32[3];
 	hash ^= id->src_ip6.__u6_addr.__u6_addr32[2];
 	hash ^= id->src_ip6.__u6_addr.__u6_addr32[3];
 	hash ^= id->dst_port;
 	hash ^= id->src_port;
 	return hash;
 }
 
 /*
  * Return 1 if 'icmp6_type' is one of the ICMPv6 query types listed
  * below, 0 otherwise.
  */
 static int
 is_icmp6_query(int icmp6_type)
 {
 	if (icmp6_type > ICMP6_MAXTYPE)
 		return (0);
 
 	/* plain comparisons: several of these constants may share a value */
 	if (icmp6_type == ICMP6_ECHO_REQUEST)
 		return (1);
 	if (icmp6_type == ICMP6_MEMBERSHIP_QUERY)
 		return (1);
 	if (icmp6_type == ICMP6_WRUREQUEST)
 		return (1);
 	if (icmp6_type == ICMP6_FQDN_QUERY)
 		return (1);
 	if (icmp6_type == ICMP6_NI_QUERY)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Reject an IPv6 packet, consuming args->m.
  *
  * - code == ICMP6_UNREACH_RST and the flow is TCP: answer with a TCP RST
  *   built by tcp_respond() (nothing is sent in reply to a RST).
  * - code != ICMP6_UNREACH_RST: send an ICMPv6 destination-unreachable
  *   error carrying 'code'.
  * - otherwise (RST requested for a non-TCP flow): just free the packet.
  *
  * 'hlen' is the offset from 'ip6' to the TCP header.
  * On every return path args->m is set to NULL, since the mbuf has been
  * either freed or handed to tcp_respond()/icmp6_error().
  */
 static void
 send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
 {
 	struct mbuf *m;
 
 	m = args->m;
 	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *tcp;
 		tcp_seq ack, seq;
 		int flags;
 		struct {
 			struct ip6_hdr ip6;
 			struct tcphdr th;
 		} ti;
 		tcp = (struct tcphdr *)((char *)ip6 + hlen);
 
 		/* never answer a RST with a RST */
 		if ((tcp->th_flags & TH_RST) != 0) {
 			m_freem(m);
 			args->m = NULL;
 			return;
 		}
 
 		/* work on a copy with seq/ack converted to host order */
 		ti.ip6 = *ip6;
 		ti.th = *tcp;
 		ti.th.th_seq = ntohl(ti.th.th_seq);
 		ti.th.th_ack = ntohl(ti.th.th_ack);
 		ti.ip6.ip6_nxt = IPPROTO_TCP;
 
 		if (ti.th.th_flags & TH_ACK) {
 			ack = 0;
 			seq = ti.th.th_ack;
 			flags = TH_RST;
 		} else {
 			ack = ti.th.th_seq;
 			if ((m->m_flags & M_PKTHDR) != 0) {
 				/*
 				 * total new data to ACK is:
 				 * total packet length,
 				 * minus the header length,
 				 * minus the tcp header length.
 				 */
 				ack += m->m_pkthdr.len - hlen
 					- (ti.th.th_off << 2);
 			} else if (ip6->ip6_plen) {
 				ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) -
 				    hlen - (ti.th.th_off << 2);
 			} else {
 				/*
 				 * Payload length unknown: drop the packet.
 				 * Clear args->m as on the other exits so the
 				 * caller is not left with a pointer to the
 				 * freed mbuf.
 				 */
 				m_freem(m);
 				args->m = NULL;
 				return;
 			}
 			if (tcp->th_flags & TH_SYN)
 				ack++;
 			seq = 0;
 			flags = TH_RST|TH_ACK;
 		}
 		bcopy(&ti, ip6, sizeof(ti));
 		/*
 		 * m is only used to recycle the mbuf
 		 * The data in it is never read so we don't need
 		 * to correct the offsets or anything
 		 */
 		tcp_respond(NULL, ip6, tcp, m, ack, seq, flags);
 	} else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
 #if 0
 		/*
 		 * Unlike above, the mbufs need to line up with the ip6 hdr,
 		 * as the contents are read. We need to m_adj() the
 		 * needed amount.
 		 * The mbuf will however be thrown away so we can adjust it.
 		 * Remember we did an m_pullup on it already so we
 		 * can make some assumptions about contiguousness.
 		 */
 		if (args->L3offset)
 			m_adj(m, args->L3offset);
 #endif
 		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
 	} else
 		m_freem(m);
 
 	args->m = NULL;
 }
 
 #endif /* INET6 */
 
 static u_int64_t norule_counter;	/* counter for ipfw_log(NULL...) */
 
 /*
  * Argument helpers for snprintf() on a local character array:
  * SNPARGS(buf, len) expands to the pair "buf + len, room left after len",
  * where the room is 0 once len reaches sizeof(buf);
  * SNP(buf) expands to the pair "buf, sizeof(buf)".
  * Both apply sizeof to buf, so buf must be an array, not a pointer.
  * The arguments are not parenthesized in the expansion, so only simple
  * expressions should be passed.
  */
 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
 #define SNP(buf) buf, sizeof(buf)
 
 /*
  * We enter here when we have a rule with O_LOG.
  * XXX this function alone takes about 2Kbytes of code!
  */
 static void
 ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
     struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
     struct ip *ip)
 {
 	/*
 	 * Build and emit one log line describing a packet.
 	 *
 	 * f        matching rule, or NULL for a packet logged without a
 	 *          rule (action "Refuse", rule number -1).
 	 * hlen     0 means a non-IP packet (logged as "MAC"); otherwise it
 	 *          is used as the offset from ip to the upper-layer header
 	 *          in the IPv6 case.
 	 * args     supplies the ethernet header pointer and the flow id.
 	 * m        only m->m_pkthdr.rcvif is read here.
 	 * oif      if non-NULL the line says "out via <oif>", otherwise
 	 *          "in via <rcvif>" when a receive interface is set.
 	 * offset   fragment offset; ports and ICMP type/code are printed
 	 *          only when it is 0.
 	 * tablearg used as the forward address when an O_FORWARD_IP action
 	 *          has INADDR_ANY as its address.
 	 * ip       start of the IP (v4 or v6) header.
 	 */
 	struct ether_header *eh = args->eh;
 	char *action;
 	int limit_reached = 0;
 	char action2[40], proto[128], fragment[32];
 
 	fragment[0] = '\0';
 	proto[0] = '\0';
 
 	if (f == NULL) {	/* bogus pkt */
 		/* rate-limit with the global counter against verbose_limit */
 		if (verbose_limit != 0 && norule_counter >= verbose_limit)
 			return;
 		norule_counter++;
 		if (norule_counter == verbose_limit)
 			limit_reached = verbose_limit;
 		action = "Refuse";
 	} else {	/* O_LOG is the first action, find the real one */
 		ipfw_insn *cmd = ACTION_PTR(f);
 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
 
 		/* per-rule budget: stop once log_left has run down to 0 */
 		if (l->max_log != 0 && l->log_left == 0)
 			return;
 		l->log_left--;
 		if (l->log_left == 0)
 			limit_reached = l->max_log;
 		cmd += F_LEN(cmd);	/* point to first action */
 		if (cmd->opcode == O_ALTQ) {
 			ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 			snprintf(SNPARGS(action2, 0), "Altq %d",
 				altq->qid);
 			cmd += F_LEN(cmd);
 		}
 		if (cmd->opcode == O_PROB)
 			cmd += F_LEN(cmd);
 
 		if (cmd->opcode == O_TAG)
 			cmd += F_LEN(cmd);
 
 		/*
 		 * Default to the scratch buffer; cases with a fixed name
 		 * repoint action at a string literal instead.
 		 */
 		action = action2;
 		switch (cmd->opcode) {
 		case O_DENY:
 			action = "Deny";
 			break;
 
 		case O_REJECT:
 			if (cmd->arg1==ICMP_REJECT_RST)
 				action = "Reset";
 			else if (cmd->arg1==ICMP_UNREACH_HOST)
 				action = "Reject";
 			else
 				snprintf(SNPARGS(action2, 0), "Unreach %d",
 					cmd->arg1);
 			break;
 
 		case O_UNREACH6:
 			if (cmd->arg1==ICMP6_UNREACH_RST)
 				action = "Reset";
 			else
 				snprintf(SNPARGS(action2, 0), "Unreach %d",
 					cmd->arg1);
 			break;
 
 		case O_ACCEPT:
 			action = "Accept";
 			break;
 		case O_COUNT:
 			action = "Count";
 			break;
 		case O_DIVERT:
 			snprintf(SNPARGS(action2, 0), "Divert %d",
 				cmd->arg1);
 			break;
 		case O_TEE:
 			snprintf(SNPARGS(action2, 0), "Tee %d",
 				cmd->arg1);
 			break;
 		case O_SKIPTO:
 			snprintf(SNPARGS(action2, 0), "SkipTo %d",
 				cmd->arg1);
 			break;
 		case O_PIPE:
 			snprintf(SNPARGS(action2, 0), "Pipe %d",
 				cmd->arg1);
 			break;
 		case O_QUEUE:
 			snprintf(SNPARGS(action2, 0), "Queue %d",
 				cmd->arg1);
 			break;
 		case O_FORWARD_IP: {
 			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
 			int len;
 			struct in_addr dummyaddr;
 			/* INADDR_ANY means "take the address from tablearg" */
 			if (sa->sa.sin_addr.s_addr == INADDR_ANY)
 				dummyaddr.s_addr = htonl(tablearg);
 			else
 				dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
 
 			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
 				inet_ntoa(dummyaddr));
 
 			if (sa->sa.sin_port)
 				snprintf(SNPARGS(action2, len), ":%d",
 				    sa->sa.sin_port);
 			}
 			break;
 		case O_NETGRAPH:
 			snprintf(SNPARGS(action2, 0), "Netgraph %d",
 				cmd->arg1);
 			break;
 		case O_NGTEE:
 			snprintf(SNPARGS(action2, 0), "Ngtee %d",
 				cmd->arg1);
 			break;
 		case O_NAT:
 			action = "Nat";
  			break;
 		default:
 			action = "UNKNOWN";
 			break;
 		}
 	}
 
 	if (hlen == 0) {	/* non-ip */
 		snprintf(SNPARGS(proto, 0), "MAC");
 
 	} else {
 		int len;
 		char src[48], dst[48];
 		struct icmphdr *icmp;
 		struct tcphdr *tcp;
 		struct udphdr *udp;
 #ifdef INET6
 		struct ip6_hdr *ip6 = NULL;
 		struct icmp6_hdr *icmp6;
 #endif
 		src[0] = '\0';
 		dst[0] = '\0';
 #ifdef INET6
 		if (IS_IP6_FLOW_ID(&(args->f_id))) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			/* IPv6 addresses are printed in brackets */
 			snprintf(src, sizeof(src), "[%s]",
 			    ip6_sprintf(ip6buf, &args->f_id.src_ip6));
 			snprintf(dst, sizeof(dst), "[%s]",
 			    ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
 
 			ip6 = (struct ip6_hdr *)ip;
 			tcp = (struct tcphdr *)(((char *)ip) + hlen);
 			udp = (struct udphdr *)(((char *)ip) + hlen);
 		} else
 #endif
 		{
 			tcp = L3HDR(struct tcphdr, ip);
 			udp = L3HDR(struct udphdr, ip);
 
 			inet_ntoa_r(ip->ip_src, src);
 			inet_ntoa_r(ip->ip_dst, dst);
 		}
 
 		/* "<PROTO> src[:port] dst[:port]"; ports only if offset == 0 */
 		switch (args->f_id.proto) {
 		case IPPROTO_TCP:
 			len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
 			if (offset == 0)
 				snprintf(SNPARGS(proto, len), ":%d %s:%d",
 				    ntohs(tcp->th_sport),
 				    dst,
 				    ntohs(tcp->th_dport));
 			else
 				snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 
 		case IPPROTO_UDP:
 			len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
 			if (offset == 0)
 				snprintf(SNPARGS(proto, len), ":%d %s:%d",
 				    ntohs(udp->uh_sport),
 				    dst,
 				    ntohs(udp->uh_dport));
 			else
 				snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 
 		case IPPROTO_ICMP:
 			icmp = L3HDR(struct icmphdr, ip);
 			if (offset == 0)
 				len = snprintf(SNPARGS(proto, 0),
 				    "ICMP:%u.%u ",
 				    icmp->icmp_type, icmp->icmp_code);
 			else
 				len = snprintf(SNPARGS(proto, 0), "ICMP ");
 			len += snprintf(SNPARGS(proto, len), "%s", src);
 			snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 #ifdef INET6
 		case IPPROTO_ICMPV6:
 			icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
 			if (offset == 0)
 				len = snprintf(SNPARGS(proto, 0),
 				    "ICMPv6:%u.%u ",
 				    icmp6->icmp6_type, icmp6->icmp6_code);
 			else
 				len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
 			len += snprintf(SNPARGS(proto, len), "%s", src);
 			snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 #endif
 		default:
 			len = snprintf(SNPARGS(proto, 0), "P:%d %s",
 			    args->f_id.proto, src);
 			snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 		}
 
 		/* append fragment details, if the packet is a fragment */
 #ifdef INET6
 		if (IS_IP6_FLOW_ID(&(args->f_id))) {
 			if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
 				snprintf(SNPARGS(fragment, 0),
 				    " (frag %08x:%d@%d%s)",
 				    args->f_id.frag_id6,
 				    ntohs(ip6->ip6_plen) - hlen,
 				    ntohs(offset & IP6F_OFF_MASK) << 3,
 				    (offset & IP6F_MORE_FRAG) ? "+" : "");
 		} else
 #endif
 		{
 			int ip_off, ip_len;
 			if (eh != NULL) { /* layer 2 packets are as on the wire */
 				ip_off = ntohs(ip->ip_off);
 				ip_len = ntohs(ip->ip_len);
 			} else {
 				/* otherwise the fields are used as they are */
 				ip_off = ip->ip_off;
 				ip_len = ip->ip_len;
 			}
 			if (ip_off & (IP_MF | IP_OFFMASK))
 				snprintf(SNPARGS(fragment, 0),
 				    " (frag %d:%d@%d%s)",
 				    ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
 				    offset << 3,
 				    (ip_off & IP_MF) ? "+" : "");
 		}
 	}
 	/* emit the line, with interface information when available */
 	if (oif || m->m_pkthdr.rcvif)
 		log(LOG_SECURITY | LOG_INFO,
 		    "ipfw: %d %s %s %s via %s%s\n",
 		    f ? f->rulenum : -1,
 		    action, proto, oif ? "out" : "in",
 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
 		    fragment);
 	else
 		log(LOG_SECURITY | LOG_INFO,
 		    "ipfw: %d %s %s [no if info]%s\n",
 		    f ? f->rulenum : -1,
 		    action, proto, fragment);
 	/* announce once when this message used up the logging budget */
 	if (limit_reached)
 		log(LOG_SECURITY | LOG_NOTICE,
 		    "ipfw: limit %d reached on entry %d\n",
 		    limit_reached, f ? f->rulenum : -1);
 }
 
 /*
  * IMPORTANT: the hash function for dynamic rules must be commutative
  * in source and destination (ip,port), because rules are bidirectional
  * and we want to find both in the same bucket.
  */
 static __inline int
 hash_packet(struct ipfw_flow_id *id)
 {
 	u_int32_t bucket;
 
 #ifdef INET6
 	if (IS_IP6_FLOW_ID(id)) {
 		bucket = hash_packet6(id);
 	} else
 #endif /* INET6 */
 	{
 		/* XOR keeps the hash symmetric in source and destination */
 		bucket = (id->dst_ip) ^ (id->src_ip) ^
 		    (id->dst_port) ^ (id->src_port);
 	}
 	/* reduce to a bucket index using curr_dyn_buckets - 1 as a mask */
 	bucket &= (curr_dyn_buckets - 1);
 	return bucket;
 }
 
 /**
  * Unlink a dynamic rule from its hash chain and free it.
  * prev is a pointer to the previous entry in the chain (NULL when q is
  * the first one), q is a pointer to the rule to delete, head is the
  * head of the chain (it is assigned to, so it must be an lvalue).
  *
  * On exit q points to the entry that followed the deleted one and
  * head has been updated if the first entry was removed; prev is not
  * changed, so a caller walking the chain must not advance again after
  * using the macro.  For O_LIMIT entries the parent's count is
  * decremented.  dyn_count is decremented and the entry is returned to
  * ipfw_dyn_rule_zone.
  *
  * The expansion is a brace-enclosed block and evaluates its arguments
  * more than once.
  */
 #define UNLINK_DYN_RULE(prev, head, q) {				\
 	ipfw_dyn_rule *old_q = q;					\
 									\
 	/* remove a refcount to the parent */				\
 	if (q->dyn_type == O_LIMIT)					\
 		q->parent->count--;					\
 	DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\
 		(q->id.src_ip), (q->id.src_port),			\
 		(q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); )	\
 	if (prev != NULL)						\
 		prev->next = q = q->next;				\
 	else								\
 		head = q = q->next;					\
 	dyn_count--;							\
 	uma_zfree(ipfw_dyn_rule_zone, old_q); }
 
 /*
  * True when time a is at or before time b.  The test looks at the sign
  * of (a)-(b) converted to int instead of comparing a and b directly
  * (presumably to stay correct across counter wrap-around).
  */
 #define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
 
 /**
  * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
  *
  * If keep_me == NULL, rules are deleted even if not expired,
  * otherwise only expired rules are removed.
  *
  * The value of the second parameter is also used to identify
  * a rule we absolutely do not want to remove (e.g. because we are
  * holding a reference to it -- this is the case with O_LIMIT_PARENT
  * rules). The pointer is only used for comparison, so any non-null
  * value will do.
  */
 static void
 remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
 {
 	/* time of the last run, used to limit expiry scans to one per second */
 	static u_int32_t last_remove = 0;
 
 /* FORCE: delete entries even if they have not expired yet */
 #define FORCE (keep_me == NULL)
 
 	ipfw_dyn_rule *prev, *q;
 	int i, pass = 0, max_pass = 0;
 
 	IPFW_DYN_LOCK_ASSERT();
 
 	if (ipfw_dyn_v == NULL || dyn_count == 0)
 		return;
 	/* do not expire more than once per second, it is useless */
 	if (!FORCE && last_remove == time_uptime)
 		return;
 	last_remove = time_uptime;
 
 	/*
 	 * because O_LIMIT refer to parent rules, during the first pass only
 	 * remove child and mark any pending LIMIT_PARENT, and remove
 	 * them in a second pass.
 	 */
 next_pass:
 	for (i = 0 ; i < curr_dyn_buckets ; i++) {
 		/* no increment clause: q is advanced at "next" or by the unlink */
 		for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) {
 			/*
 			 * Logic can become complex here, so we split tests.
 			 */
 			if (q == keep_me)
 				goto next;
 			if (rule != NULL && rule != q->rule)
 				goto next; /* not the one we are looking for */
 			if (q->dyn_type == O_LIMIT_PARENT) {
 				/*
 				 * handle parent in the second pass,
 				 * record we need one.
 				 */
 				max_pass = 1;
 				if (pass == 0)
 					goto next;
 				if (FORCE && q->count != 0 ) {
 					/* XXX should not happen! */
 					printf("ipfw: OUCH! cannot remove rule,"
 					     " count %d\n", q->count);
 				}
 			} else {
 				if (!FORCE &&
 				    !TIME_LEQ( q->expire, time_uptime ))
 					goto next;
 			}
 			/*
 			 * Unlink unless this is a parent that still has
 			 * sessions counted against it.  The macro advances
 			 * q itself, hence the continue without touching prev.
 			 */
              if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
                      UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
                      continue;
              }
 next:
 			prev=q;
 			q=q->next;
 		}
 	}
 	/* run the second pass only if some O_LIMIT_PARENT entry was seen */
 	if (pass++ < max_pass)
 		goto next_pass;
 }
 
 
 /**
  * lookup a dynamic rule.
  */
 static ipfw_dyn_rule *
 lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
     struct tcphdr *tcp)
 {
 	/*
 	 * stateful ipfw extensions.
 	 * Lookup into dynamic session queue
 	 *
 	 * Searches the bucket selected by hash_packet(pkt) for an entry
 	 * matching pkt in either direction, unlinking expired entries on
 	 * the way.  On a match the entry is moved to the front of its
 	 * bucket and its expire time (and, for TCP, its state) is updated.
 	 *
 	 * match_direction, if not NULL, receives MATCH_FORWARD,
 	 * MATCH_REVERSE or MATCH_NONE.
 	 * tcp, if not NULL, supplies the ack number used to ignore
 	 * out-of-sequence segments while the session is established.
 	 * Returns the matching entry, or NULL if none was found.
 	 * The dynamic-rule lock must be held (see the assert below).
 	 */
 #define MATCH_REVERSE	0
 #define MATCH_FORWARD	1
 #define MATCH_NONE	2
 #define MATCH_UNKNOWN	3
 	int i, dir = MATCH_NONE;
 	ipfw_dyn_rule *prev, *q=NULL;
 
 	IPFW_DYN_LOCK_ASSERT();
 
 	if (ipfw_dyn_v == NULL)
 		goto done;	/* not found */
 	i = hash_packet( pkt );
 	for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) {
 		/* parents that still have sessions are skipped, never expired here */
 		if (q->dyn_type == O_LIMIT_PARENT && q->count)
 			goto next;
 		if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
 			/* the macro advances q; prev stays where it is */
 			UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
 			continue;
 		}
 		if (pkt->proto == q->id.proto &&
 		    q->dyn_type != O_LIMIT_PARENT) {
 			if (IS_IP6_FLOW_ID(pkt)) {
 			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
 				&(q->id.src_ip6)) &&
 			    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
 				&(q->id.dst_ip6)) &&
 			    pkt->src_port == q->id.src_port &&
 			    pkt->dst_port == q->id.dst_port ) {
 				dir = MATCH_FORWARD;
 				break;
 			    }
 			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
 				    &(q->id.dst_ip6)) &&
 				IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
 				    &(q->id.src_ip6)) &&
 				pkt->src_port == q->id.dst_port &&
 				pkt->dst_port == q->id.src_port ) {
 				    dir = MATCH_REVERSE;
 				    break;
 			    }
 			} else {
 			    if (pkt->src_ip == q->id.src_ip &&
 				pkt->dst_ip == q->id.dst_ip &&
 				pkt->src_port == q->id.src_port &&
 				pkt->dst_port == q->id.dst_port ) {
 				    dir = MATCH_FORWARD;
 				    break;
 			    }
 			    if (pkt->src_ip == q->id.dst_ip &&
 				pkt->dst_ip == q->id.src_ip &&
 				pkt->src_port == q->id.dst_port &&
 				pkt->dst_port == q->id.src_port ) {
 				    dir = MATCH_REVERSE;
 				    break;
 			    }
 			}
 		}
 next:
 		prev = q;
 		q = q->next;
 	}
 	if (q == NULL)
 		goto done; /* q = NULL, not found */
 
 	/* move the matching entry to the head of its bucket */
 	if ( prev != NULL) { /* found and not in front */
 		prev->next = q->next;
 		q->next = ipfw_dyn_v[i];
 		ipfw_dyn_v[i] = q;
 	}
 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
 		u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
 
 		/*
 		 * q->state accumulates the flags seen so far: forward
 		 * direction in the low byte, reverse direction shifted
 		 * left by 8 bits.
 		 */
 #define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
 #define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
 		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
 		switch (q->state) {
 		case TH_SYN:				/* opening */
 			q->expire = time_uptime + dyn_syn_lifetime;
 			break;
 
 		case BOTH_SYN:			/* move to established */
 		case BOTH_SYN | TH_FIN :	/* one side tries to close */
 		case BOTH_SYN | (TH_FIN << 8) :
  			if (tcp) {
 #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
 			    u_int32_t ack = ntohl(tcp->th_ack);
 			    /*
 			     * Track the highest ack seen per direction; a
 			     * segment acking less than that leaves the
 			     * expire time untouched.
 			     */
 			    if (dir == MATCH_FORWARD) {
 				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
 				    q->ack_fwd = ack;
 				else { /* ignore out-of-sequence */
 				    break;
 				}
 			    } else {
 				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
 				    q->ack_rev = ack;
 				else { /* ignore out-of-sequence */
 				    break;
 				}
 			    }
 			}
 			q->expire = time_uptime + dyn_ack_lifetime;
 			break;
 
 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
 			/* note: this clamps the global lifetime variable itself */
 			if (dyn_fin_lifetime >= dyn_keepalive_period)
 				dyn_fin_lifetime = dyn_keepalive_period - 1;
 			q->expire = time_uptime + dyn_fin_lifetime;
 			break;
 
 		default:
 #if 0
 			/*
 			 * reset or some invalid combination, but can also
 			 * occur if we use keep-state the wrong way.
 			 */
 			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
 				printf("invalid state: 0x%x\n", q->state);
 #endif
 			/* note: this clamps the global lifetime variable itself */
 			if (dyn_rst_lifetime >= dyn_keepalive_period)
 				dyn_rst_lifetime = dyn_keepalive_period - 1;
 			q->expire = time_uptime + dyn_rst_lifetime;
 			break;
 		}
 	} else if (pkt->proto == IPPROTO_UDP) {
 		q->expire = time_uptime + dyn_udp_lifetime;
 	} else {
 		/* other protocols */
 		q->expire = time_uptime + dyn_short_lifetime;
 	}
 done:
 	if (match_direction)
 		*match_direction = dir;
 	return q;
 }
 
 /*
  * Locking wrapper around lookup_dyn_rule_locked().
  * On a hit the entry is returned with the dynamic-rule lock still held
  * and the caller is responsible for releasing it; on a miss the lock is
  * dropped here and NULL is returned.
  */
 static ipfw_dyn_rule *
 lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
     struct tcphdr *tcp)
 {
 	ipfw_dyn_rule *found;
 
 	IPFW_DYN_LOCK();
 	found = lookup_dyn_rule_locked(pkt, match_direction, tcp);
 	if (found != NULL) {
 		/* NB: return table locked when found is not NULL */
 		return found;
 	}
 	IPFW_DYN_UNLOCK();
 	return NULL;
 }
 
 /*
  * (Re)allocate the dynamic-rule hash table so that it has dyn_buckets
  * entries.  Any existing table is freed first, so its chains must be
  * empty when this is called.
  *
  * An unusable dyn_buckets value (zero, or not a power of 2) is reset to
  * curr_dyn_buckets and the current table is left alone.  If memory is
  * short the size is halved until the allocation succeeds or the size
  * drops to 2; ipfw_dyn_v may therefore be NULL on return.
  */
 static void
 realloc_dynamic_table(void)
 {
 	IPFW_DYN_LOCK_ASSERT();
 
 	/*
 	 * Try reallocation, make sure we have a power of 2 and do
 	 * not allow more than 64k entries. In case of overflow,
 	 * default to 1024.
 	 */
 
 	if (dyn_buckets > 65536)
 		dyn_buckets = 1024;
 	/*
 	 * Zero passes the (x & (x-1)) test but would give a table with no
 	 * buckets and an all-ones mask in hash_packet(), so reject it
 	 * together with the values that are not a power of 2.
 	 */
 	if (dyn_buckets == 0 ||
 	    (dyn_buckets & (dyn_buckets-1)) != 0) {
 		dyn_buckets = curr_dyn_buckets; /* reset */
 		return;
 	}
 	curr_dyn_buckets = dyn_buckets;
 	if (ipfw_dyn_v != NULL)
 		free(ipfw_dyn_v, M_IPFW);
 	for (;;) {
 		ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
 		       M_IPFW, M_NOWAIT | M_ZERO);
 		if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2)
 			break;
 		/* allocation failed: retry with half as many buckets */
 		curr_dyn_buckets /= 2;
 	}
 }
 
 /**
  * Install state of type 'type' for a dynamic session.
  * The hash table contains two type of rules:
  * - regular rules (O_KEEP_STATE)
  * - rules for sessions with limited number of sess per user
  *   (O_LIMIT). When they are created, the parent is
  *   increased by 1, and decreased on delete. In this case,
  *   the third parameter is the parent rule and not the chain.
  * - "parent" rules for the above (O_LIMIT_PARENT).
  */
 static ipfw_dyn_rule *
 add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
 {
 	ipfw_dyn_rule *entry;
 	int bucket;
 
 	IPFW_DYN_LOCK_ASSERT();
 
 	/*
 	 * (Re)build the hash table if there is none yet, or if a new size
 	 * was requested and the table is currently empty.
 	 */
 	if (ipfw_dyn_v == NULL ||
 	    (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) {
 		realloc_dynamic_table();
 		if (ipfw_dyn_v == NULL)
 			return NULL; /* failed ! */
 	}
 	bucket = hash_packet(id);
 
 	entry = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
 	if (entry == NULL) {
 		printf ("ipfw: sorry cannot allocate state\n");
 		return NULL;
 	}
 
 	if (dyn_type == O_LIMIT) {
 		/*
 		 * For O_LIMIT the 'rule' argument carries the parent
 		 * dynamic rule: take a reference on it, remember it,
 		 * and continue with the parent's static rule.
 		 */
 		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
 
 		if (parent->dyn_type != O_LIMIT_PARENT)
 			panic("invalid parent");
 		parent->count++;
 		entry->parent = parent;
 		rule = parent->rule;
 	}
 
 	/* fill in the new entry */
 	entry->id = *id;
 	entry->rule = rule;
 	entry->dyn_type = dyn_type;
 	entry->expire = time_uptime + dyn_syn_lifetime;
 	entry->pcnt = 0;
 	entry->bcnt = 0;
 	entry->count = 0;
 	entry->bucket = bucket;
 
 	/* link it at the head of its bucket */
 	entry->next = ipfw_dyn_v[bucket];
 	ipfw_dyn_v[bucket] = entry;
 	dyn_count++;
 	DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n",
 	   dyn_type,
 	   (entry->id.src_ip), (entry->id.src_port),
 	   (entry->id.dst_ip), (entry->id.dst_port),
 	   dyn_count ); )
 	return entry;
 }
 
 /**
  * lookup dynamic parent rule using pkt and rule as search keys.
  * If the lookup fails, then install one.
  */
 static ipfw_dyn_rule *
 lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
 {
 	ipfw_dyn_rule *q;
 	int i;
 
 	IPFW_DYN_LOCK_ASSERT();
 
 	if (ipfw_dyn_v) {
 		int is_v6 = IS_IP6_FLOW_ID(pkt);
 		i = hash_packet( pkt );
 		for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next)
 			if (q->dyn_type == O_LIMIT_PARENT &&
 			    rule== q->rule &&
 			    pkt->proto == q->id.proto &&
 			    pkt->src_port == q->id.src_port &&
 			    pkt->dst_port == q->id.dst_port &&
 			    (
 				(is_v6 &&
 				 IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
 					&(q->id.src_ip6)) &&
 				 IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
 					&(q->id.dst_ip6))) ||
 				(!is_v6 &&
 				 pkt->src_ip == q->id.src_ip &&
 				 pkt->dst_ip == q->id.dst_ip)
 			    )
 			) {
 				q->expire = time_uptime + dyn_short_lifetime;
 				DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
 				return q;
 			}
 	}
 	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
 }
 
 /**
  * Install dynamic state for rule type cmd->o.opcode
  *
  * Returns 1 (failure) if state is not installed because of errors or because
  * session limitations are enforced.
  */
 static int
 install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
     struct ip_fw_args *args, uint32_t tablearg)
 {
 	/*
 	 * rule      rule that requested the state.
 	 * cmd       the O_KEEP_STATE or O_LIMIT instruction; for O_LIMIT
 	 *           its limit_mask and conn_limit fields are used.
 	 * args      args->f_id is the flow the state is installed for.
 	 * tablearg  replaces conn_limit when it is IP_FW_TABLEARG.
 	 *
 	 * The dynamic-rule lock is taken here and released on every
 	 * return path.
 	 */
 	static int last_log;	/* rate-limits messages to one per second */
 	ipfw_dyn_rule *q;
 	struct in_addr da;
 	char src[48], dst[48];
 
 	src[0] = '\0';
 	dst[0] = '\0';
 
 	DEB(
 	printf("ipfw: %s: type %d 0x%08x %u -> 0x%08x %u\n",
 	    __func__, cmd->o.opcode,
 	    (args->f_id.src_ip), (args->f_id.src_port),
 	    (args->f_id.dst_ip), (args->f_id.dst_port));
 	)
 
 	IPFW_DYN_LOCK();
 
 	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
 
 	if (q != NULL) {	/* should never occur */
 		if (last_log != time_uptime) {
 			last_log = time_uptime;
 			printf("ipfw: %s: entry already present, done\n",
 			    __func__);
 		}
 		IPFW_DYN_UNLOCK();
 		return (0);
 	}
 
 	if (dyn_count >= dyn_max)
 		/* Run out of slots, try to remove any expired rule. */
 		/* the dummy non-NULL keep_me limits removal to expired rules */
 		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
 
 	if (dyn_count >= dyn_max) {
 		if (last_log != time_uptime) {
 			last_log = time_uptime;
 			printf("ipfw: %s: Too many dynamic rules\n", __func__);
 		}
 		IPFW_DYN_UNLOCK();
 		return (1);	/* cannot install, notify caller */
 	}
 
 	switch (cmd->o.opcode) {
 	case O_KEEP_STATE:	/* bidir rule */
 		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
 		break;
 
 	case O_LIMIT: {		/* limit number of sessions */
 		struct ipfw_flow_id id;
 		ipfw_dyn_rule *parent;
 		uint32_t conn_limit;
 		uint16_t limit_mask = cmd->limit_mask;
 
 		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
 		    tablearg : cmd->conn_limit;
 		  
 		DEB(
 		if (cmd->conn_limit == IP_FW_TABLEARG)
 			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
 			    "(tablearg)\n", __func__, conn_limit);
 		else
 			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
 			    __func__, conn_limit);
 		)
 
 		/*
 		 * Build the key of the parent entry: only the fields
 		 * selected by limit_mask are copied from the packet.
 		 */
 		id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
 		id.proto = args->f_id.proto;
 		id.addr_type = args->f_id.addr_type;
 
 		if (IS_IP6_FLOW_ID (&(args->f_id))) {
 			if (limit_mask & DYN_SRC_ADDR)
 				id.src_ip6 = args->f_id.src_ip6;
 			if (limit_mask & DYN_DST_ADDR)
 				id.dst_ip6 = args->f_id.dst_ip6;
 		} else {
 			if (limit_mask & DYN_SRC_ADDR)
 				id.src_ip = args->f_id.src_ip;
 			if (limit_mask & DYN_DST_ADDR)
 				id.dst_ip = args->f_id.dst_ip;
 		}
 		if (limit_mask & DYN_SRC_PORT)
 			id.src_port = args->f_id.src_port;
 		if (limit_mask & DYN_DST_PORT)
 			id.dst_port = args->f_id.dst_port;
 		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
 			printf("ipfw: %s: add parent failed\n", __func__);
 			IPFW_DYN_UNLOCK();
 			return (1);
 		}
 
 		if (parent->count >= conn_limit) {
 			/* See if we can remove some expired rule. */
 			remove_dyn_rule(rule, parent);
 			/* still at the limit: refuse the new session */
 			if (parent->count >= conn_limit) {
 				if (fw_verbose && last_log != time_uptime) {
 					last_log = time_uptime;
 #ifdef INET6
 					/*
 					 * XXX IPv6 flows are not
 					 * supported yet.
 					 */
 					if (IS_IP6_FLOW_ID(&(args->f_id))) {
 						char ip6buf[INET6_ADDRSTRLEN];
 						snprintf(src, sizeof(src),
 						    "[%s]", ip6_sprintf(ip6buf,
 							&args->f_id.src_ip6));
 						snprintf(dst, sizeof(dst),
 						    "[%s]", ip6_sprintf(ip6buf,
 							&args->f_id.dst_ip6));
 					} else
 #endif
 					{
 						da.s_addr =
 						    htonl(args->f_id.src_ip);
 						inet_ntoa_r(da, src);
 						da.s_addr =
 						    htonl(args->f_id.dst_ip);
 						inet_ntoa_r(da, dst);
 					}
 					log(LOG_SECURITY | LOG_DEBUG,
 					    "%s %s:%u -> %s:%u, %s\n",
 					    "drop session",
 					    src, (args->f_id.src_port),
 					    dst, (args->f_id.dst_port),
 					    "too many entries");
 				}
 				IPFW_DYN_UNLOCK();
 				return (1);
 			}
 		}
 		/* for O_LIMIT, add_dyn_rule() takes the parent in place of the rule */
 		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
 		break;
 	}
 	default:
 		printf("ipfw: %s: unknown dynamic rule type %u\n",
 		    __func__, cmd->o.opcode);
 		IPFW_DYN_UNLOCK();
 		return (1);
 	}
 
 	/* XXX just set lifetime */
 	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
 
 	IPFW_DYN_UNLOCK();
 	return (0);
 }
 
 /*
  * Generate a TCP packet, containing either a RST or a keepalive.
  * When flags & TH_RST, we are sending a RST packet, because of a
  * "reset" action matched the packet.
  * Otherwise we are sending a keepalive, and flags & TH_SYN selects its
  * direction (forward if set, reverse if clear).
  * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
  * so that MAC can label the reply appropriately.
  */
 static struct mbuf *
 send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
     u_int32_t ack, int flags)
 {
 	/*
 	 * Despite the name, the packet is only built here: the new mbuf
 	 * is returned to the caller, or NULL if none could be allocated.
 	 * It holds an IP header without options followed by a TCP header
 	 * and is marked M_SKIP_FIREWALL.
 	 */
 	struct mbuf *m;
 	struct ip *ip;
 	struct tcphdr *tcp;
 
 	MGETHDR(m, M_DONTWAIT, MT_DATA);
 	if (m == 0)
 		return (NULL);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 
 #ifdef MAC
 	if (replyto != NULL)
 		mac_create_mbuf_netlayer(replyto, m);
 	else
 		mac_create_mbuf_from_firewall(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
 	/* skip max_linkhdr bytes at the front of the buffer */
 	m->m_data += max_linkhdr;
 
 	ip = mtod(m, struct ip *);
 	bzero(ip, m->m_len);
 	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
 	ip->ip_p = IPPROTO_TCP;
 	tcp->th_off = 5;
 	/*
 	 * Assume we are sending a RST (or a keepalive in the reverse
 	 * direction), swap src and destination addresses and ports.
 	 */
 	ip->ip_src.s_addr = htonl(id->dst_ip);
 	ip->ip_dst.s_addr = htonl(id->src_ip);
 	tcp->th_sport = htons(id->dst_port);
 	tcp->th_dport = htons(id->src_port);
 	if (flags & TH_RST) {	/* we are sending a RST */
 		if (flags & TH_ACK) {
 			/* replied-to segment had ACK: RST uses its ack as seq */
 			tcp->th_seq = htonl(ack);
 			tcp->th_ack = htonl(0);
 			tcp->th_flags = TH_RST;
 		} else {
 			/* no ACK: acknowledge seq, plus one if SYN was set */
 			if (flags & TH_SYN)
 				seq++;
 			tcp->th_seq = htonl(0);
 			tcp->th_ack = htonl(seq);
 			tcp->th_flags = TH_RST | TH_ACK;
 		}
 	} else {
 		/*
 		 * We are sending a keepalive. flags & TH_SYN determines
 		 * the direction, forward if set, reverse if clear.
 		 * NOTE: seq and ack are always assumed to be correct
 		 * as set by the caller. This may be confusing...
 		 */
 		if (flags & TH_SYN) {
 			/*
 			 * we have to rewrite the correct addresses!
 			 */
 			ip->ip_dst.s_addr = htonl(id->dst_ip);
 			ip->ip_src.s_addr = htonl(id->src_ip);
 			tcp->th_dport = htons(id->dst_port);
 			tcp->th_sport = htons(id->src_port);
 		}
 		tcp->th_seq = htonl(seq);
 		tcp->th_ack = htonl(ack);
 		tcp->th_flags = TH_ACK;
 	}
 	/*
 	 * set ip_len to the payload size so we can compute
 	 * the tcp checksum on the pseudoheader
 	 * XXX check this, could save a couple of words ?
 	 */
 	ip->ip_len = htons(sizeof(struct tcphdr));
 	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
 	/*
 	 * now fill fields left out earlier
 	 */
 	ip->ip_ttl = ip_defttl;
 	/* stored without byte swapping, unlike the temporary value above */
 	ip->ip_len = m->m_pkthdr.len;
 	m->m_flags |= M_SKIP_FIREWALL;
 	return (m);
 }
 
 /*
  * sends a reject message, consuming the mbuf passed as an argument.
  */
 static void
 send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
 {
 
 #if 0
 	/*
 	 * XXX When ip is not guaranteed to be at mtod() we will
 	 * need to account for this.
 	 * The mbuf will however be thrown away so we can adjust it.
 	 * Remember we did an m_pullup on it already so we
 	 * can make some assumptions about contiguousness.
 	 */
 	if (args->L3offset)
 		m_adj(m, args->L3offset);
 #endif
 	if (code != ICMP_REJECT_RST) {
 		/*
 		 * Send an ICMP unreach; the packet is handed over to
 		 * icmp_error().  We need the IP header in host order for
 		 * icmp_error(), so convert it when the packet came in
 		 * with an ethernet header.
 		 */
 		if (args->eh != NULL) {
 			ip->ip_len = ntohs(ip->ip_len);
 			ip->ip_off = ntohs(ip->ip_off);
 		}
 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
 		args->m = NULL;
 		return;
 	}
 
 	/* A reset was requested: only TCP can be answered with a RST. */
 	if (args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *const tcp =
 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
 
 		/* never answer a RST with a RST */
 		if ((tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *reply;
 
 			reply = send_pkt(args->m, &(args->f_id),
 				ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 				tcp->th_flags | TH_RST);
 			if (reply != NULL)
 				ip_output(reply, NULL, NULL, 0, NULL, NULL);
 		}
 	}
 
 	/* In the reset case the original packet is always freed here. */
 	m_freem(args->m);
 	args->m = NULL;
 }
 
 /**
  *
  * Given an ip_fw *, lookup_next_rule will return a pointer
  * to the next rule, which can be either the jump
  * target (for skipto instructions) or the next one in the list (in
  * all other cases including a missing jump target).
  * The result is also written in the "next_rule" field of the rule.
  * Backward jumps are not allowed, so start looking from the next
  * rule...
  *
  * This never returns NULL -- in case we do not have an exact match,
  * the next rule is returned. When the ruleset is changed,
  * pointers are flushed so we are always correct.
  */
 
 static struct ip_fw *
 lookup_next_rule(struct ip_fw *me)
 {
 	struct ip_fw *target = NULL;
 	ipfw_insn *cmd = ACTION_PTR(me);
 
 	/* step over the optional log/altq/tag prefixes to reach the action */
 	if (cmd->opcode == O_LOG)
 		cmd += F_LEN(cmd);
 	if (cmd->opcode == O_ALTQ)
 		cmd += F_LEN(cmd);
 	if (cmd->opcode == O_TAG)
 		cmd += F_LEN(cmd);
 
 	if (cmd->opcode == O_SKIPTO) {
 		/* first following rule numbered at or above the target */
 		target = me->next;
 		while (target != NULL && target->rulenum < cmd->arg1)
 			target = target->next;
 	}
 
 	/* failure or not a skipto */
 	if (target == NULL)
 		target = me->next;
 
 	/* cache the answer in the rule itself */
 	me->next_rule = target;
 	return target;
 }
 
 /*
  * Add the prefix addr/mlen with payload 'value' to lookup table 'tbl'
  * of chain 'ch'.  The address is masked to the prefix before insertion.
  *
  * Returns 0 on success, EINVAL if tbl is out of range, ENOMEM if the
  * entry cannot be allocated, EEXIST if rnh_addaddr() refuses the entry.
  * The chain write lock is taken only around the radix tree update.
  */
 static int
 add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
     uint8_t mlen, uint32_t value)
 {
 	struct radix_node_head *rnh;
 	struct table_entry *ent;
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
 	rnh = ch->tables[tbl];
 	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
 	if (ent == NULL)
 		return (ENOMEM);
 	ent->value = value;
 	ent->addr.sin_len = ent->mask.sin_len = 8;
 	/* mlen == 0 gets an all-zero mask without shifting by 32 */
 	ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
 	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
 	/*
 	 * Lock the chain whose table is being modified, as
 	 * del_table_entry() does, rather than the global layer3_chain.
 	 */
 	IPFW_WLOCK(ch);
 	if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) ==
 	    NULL) {
 		IPFW_WUNLOCK(ch);
 		free(ent, M_IPFW_TBL);
 		return (EEXIST);
 	}
 	IPFW_WUNLOCK(ch);
 	return (0);
 }
 
 /*
  * Remove the prefix addr/mlen from lookup table "tbl" of chain "ch".
  *
  * Returns 0 on success, EINVAL if the table number is out of range or
  * the mask length exceeds 32, and ESRCH if rnh_deladdr() finds nothing
  * to remove.
  */
 static int
 del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
     uint8_t mlen)
 {
 	struct radix_node_head *rnh;
 	struct table_entry *ent;
 	struct sockaddr_in sa, mask;
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
 	/* A longer mask would make the shift count below negative. */
 	if (mlen > 32)
 		return (EINVAL);
 	rnh = ch->tables[tbl];
 	sa.sin_len = mask.sin_len = 8;
 	/* Unsigned shift: 1 << 31 (mlen == 1) overflows a signed int. */
 	mask.sin_addr.s_addr = htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
 	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
 	IPFW_WLOCK(ch);
 	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
 	if (ent == NULL) {
 		IPFW_WUNLOCK(ch);
 		return (ESRCH);
 	}
 	IPFW_WUNLOCK(ch);
 	/* The entry is unlinked; release it after dropping the lock. */
 	free(ent, M_IPFW_TBL);
 	return (0);
 }
 
 /*
  * Tree-walk callback: unlink one node from the radix head passed in
  * "arg" and free the entry that rnh_deladdr() hands back.
  * Always returns 0.
  */
 static int
 flush_table_entry(struct radix_node *rn, void *arg)
 {
 	struct radix_node_head * const head = arg;
 	struct table_entry *victim;
 
 	victim = (struct table_entry *)
 	    head->rnh_deladdr(rn->rn_key, rn->rn_mask, head);
 	if (victim == NULL)
 		return (0);
 	free(victim, M_IPFW_TBL);
 	return (0);
 }
 
 /*
  * Empty table "tbl" of chain "ch" by walking it with
  * flush_table_entry().  The chain write lock is asserted.
  * Returns EINVAL for an out-of-range table number, 0 otherwise.
  */
 static int
 flush_table(struct ip_fw_chain *ch, uint16_t tbl)
 {
 	struct radix_node_head *head;
 
 	IPFW_WLOCK_ASSERT(ch);
 
 	if (tbl < IPFW_TABLES_MAX) {
 		head = ch->tables[tbl];
 		KASSERT(head != NULL, ("NULL IPFW table"));
 		head->rnh_walktree(head, flush_table_entry, head);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Empty every table of chain "ch".  The chain write lock is asserted.
  */
 static void
 flush_tables(struct ip_fw_chain *ch)
 {
 	uint16_t idx;
 
 	IPFW_WLOCK_ASSERT(ch);
 
 	idx = 0;
 	while (idx < IPFW_TABLES_MAX) {
 		flush_table(ch, idx);
 		idx++;
 	}
 }
 
 /*
  * Create the radix head for each table of chain "ch".
  * Returns 0 on success; if rn_inithead() reports failure, the tables
  * set up so far are flushed and ENOMEM is returned.
  */
 static int
 init_tables(struct ip_fw_chain *ch)
 {
 	int slot;
 	uint16_t done;
 
 	for (slot = 0; slot < IPFW_TABLES_MAX; slot++) {
 		if (rn_inithead((void **)&ch->tables[slot], 32))
 			continue;
 		/* This head could not be set up: undo the earlier ones. */
 		for (done = 0; done < slot; done++)
 			(void) flush_table(ch, done);
 		return (ENOMEM);
 	}
 	return (0);
 }
 
 /*
  * Look "addr" up in table "tbl" of chain "ch".
  * Returns 1 and stores the entry's value in *val when rnh_lookup()
  * finds an entry; returns 0 (leaving *val untouched) when it does not
  * or when the table number is out of range.
  */
 static int
 lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
     uint32_t *val)
 {
 	struct radix_node_head *head;
 	struct table_entry *hit;
 	struct sockaddr_in key;
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (0);
 	head = ch->tables[tbl];
 	key.sin_len = 8;
 	key.sin_addr.s_addr = addr;
 	hit = (struct table_entry *)(head->rnh_lookup(&key, NULL, head));
 	if (hit == NULL)
 		return (0);
 	*val = hit->value;
 	return (1);
 }
 
 /*
  * Tree-walk callback: bump the 32-bit counter that "arg" points to.
  * The node itself is not examined.  Always returns 0.
  */
 static int
 count_table_entry(struct radix_node *rn, void *arg)
 {
 	u_int32_t * const counter = arg;
 
 	*counter += 1;
 	return (0);
 }
 
 /*
  * Count the entries of table "tbl" of chain "ch" into *cnt by walking
  * the table with count_table_entry().
  * Returns EINVAL for an out-of-range table number, 0 otherwise.
  */
 static int
 count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
 {
 	struct radix_node_head *head;
 
 	if (tbl < IPFW_TABLES_MAX) {
 		head = ch->tables[tbl];
 		*cnt = 0;
 		head->rnh_walktree(head, count_table_entry, cnt);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Tree-walk callback: copy one table entry into the next free slot of
  * the ipfw_table passed in "arg".
  * Returns 1 without copying once cnt has reached size, 0 otherwise.
  */
 static int
 dump_table_entry(struct radix_node *rn, void *arg)
 {
 	struct table_entry * const src = (struct table_entry *)rn;
 	ipfw_table * const out = arg;
 	ipfw_table_entry *slot;
 
 	if (out->cnt == out->size)
 		return (1);
 	slot = &out->ent[out->cnt];
 	slot->tbl = out->tbl;
 	/* Derive the mask length from the lowest set bit of the mask. */
 	slot->masklen = in_nullhost(src->mask.sin_addr) ? 0 :
 	    33 - ffs(ntohl(src->mask.sin_addr.s_addr));
 	slot->addr = src->addr.sin_addr.s_addr;
 	slot->value = src->value;
 	out->cnt++;
 	return (0);
 }
 
 /*
  * Copy the entries of table tbl->tbl of chain "ch" into "tbl" by
  * walking the table with dump_table_entry(); tbl->cnt is reset first.
  * Returns EINVAL for an out-of-range table number, 0 otherwise.
  */
 static int
 dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
 {
 	struct radix_node_head *head;
 
 	if (tbl->tbl < IPFW_TABLES_MAX) {
 		head = ch->tables[tbl->tbl];
 		tbl->cnt = 0;
 		head->rnh_walktree(head, dump_table_entry, tbl);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Copy the credentials of the socket attached to "inp" (jail id or -1,
  * uid, group count and group list) into "ugp".  Does nothing when the
  * pcb has no socket.
  */
 static void
 fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp)
 {
 	struct ucred *cred;
 
 	if (inp->inp_socket == NULL)
 		return;
 	cred = inp->inp_socket->so_cred;
 	if (jailed(cred))
 		ugp->fw_prid = cred->cr_prison->pr_id;
 	else
 		ugp->fw_prid = -1;
 	ugp->fw_uid = cred->cr_uid;
 	ugp->fw_ngroups = cred->cr_ngroups;
 	bcopy(cred->cr_groups, ugp->fw_groups, sizeof(ugp->fw_groups));
 }
 
 /*
  * Evaluate a uid, gid or jail instruction against the credentials of
  * the socket that owns the packet.
  *
  * insn		the O_UID, O_GID or O_JAIL instruction; the value to
  *		compare against is insn->d[0].
  * proto		only IPPROTO_TCP and IPPROTO_UDP are handled; anything
  *		else returns 0.
  * oif		selects which address/port pair is passed first to
  *		in_pcblookup_hash().
  * dst_port,
  * src_port	passed through htons() before the pcb lookup.
  * ugp		credential cache, filled by fill_ugid_cache().
  * lookup	cache state: 0 = no lookup done yet, 1 = ugp is filled,
  *		-1 = an earlier lookup found nothing.
  * inp		pcb supplied by the caller, or NULL.
  *
  * Returns non-zero when the instruction matches, 0 otherwise.
  */
 static int
 check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
     struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
     u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup,
     struct inpcb *inp)
 {
 	struct inpcbinfo *pi;
 	int wildcard;
 	struct inpcb *pcb;
 	int match;
 	gid_t *gp;
 
 	/*
 	 * Check to see if the UDP or TCP stack supplied us with
 	 * the PCB. If so, rather than holding a lock and looking
 	 * up the PCB, we can use the one that was supplied.
 	 */
 	if (inp && *lookup == 0) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_socket != NULL) {
 			fill_ugid_cache(inp, ugp);
 			*lookup = 1;
 		}
 	}
 	/*
 	 * If we have already been here and the packet has no
 	 * PCB entry associated with it, then we can safely
 	 * assume that this is a no match.
 	 */
 	if (*lookup == -1)
 		return (0);
 	/* Pick the pcb table and wildcard policy for the protocol. */
 	if (proto == IPPROTO_TCP) {
 		wildcard = 0;
 		pi = &tcbinfo;
 	} else if (proto == IPPROTO_UDP) {
 		wildcard = INPLOOKUP_WILDCARD;
 		pi = &udbinfo;
 	} else
 		return 0;
 	match = 0;
 	if (*lookup == 0) {
 		/* Cache is still empty: look the pcb up ourselves. */
 		INP_INFO_RLOCK(pi);
 		/* The two address/port pairs swap places when oif is set. */
 		pcb =  (oif) ?
 			in_pcblookup_hash(pi,
 				dst_ip, htons(dst_port),
 				src_ip, htons(src_port),
 				wildcard, oif) :
 			in_pcblookup_hash(pi,
 				src_ip, htons(src_port),
 				dst_ip, htons(dst_port),
 				wildcard, NULL);
 		if (pcb != NULL) {
 			INP_LOCK(pcb);
 			if (pcb->inp_socket != NULL) {
 				fill_ugid_cache(pcb, ugp);
 				*lookup = 1;
 			}
 			INP_UNLOCK(pcb);
 		}
 		INP_INFO_RUNLOCK(pi);
 		if (*lookup == 0) {
 			/*
 			 * If the lookup did not yield any results, there
 			 * is no sense in coming back and trying again. So
 			 * we can set lookup to -1 and ensure that we won't
 			 * bother the pcb system again.
 			 */
 			*lookup = -1;
 			return (0);
 		}
 	} 
 	/* ugp is valid from here on; compare it with the instruction. */
 	if (insn->o.opcode == O_UID)
 		match = (ugp->fw_uid == (uid_t)insn->d[0]);
 	else if (insn->o.opcode == O_GID) {
 		/* Match if any of the cached groups equals the wanted gid. */
 		for (gp = ugp->fw_groups;
 			gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++)
 			if (*gp == (gid_t)insn->d[0]) {
 				match = 1;
 				break;
 			}
 	} else if (insn->o.opcode == O_JAIL)
 		match = (ugp->fw_prid == (int)insn->d[0]);
 	return match;
 }
 
 #ifdef IPFIREWALL_NAT
 static eventhandler_tag ifaddr_event_tag;
 
 /*
  * Handler taking an interface whose addresses changed: for every nat
  * instance whose if_name matches the interface name, set the alias
  * address from the interface's AF_INET addresses.  All of them are
  * applied in list order, so the last one is what remains in effect.
  */
 static void 
 ifaddr_change(void *arg __unused, struct ifnet *ifp)
 {
 	struct cfg_nat *nat;
 	struct ifaddr *ia;
 
 	IPFW_WLOCK(&layer3_chain);
 	LIST_FOREACH(nat, &layer3_chain.nat, _next) {
 		/* Skip instances that do not follow this interface. */
 		if (strncmp(nat->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
 			continue;
 		mtx_lock(&ifp->if_addr_mtx);
 		TAILQ_FOREACH(ia, &ifp->if_addrlist, ifa_list) {
 			if (ia->ifa_addr == NULL ||
 			    ia->ifa_addr->sa_family != AF_INET)
 				continue;
 			nat->ip = ((struct sockaddr_in *)
 			    (ia->ifa_addr))->sin_addr;
 			LibAliasSetAddress(nat->lib, nat->ip);
 		}
 		mtx_unlock(&ifp->if_addr_mtx);
 	}
 	IPFW_WUNLOCK(&layer3_chain);
 }
 
 /*
  * Clear the cached nat pointer in every O_NAT rule of layer3_chain
  * that currently points at the instance with id "i".
  * The chain write lock is asserted.
  */
 static void
 flush_nat_ptrs(const int i)
 {
 	struct ip_fw *r;
 	ipfw_insn_nat *act;
 
 	IPFW_WLOCK_ASSERT(&layer3_chain);
 	r = layer3_chain.rules;
 	while (r != NULL) {
 		act = (ipfw_insn_nat *)ACTION_PTR(r);
 		if (act->o.opcode == O_NAT && act->nat != NULL &&
 		    act->nat->id == i)
 			act->nat = NULL;
 		r = r->next;
 	}
 }
 
 static struct cfg_nat *
 lookup_nat(const int i)
 {
 	struct cfg_nat *ptr;
 
 	LIST_FOREACH(ptr, &layer3_chain.nat, _next)
 		if (ptr->id == i)
 			return(ptr);
 	return (NULL);
 }
 
 /*
  * List helpers for the nat configuration.  Each one operates on the
  * "_next" linkage of the element it is given.
  */
 
 /* Insert nat instance p at the head of list b; asserts the write lock. */
 #define HOOK_NAT(b, p) do {                                     \
 	IPFW_WLOCK_ASSERT(&layer3_chain);                       \
         LIST_INSERT_HEAD(b, p, _next);                          \
 } while (0)
 
 /* Remove nat instance p from its list; asserts the write lock. */
 #define UNHOOK_NAT(p) do {                                      \
 	IPFW_WLOCK_ASSERT(&layer3_chain);                       \
         LIST_REMOVE(p, _next);                                  \
 } while (0)
 
 /* Insert redirect entry p at the head of list b (no lock assertion). */
 #define HOOK_REDIR(b, p) do {                                   \
         LIST_INSERT_HEAD(b, p, _next);                          \
 } while (0)
 
 /* Insert spool entry p at the head of list b (no lock assertion). */
 #define HOOK_SPOOL(b, p) do {                                   \
         LIST_INSERT_HEAD(b, p, _next);                          \
 } while (0)
 
 /*
  * Tear down every redirect entry on "head": delete its libalias
  * links from n->lib, free its spool entries and link array, then
  * unlink and free the entry itself.  Entries with an unrecognised
  * mode are reported and left on the list.
  */
 static void
 del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
 {
 	struct cfg_redir *redir, *next_redir;
 	struct cfg_spool *spool, *next_spool;
 	int k, links;
 
 	LIST_FOREACH_SAFE(redir, head, _next, next_redir) {
 		if (redir->mode != REDIR_PORT && redir->mode != REDIR_ADDR &&
 		    redir->mode != REDIR_PROTO) {
 			printf("unknown redirect mode: %u\n", redir->mode);
 			/* XXX - panic?!?!? */
 			continue;
 		}
 		/* Port redirects own one link per port, the others one. */
 		links = (redir->mode == REDIR_PORT) ? redir->pport_cnt : 1;
 		for (k = 0; k < links; k++)
 			LibAliasRedirectDelete(n->lib, redir->alink[k]);
 		/* Release the spool configuration, if any. */
 		LIST_FOREACH_SAFE(spool, &redir->spool_chain, _next,
 		    next_spool) {
 			LIST_REMOVE(spool, _next);
 			free(spool, M_IPFW);
 		}
 		free(redir->alink, M_IPFW);
 		LIST_REMOVE(redir, _next);
 		free(redir, M_IPFW);
 	}
 }
 
 /*
  * Build the redirect and spool configuration of nat instance "ptr"
  * from the serialized buffer "buf".
  *
  * The buffer is read as ptr->redir_cnt records of SOF_REDIR bytes,
  * each immediately followed by that record's spool_cnt records of
  * SOF_SPOOL bytes.  Every redirect is registered with libalias and
  * hooked on ptr->redir_chain; every spool entry is added as a server
  * to the redirect's first link and hooked on its spool_chain.
  *
  * Returns 1.  Panics if no link could be created for a redirect,
  * which includes the case of an unrecognised redirect mode.
  */
 static int
 add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
 {
 	struct cfg_redir *r, *ser_r;
 	struct cfg_spool *s, *ser_s;
 	int cnt, off, i;
 	char *panic_err;
 
 	for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
 		ser_r = (struct cfg_redir *)&buf[off];
 		r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
 		memcpy(r, ser_r, SOF_REDIR);
 		LIST_INIT(&r->spool_chain);
 		off += SOF_REDIR;
 		/* One link slot per public port, zeroed. */
 		r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		switch (r->mode) {
 		case REDIR_ADDR:
 			r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
 			    r->paddr);
 			break;
 		case REDIR_PORT:
 			for (i = 0 ; i < r->pport_cnt; i++) {
 				/* If remotePort is all ports, set it to 0. */
 				u_short remotePortCopy = r->rport + i;
 				if (r->rport_cnt == 1 && r->rport == 0)
 					remotePortCopy = 0;
 				r->alink[i] = LibAliasRedirectPort(ptr->lib,
 				    r->laddr, htons(r->lport + i), r->raddr,
 				    htons(remotePortCopy), r->paddr,
 				    htons(r->pport + i), r->proto);
 				if (r->alink[i] == NULL) {
 					/* Flag the failure for the check below. */
 					r->alink[0] = NULL;
 					break;
 				}
 			}
 			break;
 		case REDIR_PROTO:
 			r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
 			    r->raddr, r->paddr, r->proto);
 			break;
 		default:
 			printf("unknown redirect mode: %u\n", r->mode);
 			break;
 		}
 		if (r->alink[0] == NULL) {
 			panic_err = "LibAliasRedirect* returned NULL";
 			goto bad;
 		} else /* LSNAT handling. */
 			for (i = 0; i < r->spool_cnt; i++) {
 				ser_s = (struct cfg_spool *)&buf[off];
 				/*
 				 * Size the allocation for a spool entry;
 				 * it used to be sized as a redirect entry.
 				 */
 				s = malloc(SOF_SPOOL, M_IPFW,
 				    M_WAITOK | M_ZERO);
 				memcpy(s, ser_s, SOF_SPOOL);
 				LibAliasAddServer(ptr->lib, r->alink[0],
 				    s->addr, htons(s->port));
 				off += SOF_SPOOL;
 				/* Hook spool entry. */
 				HOOK_SPOOL(&r->spool_chain, s);
 			}
 		/* And finally hook this redir entry. */
 		HOOK_REDIR(&ptr->redir_chain, r);
 	}
 	return (1);
 bad:
 	/* something really bad happened: panic! */
 	panic("%s\n", panic_err);
 }
 #endif
 
 /*
  * The main check routine for the firewall.
  *
  * All arguments are in args so we can modify them and return them
  * back to the caller.
  *
  * Parameters:
  *
  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
  *		Starts with the IP header.
  *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
  *	args->L3offset	Number of bytes bypassed if we came from L2.
  *			e.g. often sizeof(eh)  ** NOTYET **
  *	args->oif	Outgoing interface, or NULL if packet is incoming.
  *		The incoming interface is in the mbuf. (in)
  *	args->divert_rule (in/out)
  *		Skip up to the first rule past this rule number;
  *		upon return, non-zero port number for divert or tee.
  *
  *	args->rule	Pointer to the last matching rule (in/out)
  *	args->next_hop	Socket we are forwarding to (out).
  *	args->f_id	Addresses grabbed from the packet (out)
  * 	args->cookie	a cookie depending on rule action
  *
  * Return value:
  *
  *	IP_FW_PASS	the packet must be accepted
  *	IP_FW_DENY	the packet must be dropped
  *	IP_FW_DIVERT	divert packet, port in m_tag
  *	IP_FW_TEE	tee packet, port in m_tag
  *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
  *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
  *
  */
 int
 ipfw_chk(struct ip_fw_args *args)
 {
 	/*
 	 * Local variables holding state during the processing of a packet:
 	 *
 	 * IMPORTANT NOTE: to speed up the processing of rules, there
 	 * are some assumption on the values of the variables, which
 	 * are documented here. Should you change them, please check
 	 * the implementation of the various instructions to make sure
 	 * that they still work.
 	 *
 	 * args->eh	The MAC header. It is non-null for a layer2
 	 *	packet, it is NULL for a layer-3 packet.
 	 * **notyet**
 	 * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
 	 *
 	 * m | args->m	Pointer to the mbuf, as received from the caller.
 	 *	It may change if ipfw_chk() does an m_pullup, or if it
 	 *	consumes the packet because it calls send_reject().
 	 *	XXX This has to change, so that ipfw_chk() never modifies
 	 *	or consumes the buffer.
 	 * ip	is the beginning of the ip(4 or 6) header.
 	 *	Calculated by adding the L3offset to the start of data.
 	 *	(Until we start using L3offset, the packet is
 	 *	supposed to start with the ip header).
 	 */
 	struct mbuf *m = args->m;
 	struct ip *ip = mtod(m, struct ip *);
 
 	/*
 	 * For rules which contain uid/gid or jail constraints, cache
 	 * a copy of the users credentials after the pcb lookup has been
 	 * executed. This will speed up the processing of rules with
 	 * these types of constraints, as well as decrease contention
 	 * on pcb related locks.
 	 */
 	struct ip_fw_ugid fw_ugid_cache;
 	int ugid_lookup = 0;
 
 	/*
 	 * divinput_flags	If non-zero, set to the IP_FW_DIVERT_*_FLAG
 	 *	associated with a packet input on a divert socket.  This
 	 *	will allow to distinguish traffic and its direction when
 	 *	it originates from a divert socket.
 	 */
 	u_int divinput_flags = 0;
 
 	/*
 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
 	 *	inbound path (ether_input, ip_input).
 	 *	If non-NULL, ipfw_chk has been called on the outbound path
 	 *	(ether_output, ip_output).
 	 */
 	struct ifnet *oif = args->oif;
 
 	struct ip_fw *f = NULL;		/* matching rule */
 	int retval = 0;
 
 	/*
 	 * hlen	The length of the IP header.
 	 */
 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
 
 	/*
 	 * offset	The offset of a fragment. offset != 0 means that
 	 *	we have a fragment at this offset of an IPv4 packet.
 	 *	offset == 0 means that (if this is an IPv4 packet)
 	 *	this is the first or only fragment.
 	 *	For IPv6 offset == 0 means there is no Fragment Header. 
 	 *	If offset != 0 for IPv6 always use correct mask to
 	 *	get the correct offset because we add IP6F_MORE_FRAG
 	 *	to be able to dectect the first fragment which would
 	 *	otherwise have offset = 0.
 	 */
 	u_short offset = 0;
 
 	/*
 	 * Local copies of addresses. They are only valid if we have
 	 * an IP packet.
 	 *
 	 * proto	The protocol. Set to 0 for non-ip packets,
 	 *	or to the protocol read from the packet otherwise.
 	 *	proto != 0 means that we have an IPv4 packet.
 	 *
 	 * src_port, dst_port	port numbers, in HOST format. Only
 	 *	valid for TCP and UDP packets.
 	 *
 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
 	 *	Only valid for IPv4 packets.
 	 */
 	u_int8_t proto;
 	u_int16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
 	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
 	u_int16_t ip_len=0;
 	int pktlen;
 	u_int16_t	etype = 0;	/* Host order stored ether type */
 
 	/*
 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
 	 * 	MATCH_NONE when checked and not matched (q = NULL),
 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
 	 */
 	int dyn_dir = MATCH_UNKNOWN;
 	ipfw_dyn_rule *q = NULL;
 	struct ip_fw_chain *chain = &layer3_chain;
 	struct m_tag *mtag;
 
 	/*
 	 * We store in ulp a pointer to the upper layer protocol header.
 	 * In the ipv4 case this is easy to determine from the header,
 	 * but for ipv6 we might have some additional headers in the middle.
 	 * ulp is NULL if not found.
 	 */
 	void *ulp = NULL;		/* upper layer protocol pointer. */
 	/* XXX ipv6 variables */
 	int is_ipv6 = 0;
 	u_int16_t ext_hd = 0;	/* bits vector for extension header filtering */
 	/* end of ipv6 variables */
 	int is_ipv4 = 0;
 
 	if (m->m_flags & M_SKIP_FIREWALL)
 		return (IP_FW_PASS);	/* accept */
 
 	pktlen = m->m_pkthdr.len;
 	proto = args->f_id.proto = 0;	/* mark f_id invalid */
 		/* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
 
 /*
  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
  * pointer might become stale after other pullups (but we never use it
  * this way).
  */
 #define PULLUP_TO(len, p, T)						\
 do {									\
 	int x = (len) + sizeof(T);					\
 	if ((m)->m_len < x) {						\
 		args->m = m = m_pullup(m, x);				\
 		if (m == NULL)						\
 			goto pullup_failed;				\
 	}								\
 	p = (mtod(m, char *) + (len));					\
 } while (0)
 
 	/*
 	 * if we have an ether header,
 	 */
 	if (args->eh)
 		etype = ntohs(args->eh->ether_type);
 
 	/* Identify IP packets and fill up variables. */
 	if (pktlen >= sizeof(struct ip6_hdr) &&
 	    (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
 		is_ipv6 = 1;
 		args->f_id.addr_type = 6;
 		hlen = sizeof(struct ip6_hdr);
 		proto = ip6->ip6_nxt;
 
 		/* Search extension headers to find upper layer protocols */
 		while (ulp == NULL) {
 			switch (proto) {
 			case IPPROTO_ICMPV6:
 				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
 				args->f_id.flags = ICMP6(ulp)->icmp6_type;
 				break;
 
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				args->f_id.flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				PULLUP_TO(hlen, ulp, struct sctphdr);
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_HOPOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_HOPOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ROUTING:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
 				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
 				case 0:
 					ext_hd |= EXT_RTHDR0;
 					break;
 				case 2:
 					ext_hd |= EXT_RTHDR2;
 					break;
 				default:
 					printf("IPFW2: IPV6 - Unknown Routing "
 					    "Header type(%d)\n",
 					    ((struct ip6_rthdr *)ulp)->ip6r_type);
 					if (fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				ext_hd |= EXT_ROUTING;
 				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
 				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_FRAGMENT:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_frag);
 				ext_hd |= EXT_FRAGMENT;
 				hlen += sizeof (struct ip6_frag);
 				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
 				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_OFF_MASK;
 				/* Add IP6F_MORE_FRAG for offset of first
 				 * fragment to be != 0. */
 				offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_MORE_FRAG;
 				if (offset == 0) {
 					printf("IPFW2: IPV6 - Invalid Fragment "
 					    "Header\n");
 					if (fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				args->f_id.frag_id6 =
 				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
 				ulp = NULL;
 				break;
 
 			case IPPROTO_DSTOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_DSTOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_AH:	/* RFC 2402 */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				ext_hd |= EXT_AH;
 				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
 				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ESP:	/* RFC 2406 */
 				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
 				/* Anything past Seq# is variable length and
 				 * data past this ext. header is encrypted. */
 				ext_hd |= EXT_ESP;
 				break;
 
 			case IPPROTO_NONE:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				/* Packet ends here. if ip6e_len!=0 octets
 				 * must be ignored. */
 				break;
 
 			case IPPROTO_OSPFIGP:
 				/* XXX OSPF header check? */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 
 			case IPPROTO_PIM:
 				/* XXX PIM header check? */
 				PULLUP_TO(hlen, ulp, struct pim);
 				break;
 
 			case IPPROTO_CARP:
 				PULLUP_TO(hlen, ulp, struct carp_header);
 				if (((struct carp_header *)ulp)->carp_version !=
 				    CARP_VERSION) 
 					return (IP_FW_DENY);
 				if (((struct carp_header *)ulp)->carp_type !=
 				    CARP_ADVERTISEMENT) 
 					return (IP_FW_DENY);
 				break;
 
 			case IPPROTO_IPV6:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip6_hdr);
 				break;
 
 			case IPPROTO_IPV4:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip);
 				break;
 
 			default:
 				printf("IPFW2: IPV6 - Unknown Extension "
 				    "Header(%d), ext_hd=%x\n", proto, ext_hd);
 				if (fw_deny_unknown_exthdrs)
 				    return (IP_FW_DENY);
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 			} /*switch */
 		}
 		ip = mtod(m, struct ip *);
 		ip6 = (struct ip6_hdr *)ip;
 		args->f_id.src_ip6 = ip6->ip6_src;
 		args->f_id.dst_ip6 = ip6->ip6_dst;
 		args->f_id.src_ip = 0;
 		args->f_id.dst_ip = 0;
 		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
 	} else if (pktlen >= sizeof(struct ip) &&
 	    (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
 	    	is_ipv4 = 1;
 		hlen = ip->ip_hl << 2;
 		args->f_id.addr_type = 4;
 
 		/*
 		 * Collect parameters into local variables for faster matching.
 		 */
 		proto = ip->ip_p;
 		src_ip = ip->ip_src;
 		dst_ip = ip->ip_dst;
 		if (args->eh != NULL) { /* layer 2 packets are as on the wire */
 			offset = ntohs(ip->ip_off) & IP_OFFMASK;
 			ip_len = ntohs(ip->ip_len);
 		} else {
 			offset = ip->ip_off & IP_OFFMASK;
 			ip_len = ip->ip_len;
 		}
 		pktlen = ip_len < pktlen ? ip_len : pktlen;
 
 		if (offset == 0) {
 			switch (proto) {
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				args->f_id.flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_UDP:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_ICMP:
 				PULLUP_TO(hlen, ulp, struct icmphdr);
 				args->f_id.flags = ICMP(ulp)->icmp_type;
 				break;
 
 			default:
 				break;
 			}
 		}
 
 		ip = mtod(m, struct ip *);
 		args->f_id.src_ip = ntohl(src_ip.s_addr);
 		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
 	}
 #undef PULLUP_TO
 	if (proto) { /* we may have port numbers, store them */
 		args->f_id.proto = proto;
 		args->f_id.src_port = src_port = ntohs(src_port);
 		args->f_id.dst_port = dst_port = ntohs(dst_port);
 	}
 
 	IPFW_RLOCK(chain);
 	mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
 	if (args->rule) {
 		/*
 		 * Packet has already been tagged. Look for the next rule
 		 * to restart processing.
 		 *
 		 * If fw_one_pass != 0 then just accept it.
 		 * XXX should not happen here, but optimized out in
 		 * the caller.
 		 */
 		if (fw_one_pass) {
 			IPFW_RUNLOCK(chain);
 			return (IP_FW_PASS);
 		}
 
 		f = args->rule->next_rule;
 		if (f == NULL)
 			f = lookup_next_rule(args->rule);
 	} else {
 		/*
 		 * Find the starting rule. It can be either the first
 		 * one, or the one after divert_rule if asked so.
 		 */
 		int skipto = mtag ? divert_cookie(mtag) : 0;
 
 		f = chain->rules;
 		if (args->eh == NULL && skipto != 0) {
 			if (skipto >= IPFW_DEFAULT_RULE) {
 				IPFW_RUNLOCK(chain);
 				return (IP_FW_DENY); /* invalid */
 			}
 			while (f && f->rulenum <= skipto)
 				f = f->next;
 			if (f == NULL) {	/* drop packet */
 				IPFW_RUNLOCK(chain);
 				return (IP_FW_DENY);
 			}
 		}
 	}
 	/* reset divert rule to avoid confusion later */
 	if (mtag) {
 		divinput_flags = divert_info(mtag) &
 		    (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG);
 		m_tag_delete(m, mtag);
 	}
 
 	/*
 	 * Now scan the rules, and parse microinstructions for each rule.
 	 */
 	for (; f; f = f->next) {
 		ipfw_insn *cmd;
 		uint32_t tablearg = 0;
 		int l, cmdlen, skip_or; /* skip rest of OR block */
 
 again:
 		if (set_disable & (1 << f->set) )
 			continue;
 
 		skip_or = 0;
 		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
 		    l -= cmdlen, cmd += cmdlen) {
 			int match;
 
 			/*
 			 * check_body is a jump target used when we find a
 			 * CHECK_STATE, and need to jump to the body of
 			 * the target rule.
 			 */
 
 check_body:
 			cmdlen = F_LEN(cmd);
 			/*
 			 * An OR block (insn_1 || .. || insn_n) has the
 			 * F_OR bit set in all but the last instruction.
 			 * The first match will set "skip_or", and cause
 			 * the following instructions to be skipped until
 			 * past the one with the F_OR bit clear.
 			 */
 			if (skip_or) {		/* skip this instruction */
 				if ((cmd->len & F_OR) == 0)
 					skip_or = 0;	/* next one is good */
 				continue;
 			}
 			match = 0; /* set to 1 if we succeed */
 
 			switch (cmd->opcode) {
 			/*
 			 * The first set of opcodes compares the packet's
 			 * fields with some pattern, setting 'match' if a
 			 * match is found. At the end of the loop there is
 			 * logic to deal with F_NOT and F_OR flags associated
 			 * with the opcode.
 			 */
 			case O_NOP:
 				match = 1;
 				break;
 
 			case O_FORWARD_MAC:
 				printf("ipfw: opcode %d unimplemented\n",
 				    cmd->opcode);
 				break;
 
 			case O_GID:
 			case O_UID:
 			case O_JAIL:
 				/*
 				 * We only check offset == 0 && proto != 0,
 				 * as this ensures that we have a
 				 * packet with the ports info.
 				 */
 				if (offset!=0)
 					break;
 				if (is_ipv6) /* XXX to be fixed later */
 					break;
 				if (proto == IPPROTO_TCP ||
 				    proto == IPPROTO_UDP)
 					match = check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    proto, oif,
 						    dst_ip, dst_port,
 						    src_ip, src_port, &fw_ugid_cache,
 						    &ugid_lookup, args->inp);
 				break;
 
 			case O_RECV:
 				match = iface_match(m->m_pkthdr.rcvif,
 				    (ipfw_insn_if *)cmd);
 				break;
 
 			case O_XMIT:
 				match = iface_match(oif, (ipfw_insn_if *)cmd);
 				break;
 
 			case O_VIA:
 				match = iface_match(oif ? oif :
 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
 				break;
 
 			case O_MACADDR2:
 				if (args->eh != NULL) {	/* have MAC header */
 					u_int32_t *want = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->addr;
 					u_int32_t *mask = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->mask;
 					u_int32_t *hdr = (u_int32_t *)args->eh;
 
 					match =
 					    ( want[0] == (hdr[0] & mask[0]) &&
 					      want[1] == (hdr[1] & mask[1]) &&
 					      want[2] == (hdr[2] & mask[2]) );
 				}
 				break;
 
 			case O_MAC_TYPE:
 				if (args->eh != NULL) {
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (etype >= p[0] &&
 						    etype <= p[1]);
 				}
 				break;
 
 			case O_FRAG:
 				match = (offset != 0);
 				break;
 
 			case O_IN:	/* "out" is "not in" */
 				match = (oif == NULL);
 				break;
 
 			case O_LAYER2:
 				match = (args->eh != NULL);
 				break;
 
 			case O_DIVERTED:
 				match = (cmd->arg1 & 1 && divinput_flags &
 				    IP_FW_DIVERT_LOOPBACK_FLAG) ||
 					(cmd->arg1 & 2 && divinput_flags &
 				    IP_FW_DIVERT_OUTPUT_FLAG);
 				break;
 
 			case O_PROTO:
 				/*
 				 * We do not allow an arg of 0 so the
 				 * check of "proto" only suffices.
 				 */
 				match = (proto == cmd->arg1);
 				break;
 
 			case O_IP_SRC:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    src_ip.s_addr);
 				break;
 
 			case O_IP_SRC_LOOKUP:
 			case O_IP_DST_LOOKUP:
 				if (is_ipv4) {
 				    uint32_t a =
 					(cmd->opcode == O_IP_DST_LOOKUP) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t v;
 
 				    match = lookup_table(chain, cmd->arg1, a,
 					&v);
 				    if (!match)
 					break;
 				    if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 					match =
 					    ((ipfw_insn_u32 *)cmd)->d[0] == v;
 				    else
 					tablearg = v;
 				}
 				break;
 
 			case O_IP_SRC_MASK:
 			case O_IP_DST_MASK:
 				if (is_ipv4) {
 				    uint32_t a =
 					(cmd->opcode == O_IP_DST_MASK) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
 				    int i = cmdlen-1;
 
 				    for (; !match && i>0; i-= 2, p+= 2)
 					match = (p[0] == (a & p[1]));
 				}
 				break;
 
 			case O_IP_SRC_ME:
 				if (is_ipv4) {
 					struct ifnet *tif;
 
 					INADDR_TO_IFP(src_ip, tif);
 					match = (tif != NULL);
 				}
 				break;
 
 			case O_IP_DST_SET:
 			case O_IP_SRC_SET:
 				if (is_ipv4) {
 					u_int32_t *d = (u_int32_t *)(cmd+1);
 					u_int32_t addr =
 					    cmd->opcode == O_IP_DST_SET ?
 						args->f_id.dst_ip :
 						args->f_id.src_ip;
 
 					    if (addr < d[0])
 						    break;
 					    addr -= d[0]; /* subtract base */
 					    match = (addr < cmd->arg1) &&
 						( d[ 1 + (addr>>5)] &
 						  (1<<(addr & 0x1f)) );
 				}
 				break;
 
 			case O_IP_DST:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    dst_ip.s_addr);
 				break;
 
 			case O_IP_DST_ME:
 				if (is_ipv4) {
 					struct ifnet *tif;
 
 					INADDR_TO_IFP(dst_ip, tif);
 					match = (tif != NULL);
 				}
 				break;
 
 			case O_IP_SRCPORT:
 			case O_IP_DSTPORT:
 				/*
 				 * offset == 0 && proto != 0 is enough
 				 * to guarantee that we have a
 				 * packet with port info.
 				 */
 				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
 				    && offset == 0) {
 					u_int16_t x =
 					    (cmd->opcode == O_IP_SRCPORT) ?
 						src_port : dst_port ;
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (x>=p[0] && x<=p[1]);
 				}
 				break;
 
 			case O_ICMPTYPE:
 				match = (offset == 0 && proto==IPPROTO_ICMP &&
 				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
 				break;
 
 #ifdef INET6
 			case O_ICMP6TYPE:
 				match = is_ipv6 && offset == 0 &&
 				    proto==IPPROTO_ICMPV6 &&
 				    icmp6type_match(
 					ICMP6(ulp)->icmp6_type,
 					(ipfw_insn_u32 *)cmd);
 				break;
 #endif /* INET6 */
 
 			case O_IPOPT:
 				match = (is_ipv4 &&
 				    ipopts_match(ip, cmd) );
 				break;
 
 			case O_IPVER:
 				match = (is_ipv4 &&
 				    cmd->arg1 == ip->ip_v);
 				break;
 
 			case O_IPID:
 			case O_IPLEN:
 			case O_IPTTL:
 				if (is_ipv4) {	/* only for IP packets */
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    if (cmd->opcode == O_IPLEN)
 					x = ip_len;
 				    else if (cmd->opcode == O_IPTTL)
 					x = ip->ip_ttl;
 				    else /* must be IPID */
 					x = ntohs(ip->ip_id);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_IPPRECEDENCE:
 				match = (is_ipv4 &&
 				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
 				break;
 
 			case O_IPTOS:
 				match = (is_ipv4 &&
 				    flags_match(cmd, ip->ip_tos));
 				break;
 
 			case O_TCPDATALEN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    struct tcphdr *tcp;
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    tcp = TCP(ulp);
 				    x = ip_len -
 					((ip->ip_hl + tcp->th_off) << 2);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_TCPFLAGS:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    flags_match(cmd, TCP(ulp)->th_flags));
 				break;
 
 			case O_TCPOPTS:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    tcpopts_match(TCP(ulp), cmd));
 				break;
 
 			case O_TCPSEQ:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_seq);
 				break;
 
 			case O_TCPACK:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_ack);
 				break;
 
 			case O_TCPWIN:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    cmd->arg1 == TCP(ulp)->th_win);
 				break;
 
 			case O_ESTAB:
 				/* reject packets which have SYN only */
 				/* XXX should i also check for TH_ACK ? */
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    (TCP(ulp)->th_flags &
 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
 				break;
 
 			case O_ALTQ: {
 				struct pf_mtag *at;
 				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 				match = 1;
 				at = pf_find_mtag(m);
 				if (at != NULL && at->qid != 0)
 					break;
 				at = pf_get_mtag(m);
 				if (at == NULL) {
 					/*
 					 * Let the packet fall back to the
 					 * default ALTQ.
 					 */
 					break;
 				}
 				at->qid = altq->qid;
 				if (is_ipv4)
 					at->af = AF_INET;
 				else
 					at->af = AF_LINK;
 				at->hdr = ip;
 				m_tag_prepend(m, mtag);
 				break;
 			}
 
 			case O_LOG:
 				if (fw_verbose)
 					ipfw_log(f, hlen, args, m,
 					    oif, offset, tablearg, ip);
 				match = 1;
 				break;
 
 			case O_PROB:
 				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
 				break;
 
 			case O_VERREVPATH:
 				/* Outgoing packets automatically pass/match */
 				match = ((oif != NULL) ||
 				    (m->m_pkthdr.rcvif == NULL) ||
 				    (
 #ifdef INET6
 				    is_ipv6 ?
 					verify_path6(&(args->f_id.src_ip6),
 					    m->m_pkthdr.rcvif) :
 #endif
 				    verify_path(src_ip, m->m_pkthdr.rcvif)));
 				break;
 
 			case O_VERSRCREACH:
 				/* Outgoing packets automatically pass/match */
 				match = (hlen > 0 && ((oif != NULL) ||
 #ifdef INET6
 				    is_ipv6 ?
 				        verify_path6(&(args->f_id.src_ip6),
 				            NULL) :
 #endif
 				    verify_path(src_ip, NULL)));
 				break;
 
 			case O_ANTISPOOF:
 				/* Outgoing packets automatically pass/match */
 				if (oif == NULL && hlen > 0 &&
 				    (  (is_ipv4 && in_localaddr(src_ip))
 #ifdef INET6
 				    || (is_ipv6 &&
 				        in6_localaddr(&(args->f_id.src_ip6)))
 #endif
 				    ))
 					match =
 #ifdef INET6
 					    is_ipv6 ? verify_path6(
 					        &(args->f_id.src_ip6),
 					        m->m_pkthdr.rcvif) :
 #endif
 					    verify_path(src_ip,
 					        m->m_pkthdr.rcvif);
 				else
 					match = 1;
 				break;
 
 			case O_IPSEC:
 #ifdef IPSEC
 				match = (m_tag_find(m,
 				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
 #endif
 				/* otherwise no match */
 				break;
 
 #ifdef INET6
 			case O_IP6_SRC:
 				match = is_ipv6 &&
 				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 
 			case O_IP6_DST:
 				match = is_ipv6 &&
 				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 				if (is_ipv6) {
 					int i = cmdlen - 1;
 					struct in6_addr p;
 					struct in6_addr *d =
 					    &((ipfw_insn_ip6 *)cmd)->addr6;
 
 					for (; !match && i > 0; d += 2,
 					    i -= F_INSN_SIZE(struct in6_addr)
 					    * 2) {
 						p = (cmd->opcode ==
 						    O_IP6_SRC_MASK) ?
 						    args->f_id.src_ip6:
 						    args->f_id.dst_ip6;
 						APPLY_MASK(&p, &d[1]);
 						match =
 						    IN6_ARE_ADDR_EQUAL(&d[0],
 						    &p);
 					}
 				}
 				break;
 
 			case O_IP6_SRC_ME:
 				match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
 				break;
 
 			case O_IP6_DST_ME:
 				match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
 				break;
 
 			case O_FLOW6ID:
 				match = is_ipv6 &&
 				    flow6id_match(args->f_id.flow_id6,
 				    (ipfw_insn_u32 *) cmd);
 				break;
 
 			case O_EXT_HDR:
 				match = is_ipv6 &&
 				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
 				break;
 
 			case O_IP6:
 				match = is_ipv6;
 				break;
 #endif
 
 			case O_IP4:
 				match = is_ipv4;
 				break;
 
 			case O_TAG: {
 				uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
 				    tablearg : cmd->arg1;
 
 				/* Packet is already tagged with this tag? */
 				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
 
 				/* We have `untag' action when F_NOT flag is
 				 * present. And we must remove this mtag from
 				 * mbuf and reset `match' to zero (`match' will
 				 * be inversed later).
 				 * Otherwise we should allocate new mtag and
 				 * push it into mbuf.
 				 */
 				if (cmd->len & F_NOT) { /* `untag' action */
 					if (mtag != NULL)
 						m_tag_delete(m, mtag);
 				} else if (mtag == NULL) {
 					if ((mtag = m_tag_alloc(MTAG_IPFW,
 					    tag, 0, M_NOWAIT)) != NULL)
 						m_tag_prepend(m, mtag);
 				}
 				match = (cmd->len & F_NOT) ? 0: 1;
 				break;
 			}
 
 			case O_TAGGED: {
 				uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
 				    tablearg : cmd->arg1;
 
 				if (cmdlen == 1) {
 					match = m_tag_locate(m, MTAG_IPFW,
 					    tag, NULL) != NULL;
 					break;
 				}
 
 				/* we have ranges */
 				for (mtag = m_tag_first(m);
 				    mtag != NULL && !match;
 				    mtag = m_tag_next(m, mtag)) {
 					uint16_t *p;
 					int i;
 
 					if (mtag->m_tag_cookie != MTAG_IPFW)
 						continue;
 
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for(; !match && i > 0; i--, p += 2)
 						match =
 						    mtag->m_tag_id >= p[0] &&
 						    mtag->m_tag_id <= p[1];
 				}
 				break;
 			}
 				
 			/*
 			 * The second set of opcodes represents 'actions',
 			 * i.e. the terminal part of a rule once the packet
 			 * matches all previous patterns.
 			 * Typically there is only one action for each rule,
 			 * and the opcode is stored at the end of the rule
 			 * (but there are exceptions -- see below).
 			 *
 			 * In general, here we set retval and terminate the
 			 * outer loop (would be a 'break 3' in some language,
 			 * but we need to do a 'goto done').
 			 *
 			 * Exceptions:
 			 * O_COUNT and O_SKIPTO actions:
 			 *   instead of terminating, we jump to the next rule
 			 *   ('goto next_rule', equivalent to a 'break 2'),
 			 *   or to the SKIPTO target ('goto again' after
 			 *   having set f, cmd and l), respectively.
 			 *
 			 * O_TAG, O_LOG and O_ALTQ action parameters:
 			 *   perform some action and set match = 1;
 			 *
 			 * O_LIMIT and O_KEEP_STATE: these opcodes are
 			 *   not real 'actions', and are stored right
 			 *   before the 'action' part of the rule.
 			 *   These opcodes try to install an entry in the
 			 *   state tables; if successful, we continue with
 			 *   the next opcode (match=1; break;), otherwise
 			 *   the packet must be dropped
 			 *   ('goto done' after setting retval);
 			 *
 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
 			 *   cause a lookup of the state table, and a jump
 			 *   to the 'action' part of the parent rule
 			 *   ('goto check_body') if an entry is found, or
 			 *   (CHECK_STATE only) a jump to the next rule if
 			 *   the entry is not found ('goto next_rule').
 			 *   The result of the lookup is cached so that
 			 *   further instances of these opcodes are
 			 *   effectively NOPs.
 			 */
 			case O_LIMIT:
 			case O_KEEP_STATE:
 				if (install_state(f,
 				    (ipfw_insn_limit *)cmd, args, tablearg)) {
 					retval = IP_FW_DENY;
 					goto done; /* error/limit violation */
 				}
 				match = 1;
 				break;
 
 			case O_PROBE_STATE:
 			case O_CHECK_STATE:
 				/*
 				 * dynamic rules are checked at the first
 				 * keep-state or check-state occurrence,
 				 * with the result being stored in dyn_dir.
 				 * The compiler introduces a PROBE_STATE
 				 * instruction for us when we have a
 				 * KEEP_STATE (because PROBE_STATE needs
 				 * to be run first).
 				 */
 				if (dyn_dir == MATCH_UNKNOWN &&
 				    (q = lookup_dyn_rule(&args->f_id,
 				     &dyn_dir, proto == IPPROTO_TCP ?
 					TCP(ulp) : NULL))
 					!= NULL) {
 					/*
 					 * Found dynamic entry, update stats
 					 * and jump to the 'action' part of
 					 * the parent rule.
 					 */
 					q->pcnt++;
 					q->bcnt += pktlen;
 					f = q->rule;
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
 					IPFW_DYN_UNLOCK();
 					goto check_body;
 				}
 				/*
 				 * Dynamic entry not found. If CHECK_STATE,
 				 * skip to next rule, if PROBE_STATE just
 				 * ignore and continue with next opcode.
 				 */
 				if (cmd->opcode == O_CHECK_STATE)
 					goto next_rule;
 				match = 1;
 				break;
 
 			case O_ACCEPT:
 				retval = 0;	/* accept */
 				goto done;
 
 			case O_PIPE:
 			case O_QUEUE:
 				args->rule = f; /* report matching rule */
 				if (cmd->arg1 == IP_FW_TABLEARG)
 					args->cookie = tablearg;
 				else
 					args->cookie = cmd->arg1;
 				retval = IP_FW_DUMMYNET;
 				goto done;
 
 			case O_DIVERT:
 			case O_TEE: {
 				struct divert_tag *dt;
 
 				if (args->eh) /* not on layer 2 */
 					break;
 				mtag = m_tag_get(PACKET_TAG_DIVERT,
 						sizeof(struct divert_tag),
 						M_NOWAIT);
 				if (mtag == NULL) {
 					/* XXX statistic */
 					/* drop packet */
 					IPFW_RUNLOCK(chain);
 					return (IP_FW_DENY);
 				}
 				dt = (struct divert_tag *)(mtag+1);
 				dt->cookie = f->rulenum;
 				if (cmd->arg1 == IP_FW_TABLEARG)
 					dt->info = tablearg;
 				else
 					dt->info = cmd->arg1;
 				m_tag_prepend(m, mtag);
 				retval = (cmd->opcode == O_DIVERT) ?
 				    IP_FW_DIVERT : IP_FW_TEE;
 				goto done;
 			}
 
 			case O_COUNT:
 			case O_SKIPTO:
 				f->pcnt++;	/* update stats */
 				f->bcnt += pktlen;
 				f->timestamp = time_uptime;
 				if (cmd->opcode == O_COUNT)
 					goto next_rule;
 				/* handle skipto */
 				if (f->next_rule == NULL)
 					lookup_next_rule(f);
 				f = f->next_rule;
 				goto again;
 
 			case O_REJECT:
 				/*
 				 * Drop the packet and send a reject notice
 				 * if the packet is not ICMP (or is an ICMP
 				 * query), and it is not multicast/broadcast.
 				 */
 				if (hlen > 0 && is_ipv4 && offset == 0 &&
 				    (proto != IPPROTO_ICMP ||
 				     is_icmp_query(ICMP(ulp))) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
 					send_reject(args, cmd->arg1, ip_len, ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #ifdef INET6
 			case O_UNREACH6:
 				if (hlen > 0 && is_ipv6 &&
 				    ((offset & IP6F_OFF_MASK) == 0) &&
 				    (proto != IPPROTO_ICMPV6 ||
 				     (is_icmp6_query(args->f_id.flags) == 1)) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
 					send_reject6(
 					    args, cmd->arg1, hlen,
 					    (struct ip6_hdr *)ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #endif
 			case O_DENY:
 				retval = IP_FW_DENY;
 				goto done;
 
 			case O_FORWARD_IP: {
 				struct sockaddr_in *sa;
 				sa = &(((ipfw_insn_sa *)cmd)->sa);
 				if (args->eh)	/* not valid on layer2 pkts */
 					break;
 				if (!q || dyn_dir == MATCH_FORWARD) {
 					if (sa->sin_addr.s_addr == INADDR_ANY) {
 						bcopy(sa, &args->hopstore,
 							sizeof(*sa));
 						args->hopstore.sin_addr.s_addr =
 						    htonl(tablearg);
 						args->next_hop =
 						    &args->hopstore;
 					} else {
 						args->next_hop = sa;
 					}
 				}
 				retval = IP_FW_PASS;
 			    }
 			    goto done;
 
 			case O_NETGRAPH:
 			case O_NGTEE:
 				args->rule = f;	/* report matching rule */
 				if (cmd->arg1 == IP_FW_TABLEARG)
 					args->cookie = tablearg;
 				else
 					args->cookie = cmd->arg1;
 				retval = (cmd->opcode == O_NETGRAPH) ?
 				    IP_FW_NETGRAPH : IP_FW_NGTEE;
 				goto done;
 
 #ifdef IPFIREWALL_NAT
 			case O_NAT: {
 				struct cfg_nat *t;
 				struct mbuf *mcl;
 				/* XXX - libalias duct tape */
 				int ldt; 
 				char *c;
 				
 				ldt = 0;
 				args->rule = f;	/* Report matching rule. */
 				retval = 0;
 				t = ((ipfw_insn_nat *)cmd)->nat;
 				if (t == NULL) {
 					t = lookup_nat(cmd->arg1);
 					if (t == NULL) {
 						retval = IP_FW_DENY;
 						goto done;
 					} else 
 						((ipfw_insn_nat *)cmd)->nat = 
 						    t;
 				}
 				if ((mcl = m_megapullup(m, m->m_pkthdr.len)) ==
 				    NULL)
 					goto badnat;
 				ip = mtod(mcl, struct ip *);
 				if (args->eh == NULL) {
 					ip->ip_len = htons(ip->ip_len);
 					ip->ip_off = htons(ip->ip_off);
 				}
 
 				/* 
 				 * XXX - Libalias checksum offload 'duct tape':
 				 * 
 				 * locally generated packets have only
 				 * pseudo-header checksum calculated
 				 * and libalias will screw it[1], so
 				 * mark them for later fix.  Moreover
 				 * there are cases when libalias
 				 * modify tcp packet data[2], mark it
 				 * for later fix too.
 				 *
 				 * [1] libalias was never meant to run
 				 * in kernel, so it doesn't have any
 				 * knowledge about checksum
 				 * offloading, and it expects a packet
 				 * with a full internet
 				 * checksum. Unfortunately, packets
 				 * generated locally will have just the
 				 * pseudo header calculated, and when
 				 * libalias tries to adjust the
 				 * checksum it will actually screw it.
 				 *
 				 * [2] when libalias modify tcp's data
 				 * content, full TCP checksum has to
 				 * be recomputed: the problem is that
 				 * libalias doesn't have any idea
 				 * about checksum offloading To
 				 * workaround this, we do not do
 				 * checksumming in LibAlias, but only
 				 * mark the packets in th_x2 field. If
 				 * we receive a marked packet, we
 				 * calculate correct checksum for it
 				 * aware of offloading.  Why such a
 				 * terrible hack instead of
 				 * recalculating checksum for each
 				 * packet?  Because the previous
 				 * checksum was not checked!
 				 * Recalculating checksums for EVERY
 				 * packet will hide ALL transmission
 				 * errors. Yes, marked packets still
 				 * suffer from this problem. But,
 				 * sigh, natd(8) has this problem,
 				 * too.
 				 *
 				 * TODO: -make libalias mbuf aware (so
 				 * it can handle delayed checksum and tso)
 				 */
 
 				if (mcl->m_pkthdr.rcvif == NULL && 
 				    mcl->m_pkthdr.csum_flags & 
 				    CSUM_DELAY_DATA)
 					ldt = 1;
 
 				c = mtod(mcl, char *);
 				if (oif == NULL)
 					retval = LibAliasIn(t->lib, c, 
 					    MCLBYTES);
 				else
 					retval = LibAliasOut(t->lib, c, 
 					    MCLBYTES);
 				if (retval != PKT_ALIAS_OK) {
 					/* XXX - should i add some logging? */
 					m_free(mcl);
 				badnat:
 					args->m = NULL;
 					retval = IP_FW_DENY;
 					goto done;
 				}
 				mcl->m_pkthdr.len = mcl->m_len = 
 				    ntohs(ip->ip_len);
 
 				/* 
 				 * XXX - libalias checksum offload 
 				 * 'duct tape' (see above) 
 				 */
 
 				if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && 
 				    ip->ip_p == IPPROTO_TCP) {
 					struct tcphdr 	*th; 
 
 					th = (struct tcphdr *)(ip + 1);
 					if (th->th_x2) 
 						ldt = 1;
 				}
 
 				if (ldt) {
 					struct tcphdr 	*th;
 					struct udphdr 	*uh;
 					u_short cksum;
 
 					ip->ip_len = ntohs(ip->ip_len);
 					cksum = in_pseudo(
 						ip->ip_src.s_addr,
 						ip->ip_dst.s_addr, 
 						htons(ip->ip_p + ip->ip_len - 
 					            (ip->ip_hl << 2))
 						);
 					
 					switch (ip->ip_p) {
 					case IPPROTO_TCP:
 						th = (struct tcphdr *)(ip + 1);
 						/* 
 						 * Maybe it was set in 
 						 * libalias... 
 						 */
 						th->th_x2 = 0;
 						th->th_sum = cksum;
 						mcl->m_pkthdr.csum_data = 
 						    offsetof(struct tcphdr,
 						    th_sum);
 						break;
 					case IPPROTO_UDP:
 						uh = (struct udphdr *)(ip + 1);
 						uh->uh_sum = cksum;
 						mcl->m_pkthdr.csum_data = 
 						    offsetof(struct udphdr,
 						    uh_sum);
 						break;						
 					}
 					/* 
 					 * No hw checksum offloading: do it 
 					 * by ourself. 
 					 */
 					if ((mcl->m_pkthdr.csum_flags & 
 					     CSUM_DELAY_DATA) == 0) {
 						in_delayed_cksum(mcl);
 						mcl->m_pkthdr.csum_flags &= 
 						    ~CSUM_DELAY_DATA;
 					}
 					ip->ip_len = htons(ip->ip_len);
 				}
 
 				if (args->eh == NULL) {
 					ip->ip_len = ntohs(ip->ip_len);
 					ip->ip_off = ntohs(ip->ip_off);
 				}
 
 				args->m = mcl;
 				retval = IP_FW_NAT; 
 				goto done;
 			}
 #endif
 
 			default:
 				panic("-- unknown opcode %d\n", cmd->opcode);
 			} /* end of switch() on opcodes */
 
 			if (cmd->len & F_NOT)
 				match = !match;
 
 			if (match) {
 				if (cmd->len & F_OR)
 					skip_or = 1;
 			} else {
 				if (!(cmd->len & F_OR)) /* not an OR block, */
 					break;		/* try next rule    */
 			}
 
 		}	/* end of inner for, scan opcodes */
 
 next_rule:;		/* try next rule		*/
 
 	}		/* end of outer for, scan rules */
 	printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	IPFW_RUNLOCK(chain);
 	return (IP_FW_DENY);
 
 done:
 	/* Update statistics */
 	f->pcnt++;
 	f->bcnt += pktlen;
 	f->timestamp = time_uptime;
 	IPFW_RUNLOCK(chain);
 	return (retval);
 
 pullup_failed:
 	if (fw_verbose)
 		printf("ipfw: pullup failed\n");
 	return (IP_FW_DENY);
 }
 
 /*
  * Invalidate the cached next_rule pointer of every rule in the chain.
  * This has to happen whenever a rule is added or deleted; the pointers
  * are reconstructed on the fly as packets are matched.
  * The caller must hold the chain write lock.
  */
 static void
 flush_rule_ptrs(struct ip_fw_chain *chain)
 {
 	struct ip_fw *cur;
 
 	IPFW_WLOCK_ASSERT(chain);
 
 	cur = chain->rules;
 	while (cur != NULL) {
 		cur->next_rule = NULL;
 		cur = cur->next;
 	}
 }
 
 /*
  * Add a new rule to the list. Copy the rule into a malloc'ed area, then
  * possibly create a rule number and add the rule to the list.
  * Update the rule_number in the input struct so the caller knows it as well.
  *
  * Returns 0 on success, EINVAL if the very first rule installed is not
  * the default rule or if the requested rule number leaves no insertion
  * point in the sorted list, and ENOSPC if the copy cannot be allocated.
  */
 static int
 add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 {
 	struct ip_fw *rule, *f, *prev;
 	int l = RULESIZE(input_rule);
 
 	/* An empty chain only accepts the default rule. */
 	if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
 		return (EINVAL);
 
 	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
 	if (rule == NULL)
 		return (ENOSPC);
 
 	bcopy(input_rule, rule, l);
 
 	/* The private copy starts unlinked and with clean statistics. */
 	rule->next = NULL;
 	rule->next_rule = NULL;
 
 	rule->pcnt = 0;
 	rule->bcnt = 0;
 	rule->timestamp = 0;
 
 	IPFW_WLOCK(chain);
 
 	if (chain->rules == NULL) {	/* default rule */
 		chain->rules = rule;
 		goto done;
 	}
 
 	/*
 	 * If rulenum is 0, find highest numbered rule before the
 	 * default rule, and add autoinc_step
 	 */
 	if (autoinc_step < 1)
 		autoinc_step = 1;
 	else if (autoinc_step > 1000)
 		autoinc_step = 1000;
 	if (rule->rulenum == 0) {
 		/*
 		 * locate the highest numbered rule before default
 		 */
 		for (f = chain->rules; f; f = f->next) {
 			if (f->rulenum == IPFW_DEFAULT_RULE)
 				break;
 			rule->rulenum = f->rulenum;
 		}
 		if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step)
 			rule->rulenum += autoinc_step;
 		input_rule->rulenum = rule->rulenum;
 	}
 
 	/*
 	 * Now insert the new rule in the right place in the sorted list.
 	 */
 	for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
 		if (f->rulenum > rule->rulenum) { /* found the location */
 			if (prev) {
 				rule->next = f;
 				prev->next = rule;
 			} else { /* head insert */
 				rule->next = chain->rules;
 				chain->rules = rule;
 			}
 			break;
 		}
 	}
 	if (f == NULL) {
 		/*
 		 * The loop ran off the end of the list: no existing rule has
 		 * a larger number, so the new rule was never linked in.
 		 * Release the copy and report the failure instead of leaking
 		 * it and bumping the static counters for a rule that does
 		 * not exist.
 		 */
 		IPFW_WUNLOCK(chain);
 		free(rule, M_IPFW);
 		return (EINVAL);
 	}
 	flush_rule_ptrs(chain);
 done:
 	static_count++;
 	static_len += l;
 	IPFW_WUNLOCK(chain);
 	DEB(printf("ipfw: installed rule %d, static count now %d\n",
 		rule->rulenum, static_count);)
 	return (0);
 }
 
 /**
  * Unlink a static rule from the chain, remove the dynamic rules derived
  * from it, and push it onto the chain's ``reap list'' for later
  * reclamation.
  * The caller is in charge of clearing rule pointers to avoid
  * dangling pointers, and must hold the chain write lock.
  * @return a pointer to the entry that followed the removed rule.
  * Arguments are not checked, so they better be correct.
  */
 static struct ip_fw *
 remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
     struct ip_fw *prev)
 {
 	struct ip_fw **link;
 	struct ip_fw *successor;
 	int rule_bytes;
 
 	IPFW_WLOCK_ASSERT(chain);
 
 	rule_bytes = RULESIZE(rule);
 	successor = rule->next;
 
 	IPFW_DYN_LOCK();
 	remove_dyn_rule(rule, NULL /* force removal */);
 	IPFW_DYN_UNLOCK();
 
 	/* Patch whichever pointer currently refers to the rule. */
 	link = (prev != NULL) ? &prev->next : &chain->rules;
 	*link = successor;
 
 	static_len -= rule_bytes;
 	static_count--;
 
 	/* Head-insert the rule on the reap list. */
 	rule->next = chain->reap;
 	chain->reap = rule;
 
 	return (successor);
 }
 
 /**
  * Free every rule on a singly linked list, typically the reap list
  * built up by remove_rule.  When dummynet is loaded it is told about
  * each rule before the rule's storage is released.
  */
 static void
 reap_rules(struct ip_fw *head)
 {
 	struct ip_fw *cur, *following;
 
 	for (cur = head; cur != NULL; cur = following) {
 		/* Grab the link first: cur is gone after free(). */
 		following = cur->next;
 		if (DUMMYNET_LOADED)
 			ip_dn_ruledel_ptr(cur);
 		free(cur, M_IPFW);
 	}
 }
 
 /*
  * Remove all rules from a chain; rules in set RESVD_SET are kept
  * unless kill_default is non-zero.  Removed rules end up on
  * chain->reap and the caller is responsible for reclaiming them.
  * The caller must hold the chain write lock.
  */
 static void
 free_chain(struct ip_fw_chain *chain, int kill_default)
 {
 	struct ip_fw *cur, *last_kept;
 
 	IPFW_WLOCK_ASSERT(chain);
 
 	/* One flush up front is cheaper than one per removed rule. */
 	flush_rule_ptrs(chain);
 
 	last_kept = NULL;
 	cur = chain->rules;
 	while (cur != NULL) {
 		if (!kill_default && cur->set == RESVD_SET) {
 			/* Reserved rule survives; step over it. */
 			last_kept = cur;
 			cur = cur->next;
 			continue;
 		}
 		cur = remove_rule(chain, cur, last_kept);
 	}
 }
 
 /**
  * Delete rules by number or set, or move/swap rules between sets.
  * The chain is expected to be non-empty and terminated by the default
  * rule; the list walks below rely on that to stop.
  *
  * The argument is an u_int32_t. The low 16 bit are the rule or set number,
  * the next 8 bits are the new set, the top 8 bits are the command:
  *
  *	0	delete rules with given number
  *	1	delete rules with given set number
  *	2	move rules with given number to new set
  *	3	move rules with given set number to new set
  *	4	swap sets with given numbers
  *	5	delete rules with given number and with given set number
  *
  * Returns 0 on success, EINVAL for a bad command, set or rule number,
  * or (commands 0 and 5) when no rule carries the given number.
  */
 static int
 del_entry(struct ip_fw_chain *chain, u_int32_t arg)
 {
 	struct ip_fw *cur, *before = NULL, *doomed;
 	u_int16_t num;		/* rule number, or old set */
 	u_int8_t op, set;
 	int by_number;
 
 	num = arg & 0xffff;
 	set = (arg >> 16) & 0xff;
 	op = (arg >> 24) & 0xff;
 
 	if (op > 5 || set > RESVD_SET)
 		return (EINVAL);
 	/* Commands 0, 2 and 5 carry a rule number, the rest a set number. */
 	by_number = (op == 0 || op == 2 || op == 5);
 	if (by_number && num >= IPFW_DEFAULT_RULE)
 		return (EINVAL);
 	if (!by_number && num > RESVD_SET)
 		return (EINVAL);
 
 	IPFW_WLOCK(chain);
 	chain->reap = NULL;
 	cur = chain->rules;
 
 	switch (op) {
 	case 0:	/* delete rules with given number */
 	case 5:	/* same, but only those that are also in set `set' */
 		/* Locate the first rule carrying the number. */
 		while (cur->rulenum < num) {
 			before = cur;
 			cur = cur->next;
 		}
 		if (cur->rulenum != num) {
 			IPFW_WUNLOCK(chain);
 			return (EINVAL);
 		}
 
 		/*
 		 * Flush the cached pointers once, then walk the run of
 		 * rules sharing the number.  `before' only advances past
 		 * rules that are kept.
 		 */
 		flush_rule_ptrs(chain);
 		while (cur->rulenum == num) {
 			if (op == 0 || cur->set == set) {
 				cur = remove_rule(chain, cur, before);
 			} else {
 				before = cur;
 				cur = cur->next;
 			}
 		}
 		break;
 
 	case 1:	/* delete all rules with given set number */
 		flush_rule_ptrs(chain);
 		while (cur->rulenum < IPFW_DEFAULT_RULE) {
 			if (cur->set != num) {
 				before = cur;
 				cur = cur->next;
 				continue;
 			}
 			cur = remove_rule(chain, cur, before);
 		}
 		break;
 
 	case 2:	/* move rules with given number to new set */
 		while (cur->rulenum < IPFW_DEFAULT_RULE) {
 			if (cur->rulenum == num)
 				cur->set = set;
 			cur = cur->next;
 		}
 		break;
 
 	case 3:	/* move rules with given set number to new set */
 		while (cur->rulenum < IPFW_DEFAULT_RULE) {
 			if (cur->set == num)
 				cur->set = set;
 			cur = cur->next;
 		}
 		break;
 
 	case 4:	/* swap two sets */
 		while (cur->rulenum < IPFW_DEFAULT_RULE) {
 			if (cur->set == num)
 				cur->set = set;
 			else if (cur->set == set)
 				cur->set = num;
 			cur = cur->next;
 		}
 		break;
 	}
 
 	/*
 	 * Look for rules to reclaim.  We grab the list before
 	 * releasing the lock then reclaim them w/o the lock to
 	 * avoid a LOR with dummynet.
 	 */
 	doomed = chain->reap;
 	chain->reap = NULL;
 	IPFW_WUNLOCK(chain);
 	if (doomed != NULL)
 		reap_rules(doomed);
 	return (0);
 }
 
 /*
  * Reset the counters of a single rule.
  * With log_only set, the packet/byte counters and the timestamp are
  * left alone and only the logging budget is touched.
  * The enclosing "table" is assumed locked.
  */
 static void
 clear_counters(struct ip_fw *rule, int log_only)
 {
 	ipfw_insn_log *logcmd;
 
 	if (!log_only) {
 		rule->timestamp = 0;
 		rule->pcnt = 0;
 		rule->bcnt = 0;
 	}
 
 	/* If the instruction at the action pointer is O_LOG, refill it. */
 	logcmd = (ipfw_insn_log *)ACTION_PTR(rule);
 	if (logcmd->o.opcode == O_LOG)
 		logcmd->log_left = logcmd->max_log;
 }
 
 /**
  * Reset some or all counters on firewall rules.
  * The argument `arg' is an u_int32_t. The low 16 bit are the rule number,
  * the next 8 bits are the set number, the top 8 bits are the command:
  *	0	work with rules from all set's;
  *	1	work with rules only from specified set.
  * A rule number of zero means "every rule".
  * log_only is 1 if we only want to reset logs, zero otherwise.
  *
  * Returns 0 on success, EINVAL for a bad command or set, or when a
  * non-zero rule number matches no rule.
  */
 static int
 zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 {
 	struct ip_fw *cur;
 	char *fmt;
 
 	uint16_t num = arg & 0xffff;
 	uint8_t set = (arg >> 16) & 0xff;
 	uint8_t cmd = (arg >> 24) & 0xff;
 
 	if (cmd > 1 || (cmd == 1 && set > RESVD_SET))
 		return (EINVAL);
 
 	IPFW_WLOCK(chain);
 	if (num != 0) {
 		/* Find the first rule carrying the requested number. */
 		cur = chain->rules;
 		while (cur != NULL && cur->rulenum != num)
 			cur = cur->next;
 		if (cur == NULL) {	/* no matching rule at all */
 			IPFW_WUNLOCK(chain);
 			return (EINVAL);
 		}
 		/*
 		 * Several rules may share one number; clear the whole
 		 * run that starts at the first match.
 		 */
 		do {
 			if (cmd == 0 || cur->set == set)
 				clear_counters(cur, log_only);
 			cur = cur->next;
 		} while (cur != NULL && cur->rulenum == num);
 
 		if (log_only)
 			fmt = "ipfw: Entry %d logging count reset.\n";
 		else
 			fmt = "ipfw: Entry %d cleared.\n";
 	} else {
 		norule_counter = 0;
 		for (cur = chain->rules; cur != NULL; cur = cur->next) {
 			/* Honour the set filter when one was requested. */
 			if (cmd != 1 || cur->set == set)
 				clear_counters(cur, log_only);
 		}
 		if (log_only)
 			fmt = "ipfw: All logging counts reset.\n";
 		else
 			fmt = "ipfw: Accounting cleared.\n";
 	}
 	IPFW_WUNLOCK(chain);
 
 	if (fw_verbose)
 		log(LOG_SECURITY | LOG_NOTICE, fmt, num);
 	return (0);
 }
 
 /*
  * Check validity of the structure before insert.
  * Fortunately rules are simple, so this mostly need to check rule sizes.
  */
 static int
 check_ipfw_struct(struct ip_fw *rule, int size)
 {
 	int l, cmdlen = 0;
 	int have_action=0;
 	ipfw_insn *cmd;
 
 	if (size < sizeof(*rule)) {
 		printf("ipfw: rule too short\n");
 		return (EINVAL);
 	}
 	/* first, check for valid size */
 	l = RULESIZE(rule);
 	if (l != size) {
 		printf("ipfw: size mismatch (have %d want %d)\n", size, l);
 		return (EINVAL);
 	}
 	if (rule->act_ofs >= rule->cmd_len) {
 		printf("ipfw: bogus action offset (%u > %u)\n",
 		    rule->act_ofs, rule->cmd_len - 1);
 		return (EINVAL);
 	}
 	/*
 	 * Now go for the individual checks. Very simple ones, basically only
 	 * instruction sizes.
 	 */
 	for (l = rule->cmd_len, cmd = rule->cmd ;
 			l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 		if (cmdlen > l) {
 			printf("ipfw: opcode %d size truncated\n",
 			    cmd->opcode);
 			return EINVAL;
 		}
 		DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
 		switch (cmd->opcode) {
 		case O_PROBE_STATE:
 		case O_KEEP_STATE:
 		case O_PROTO:
 		case O_IP_SRC_ME:
 		case O_IP_DST_ME:
 		case O_LAYER2:
 		case O_IN:
 		case O_FRAG:
 		case O_DIVERTED:
 		case O_IPOPT:
 		case O_IPTOS:
 		case O_IPPRECEDENCE:
 		case O_IPVER:
 		case O_TCPWIN:
 		case O_TCPFLAGS:
 		case O_TCPOPTS:
 		case O_ESTAB:
 		case O_VERREVPATH:
 		case O_VERSRCREACH:
 		case O_ANTISPOOF:
 		case O_IPSEC:
 #ifdef INET6
 		case O_IP6_SRC_ME:
 		case O_IP6_DST_ME:
 		case O_EXT_HDR:
 		case O_IP6:
 #endif
 		case O_IP4:
 		case O_TAG:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			break;
 
 		case O_UID:
 		case O_GID:
 		case O_JAIL:
 		case O_IP_SRC:
 		case O_IP_DST:
 		case O_TCPSEQ:
 		case O_TCPACK:
 		case O_PROB:
 		case O_ICMPTYPE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			break;
 
 		case O_LIMIT:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
 				goto bad_size;
 			break;
 
 		case O_LOG:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
 				goto bad_size;
 
 			((ipfw_insn_log *)cmd)->log_left =
 			    ((ipfw_insn_log *)cmd)->max_log;
 
 			break;
 
 		case O_IP_SRC_MASK:
 		case O_IP_DST_MASK:
 			/* only odd command lengths */
 			if ( !(cmdlen & 1) || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_IP_SRC_SET:
 		case O_IP_DST_SET:
 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
 				printf("ipfw: invalid set size %d\n",
 					cmd->arg1);
 				return EINVAL;
 			}
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
 			    (cmd->arg1+31)/32 )
 				goto bad_size;
 			break;
 
 		case O_IP_SRC_LOOKUP:
 		case O_IP_DST_LOOKUP:
 			if (cmd->arg1 >= IPFW_TABLES_MAX) {
 				printf("ipfw: invalid table number %d\n",
 				    cmd->arg1);
 				return (EINVAL);
 			}
 			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			break;
 
 		case O_MACADDR2:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
 				goto bad_size;
 			break;
 
 		case O_NOP:
 		case O_IPID:
 		case O_IPTTL:
 		case O_IPLEN:
 		case O_TCPDATALEN:
 		case O_TAGGED:
 			if (cmdlen < 1 || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_MAC_TYPE:
 		case O_IP_SRCPORT:
 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
 			if (cmdlen < 2 || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_RECV:
 		case O_XMIT:
 		case O_VIA:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
 				goto bad_size;
 			break;
 
 		case O_ALTQ:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
 				goto bad_size;
 			break;
 
 		case O_PIPE:
 		case O_QUEUE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			goto check_action;
 
 		case O_FORWARD_IP:
 #ifdef	IPFIREWALL_FORWARD
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
 				goto bad_size;
 			goto check_action;
 #else
 			return EINVAL;
 #endif
 
 		case O_DIVERT:
 		case O_TEE:
 			if (ip_divert_ptr == NULL)
 				return EINVAL;
 			else
 				goto check_size;
 		case O_NETGRAPH:
 		case O_NGTEE:
 			if (!NG_IPFW_LOADED)
 				return EINVAL;
 			else
 				goto check_size;
 		case O_NAT:
 #ifdef IPFIREWALL_NAT
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
  				goto bad_size;		
  			goto check_action;
 #else
 			return EINVAL;
 #endif
 		case O_FORWARD_MAC: /* XXX not implemented yet */
 		case O_CHECK_STATE:
 		case O_COUNT:
 		case O_ACCEPT:
 		case O_DENY:
 		case O_REJECT:
 #ifdef INET6
 		case O_UNREACH6:
 #endif
 		case O_SKIPTO:
 check_size:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 check_action:
 			if (have_action) {
 				printf("ipfw: opcode %d, multiple actions"
 					" not allowed\n",
 					cmd->opcode);
 				return EINVAL;
 			}
 			have_action = 1;
 			if (l != cmdlen) {
 				printf("ipfw: opcode %d, action must be"
 					" last opcode\n",
 					cmd->opcode);
 				return EINVAL;
 			}
 			break;
 #ifdef INET6
 		case O_IP6_SRC:
 		case O_IP6_DST:
 			if (cmdlen != F_INSN_SIZE(struct in6_addr) +
 			    F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			break;
 
 		case O_FLOW6ID:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
 			    ((ipfw_insn_u32 *)cmd)->o.arg1)
 				goto bad_size;
 			break;
 
 		case O_IP6_SRC_MASK:
 		case O_IP6_DST_MASK:
 			if ( !(cmdlen & 1) || cmdlen > 127)
 				goto bad_size;
 			break;
 		case O_ICMP6TYPE:
 			if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
 				goto bad_size;
 			break;
 #endif
 
 		default:
 			switch (cmd->opcode) {
 #ifndef INET6
 			case O_IP6_SRC_ME:
 			case O_IP6_DST_ME:
 			case O_EXT_HDR:
 			case O_IP6:
 			case O_UNREACH6:
 			case O_IP6_SRC:
 			case O_IP6_DST:
 			case O_FLOW6ID:
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 			case O_ICMP6TYPE:
 				printf("ipfw: no IPv6 support in kernel\n");
 				return EPROTONOSUPPORT;
 #endif
 			default:
 				printf("ipfw: opcode %d, unknown opcode\n",
 					cmd->opcode);
 				return EINVAL;
 			}
 		}
 	}
 	if (have_action == 0) {
 		printf("ipfw: missing action\n");
 		return EINVAL;
 	}
 	return 0;
 
 bad_size:
 	printf("ipfw: opcode %d size %d wrong\n",
 		cmd->opcode, cmdlen);
 	return EINVAL;
 }
 
 /*
  * Copy the static and dynamic rules to the supplied buffer
  * and return the amount of space actually used.
  *
  * Static rules of `chain' are copied first, under the chain read lock;
  * dynamic rules (when ipfw_dyn_v is set) follow, under the dynamic-rule
  * lock.  An entry that does not fit in the remaining space is skipped,
  * so the returned byte count never exceeds `space'.
  */
 static size_t
 ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
 {
 	char *bp = buf;
 	char *ep = bp + space;
 	struct ip_fw *rule;
 	int i;
 	time_t	boot_seconds;
 
 	boot_seconds = boottime.tv_sec;
 	/* XXX this can take a long time and locking will block packet flow */
 	IPFW_RLOCK(chain);
 	for (rule = chain->rules; rule ; rule = rule->next) {
 		/*
 		 * Verify the entry fits in the buffer in case the
 		 * rules changed between calculating buffer space and
 		 * now.  This would be better done using a generation
 		 * number but should suffice for now.
 		 */
 		i = RULESIZE(rule);
 		if (bp + i <= ep) {
 			bcopy(rule, bp, i);
 			/*
 			 * XXX HACK. Store the disable mask in the "next" pointer
 			 * in a wild attempt to keep the ABI the same.
 			 * Why do we do this on EVERY rule?
 			 */
 			bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule),
 			    sizeof(set_disable));
 			/* Non-zero timestamps are offset by the boot time. */
 			if (((struct ip_fw *)bp)->timestamp)
 				((struct ip_fw *)bp)->timestamp += boot_seconds;
 			bp += i;
 		}
 	}
 	IPFW_RUNLOCK(chain);
 	if (ipfw_dyn_v) {
 		ipfw_dyn_rule *p, *last = NULL;
 
 		IPFW_DYN_LOCK();
 		for (i = 0 ; i < curr_dyn_buckets; i++)
 			for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) {
 				if (bp + sizeof *p <= ep) {
 					ipfw_dyn_rule *dst =
 						(ipfw_dyn_rule *)bp;
 					bcopy(p, dst, sizeof *p);
 					/*
 					 * Replace the kernel pointer in
 					 * dst->rule with the parent rule
 					 * number (low bytes of the field).
 					 */
 					bcopy(&(p->rule->rulenum), &(dst->rule),
 					    sizeof(p->rule->rulenum));
 					/*
 					 * store set number into high word of
 					 * dst->rule pointer.  The offset is in
 					 * bytes, so the arithmetic must be done
 					 * on a char pointer: adding to
 					 * &dst->rule directly would scale the
 					 * offset by the pointer size and write
 					 * outside the field.
 					 */
 					bcopy(&(p->rule->set),
 					    (char *)&dst->rule +
 					    sizeof(p->rule->rulenum),
 					    sizeof(p->rule->set));
 					/*
 					 * store a non-null value in "next".
 					 * The userland code will interpret a
 					 * NULL here as a marker
 					 * for the last dynamic rule.
 					 */
 					bcopy(&dst, &dst->next, sizeof(dst));
 					last = dst;
 					/* Export the remaining lifetime, 0 if expired. */
 					dst->expire =
 					    TIME_LEQ(dst->expire, time_uptime) ?
 						0 : dst->expire - time_uptime ;
 					bp += sizeof(ipfw_dyn_rule);
 				}
 			}
 		IPFW_DYN_UNLOCK();
 		if (last != NULL) /* mark last dynamic rule */
 			bzero(&last->next, sizeof(last));
 	}
 	return (bp - (char *)buf);
 }
 
 
 /**
  * {set|get}sockopt parser.
  */
 static int
 ipfw_ctl(struct sockopt *sopt)
 {
 #define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
 	int error;
 	size_t size;
 	struct ip_fw *buf, *rule;
 	u_int32_t rulenum[2];
 
 	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
 	if (error)
 		return (error);
 
 	/*
 	 * Disallow modifications in really-really secure mode, but still allow
 	 * the logging counters to be reset.
 	 */
 	if (sopt->sopt_name == IP_FW_ADD ||
 	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
 		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
 		if (error)
 			return (error);
 	}
 
 	error = 0;
 
 	switch (sopt->sopt_name) {
 	case IP_FW_GET:
 		/*
 		 * pass up a copy of the current rules. Static rules
 		 * come first (the last of which has number IPFW_DEFAULT_RULE),
 		 * followed by a possibly empty list of dynamic rule.
 		 * The last dynamic rule has NULL in the "next" field.
 		 *
 		 * Note that the calculated size is used to bound the
 		 * amount of data returned to the user.  The rule set may
 		 * change between calculating the size and returning the
 		 * data in which case we'll just return what fits.
 		 */
 		size = static_len;	/* size of static rules */
 		if (ipfw_dyn_v)		/* add size of dyn.rules */
 			size += (dyn_count * sizeof(ipfw_dyn_rule));
 
 		/*
 		 * XXX todo: if the user passes a short length just to know
 		 * how much room is needed, do not bother filling up the
 		 * buffer, just jump to the sooptcopyout.
 		 */
 		buf = malloc(size, M_TEMP, M_WAITOK);
 		error = sooptcopyout(sopt, buf,
 				ipfw_getrules(&layer3_chain, buf, size));
 		free(buf, M_TEMP);
 		break;
 
 	case IP_FW_FLUSH:
 		/*
 		 * Normally we cannot release the lock on each iteration.
 		 * We could do it here only because we start from the head all
 		 * the times so there is no risk of missing some entries.
 		 * On the other hand, the risk is that we end up with
 		 * a very inconsistent ruleset, so better keep the lock
 		 * around the whole cycle.
 		 *
 		 * XXX this code can be improved by resetting the head of
 		 * the list to point to the default rule, and then freeing
 		 * the old list without the need for a lock.
 		 */
 
 		IPFW_WLOCK(&layer3_chain);
 		layer3_chain.reap = NULL;
 		free_chain(&layer3_chain, 0 /* keep default rule */);
 		rule = layer3_chain.reap;
 		layer3_chain.reap = NULL;
 		IPFW_WUNLOCK(&layer3_chain);
 		if (rule != NULL)
 			reap_rules(rule);
 		break;
 
 	case IP_FW_ADD:
 		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
 		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
 			sizeof(struct ip_fw) );
 		if (error == 0)
 			error = check_ipfw_struct(rule, sopt->sopt_valsize);
 		if (error == 0) {
 			error = add_rule(&layer3_chain, rule);
 			size = RULESIZE(rule);
 			if (!error && sopt->sopt_dir == SOPT_GET)
 				error = sooptcopyout(sopt, rule, size);
 		}
 		free(rule, M_TEMP);
 		break;
 
 	case IP_FW_DEL:
 		/*
 		 * IP_FW_DEL is used for deleting single rules or sets,
 		 * and (ab)used to atomically manipulate sets. Argument size
 		 * is used to distinguish between the two:
 		 *    sizeof(u_int32_t)
 		 *	delete single rule or set of rules,
 		 *	or reassign rules (or sets) to a different set.
 		 *    2*sizeof(u_int32_t)
 		 *	atomic disable/enable sets.
 		 *	first u_int32_t contains sets to be disabled,
 		 *	second u_int32_t contains sets to be enabled.
 		 */
 		error = sooptcopyin(sopt, rulenum,
 			2*sizeof(u_int32_t), sizeof(u_int32_t));
 		if (error)
 			break;
 		size = sopt->sopt_valsize;
 		if (size == sizeof(u_int32_t))	/* delete or reassign */
 			error = del_entry(&layer3_chain, rulenum[0]);
 		else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
 			set_disable =
 			    (set_disable | rulenum[0]) & ~rulenum[1] &
 			    ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
 		else
 			error = EINVAL;
 		break;
 
 	case IP_FW_ZERO:
 	case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
 		rulenum[0] = 0;
 		if (sopt->sopt_val != 0) {
 		    error = sooptcopyin(sopt, rulenum,
 			    sizeof(u_int32_t), sizeof(u_int32_t));
 		    if (error)
 			break;
 		}
 		error = zero_entry(&layer3_chain, rulenum[0],
 			sopt->sopt_name == IP_FW_RESETLOG);
 		break;
 
 	case IP_FW_TABLE_ADD:
 		{
 			ipfw_table_entry ent;
 
 			error = sooptcopyin(sopt, &ent,
 			    sizeof(ent), sizeof(ent));
 			if (error)
 				break;
 			error = add_table_entry(&layer3_chain, ent.tbl,
 			    ent.addr, ent.masklen, ent.value);
 		}
 		break;
 
 	case IP_FW_TABLE_DEL:
 		{
 			ipfw_table_entry ent;
 
 			error = sooptcopyin(sopt, &ent,
 			    sizeof(ent), sizeof(ent));
 			if (error)
 				break;
 			error = del_table_entry(&layer3_chain, ent.tbl,
 			    ent.addr, ent.masklen);
 		}
 		break;
 
 	case IP_FW_TABLE_FLUSH:
 		{
 			u_int16_t tbl;
 
 			error = sooptcopyin(sopt, &tbl,
 			    sizeof(tbl), sizeof(tbl));
 			if (error)
 				break;
 			IPFW_WLOCK(&layer3_chain);
 			error = flush_table(&layer3_chain, tbl);
 			IPFW_WUNLOCK(&layer3_chain);
 		}
 		break;
 
 	case IP_FW_TABLE_GETSIZE:
 		{
 			u_int32_t tbl, cnt;
 
 			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
 			    sizeof(tbl))))
 				break;
 			IPFW_RLOCK(&layer3_chain);
 			error = count_table(&layer3_chain, tbl, &cnt);
 			IPFW_RUNLOCK(&layer3_chain);
 			if (error)
 				break;
 			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
 		}
 		break;
 
 	case IP_FW_TABLE_LIST:
 		{
 			ipfw_table *tbl;
 
 			if (sopt->sopt_valsize < sizeof(*tbl)) {
 				error = EINVAL;
 				break;
 			}
 			size = sopt->sopt_valsize;
 			tbl = malloc(size, M_TEMP, M_WAITOK);
 			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
 			if (error) {
 				free(tbl, M_TEMP);
 				break;
 			}
 			tbl->size = (size - sizeof(*tbl)) /
 			    sizeof(ipfw_table_entry);
 			IPFW_RLOCK(&layer3_chain);
 			error = dump_table(&layer3_chain, tbl);
 			IPFW_RUNLOCK(&layer3_chain);
 			if (error) {
 				free(tbl, M_TEMP);
 				break;
 			}
 			error = sooptcopyout(sopt, tbl, size);
 			free(tbl, M_TEMP);
 		}
 		break;
 
 #ifdef IPFIREWALL_NAT
 	case IP_FW_NAT_CFG:
 	{
 		struct cfg_nat *ptr, *ser_n;
 		char *buf;
 
 		buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
 		error = sooptcopyin(sopt, buf, NAT_BUF_LEN, 
 		    sizeof(struct cfg_nat));
 		ser_n = (struct cfg_nat *)buf;
 
 		/* 
 		 * Find/create nat rule.
 		 */
 		IPFW_WLOCK(&layer3_chain);
 		ptr = lookup_nat(ser_n->id);		
 		if (ptr == NULL) {
 			/* New rule: allocate and init new instance. */
 			ptr = malloc(sizeof(struct cfg_nat), 
 		            M_IPFW, M_NOWAIT | M_ZERO);
 			if (ptr == NULL) {
 				IPFW_WUNLOCK(&layer3_chain);				
 				free(buf, M_IPFW);
 				return (ENOSPC);				
 			}
 			ptr->lib = LibAliasInit(NULL);
 			if (ptr->lib == NULL) {
 				IPFW_WUNLOCK(&layer3_chain);
 				free(ptr, M_IPFW);
 				free(buf, M_IPFW);		
 				return (EINVAL);
 			}
 			LIST_INIT(&ptr->redir_chain);
 		} else {
 			/* Entry already present: temporarly unhook it. */
 			UNHOOK_NAT(ptr);
 			flush_nat_ptrs(ser_n->id);
 		}
 		IPFW_WUNLOCK(&layer3_chain);
 
 		/* 
 		 * Basic nat configuration.
 		 */
 		ptr->id = ser_n->id;
 		/* 
 		 * XXX - what if this rule doesn't nat any ip and just 
 		 * redirect? 
 		 * do we set aliasaddress to 0.0.0.0?
 		 */
 		ptr->ip = ser_n->ip;
 		ptr->redir_cnt = ser_n->redir_cnt;
 		ptr->mode = ser_n->mode;
 		LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
 		LibAliasSetAddress(ptr->lib, ptr->ip);
 		memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
 
 		/* 
 		 * Redir and LSNAT configuration.
 		 */
 		/* Delete old cfgs. */
 		del_redir_spool_cfg(ptr, &ptr->redir_chain);
 		/* Add new entries. */
 		add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
 		free(buf, M_IPFW);
 		IPFW_WLOCK(&layer3_chain);
 		HOOK_NAT(&layer3_chain.nat, ptr);
 		IPFW_WUNLOCK(&layer3_chain);
 	}
 	break;
 
 	case IP_FW_NAT_DEL:
 	{
 		struct cfg_nat *ptr;
 		int i;
 		
 		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
 		IPFW_WLOCK(&layer3_chain);
 		ptr = lookup_nat(i);
 		if (ptr == NULL) {
 			error = EINVAL;
 			IPFW_WUNLOCK(&layer3_chain);
 			break;
 		}
 		UNHOOK_NAT(ptr);
 		flush_nat_ptrs(i);
 		IPFW_WUNLOCK(&layer3_chain);
 		del_redir_spool_cfg(ptr, &ptr->redir_chain);
 		LibAliasUninit(ptr->lib);
 		free(ptr, M_IPFW);
 	}
 	break;
 
 	case IP_FW_NAT_GET_CONFIG:
 	{
 		uint8_t *data;
 		struct cfg_nat *n;
 		struct cfg_redir *r;
 		struct cfg_spool *s;
 		int nat_cnt, off;
 		
 		nat_cnt = 0;
 		off = sizeof(nat_cnt);
 
 		data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
 		IPFW_RLOCK(&layer3_chain);
 		/* Serialize all the data. */
 		LIST_FOREACH(n, &layer3_chain.nat, _next) {
 			nat_cnt++;
 			if (off + SOF_NAT < NAT_BUF_LEN) {
 				bcopy(n, &data[off], SOF_NAT);
 				off += SOF_NAT;
 				LIST_FOREACH(r, &n->redir_chain, _next) {
 					if (off + SOF_REDIR < NAT_BUF_LEN) {
 						bcopy(r, &data[off], 
 						    SOF_REDIR);
 						off += SOF_REDIR;
 						LIST_FOREACH(s, &r->spool_chain, 
 						    _next) {							     
 							if (off + SOF_SPOOL < 
 							    NAT_BUF_LEN) {
 								bcopy(s, 
 								    &data[off],
 								    SOF_SPOOL);
 								off += 
 								    SOF_SPOOL;
 							} else
 								goto nospace;
 						}
 					} else
 						goto nospace;
 				}
 			} else
 				goto nospace;
 		}
 		bcopy(&nat_cnt, data, sizeof(nat_cnt));
 		IPFW_RUNLOCK(&layer3_chain);
 		error = sooptcopyout(sopt, data, NAT_BUF_LEN);
 		free(data, M_IPFW);
 		break;
 	nospace:
 		IPFW_RUNLOCK(&layer3_chain);
 		printf("serialized data buffer not big enough:"
 		    "please increase NAT_BUF_LEN\n");
 		free(data, M_IPFW);
 	}
 	break;
 
 	case IP_FW_NAT_GET_LOG:
 	{
 		uint8_t *data;
 		struct cfg_nat *ptr;
 		int i, size, cnt, sof;
 
 		data = NULL;
 		sof = LIBALIAS_BUF_SIZE;
 		cnt = 0;
 
 		IPFW_RLOCK(&layer3_chain);
 		size = i = 0;
 		LIST_FOREACH(ptr, &layer3_chain.nat, _next) {
 			if (ptr->lib->logDesc == NULL) 
 				continue;
 			cnt++;
 			size = cnt * (sof + sizeof(int));
 			data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO);
 			if (data == NULL) {
 				IPFW_RUNLOCK(&layer3_chain);
 				return (ENOSPC);
 			}
 			bcopy(&ptr->id, &data[i], sizeof(int));
 			i += sizeof(int);
 			bcopy(ptr->lib->logDesc, &data[i], sof);
 			i += sof;
 		}
 		IPFW_RUNLOCK(&layer3_chain);
 		error = sooptcopyout(sopt, data, size);
 		free(data, M_IPFW);
 	}
 	break;
 #endif
 
 	default:
 		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
 		error = EINVAL;
 	}
 
 	return (error);
 #undef RULE_MAXSIZE
 }
 
 /**
  * dummynet needs a reference to the default rule, because rules can be
  * deleted while packets hold a reference to them. When this happens,
  * dummynet changes the reference to the default rule (it could well be a
  * NULL pointer, but this way we do not need to check for the special
  * case, plus here we have info on the default behaviour).
  */
 struct ip_fw *ip_fw_default_rule;
 
 /*
  * Keepalive timer handler.  Runs every dyn_keepalive_period seconds,
  * builds keepalive packets for the eligible dynamic TCP rules and then
  * re-arms itself.
  */
 static void
 ipfw_tick(void * __unused unused)
 {
 	struct mbuf *head, *pkt, *following, **tailp;
 	ipfw_dyn_rule *dyn;
 	int bucket;
 
 	if (dyn_keepalive != 0 && ipfw_dyn_v != NULL && dyn_count != 0) {
 		/*
 		 * The packets are only queued while the dynamic rule lock
 		 * is held and transmitted once it has been dropped: sending
 		 * them with the lock held would result in a lock order
 		 * reversal with the normal packet input -> ipfw call stack.
 		 */
 		head = NULL;
 		tailp = &head;
 		IPFW_DYN_LOCK();
 		for (bucket = 0; bucket < curr_dyn_buckets; bucket++) {
 			for (dyn = ipfw_dyn_v[bucket]; dyn != NULL;
 			    dyn = dyn->next) {
 				/* Only established TCP flows get keepalives. */
 				if (dyn->dyn_type == O_LIMIT_PARENT ||
 				    dyn->id.proto != IPPROTO_TCP ||
 				    (dyn->state & BOTH_SYN) != BOTH_SYN)
 					continue;
 				/* Too early, or too late (rule expired). */
 				if (TIME_LEQ(time_uptime +
 				    dyn_keepalive_interval, dyn->expire) ||
 				    TIME_LEQ(dyn->expire, time_uptime))
 					continue;
 
 				/* One packet per direction, appended in order. */
 				*tailp = send_pkt(NULL, &(dyn->id),
 				    dyn->ack_rev - 1, dyn->ack_fwd, TH_SYN);
 				if (*tailp != NULL)
 					tailp = &(*tailp)->m_nextpkt;
 				*tailp = send_pkt(NULL, &(dyn->id),
 				    dyn->ack_fwd - 1, dyn->ack_rev, 0);
 				if (*tailp != NULL)
 					tailp = &(*tailp)->m_nextpkt;
 			}
 		}
 		IPFW_DYN_UNLOCK();
 
 		/* Detach each queued packet from the chain and send it. */
 		pkt = head;
 		while (pkt != NULL) {
 			following = pkt->m_nextpkt;
 			pkt->m_nextpkt = NULL;
 			ip_output(pkt, NULL, NULL, 0, NULL, NULL);
 			pkt = following;
 		}
 	}
 
 	callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL);
 }
 
 /*
  * Set up the firewall: locks, the dynamic rule zone, the keepalive
  * callout, the default rule and the lookup tables, then install the
  * ip_fw_ctl_ptr/ip_fw_chk_ptr hooks and start the keepalive timer.
  *
  * Returns 0 on success, or the error reported by add_rule() or
  * init_tables(); in both failure cases the locks and the zone created
  * here are destroyed again before returning.
  *
  * NOTE(review): the failure paths do not call sysctl_ctx_free() on the
  * INET6 sysctl context, which ipfw_destroy() does -- confirm whether
  * that is intentional.
  */
 int
 ipfw_init(void)
 {
 	struct ip_fw default_rule;
 	int error;
 
 #ifdef INET6
 	/* Setup IPv6 fw sysctl tree. */
 	sysctl_ctx_init(&ip6_fw_sysctl_ctx);
 	ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw",
 	    CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall");
 	SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree),
 	    OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
 	    &fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6");
 	SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree),
 	    OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE,
 	    &fw_deny_unknown_exthdrs, 0,
 	    "Deny packets with unknown IPv6 Extension Headers");
 #endif
 
 	/* Start with an empty chain; the default rule is added below. */
 	layer3_chain.rules = NULL;
 	IPFW_LOCK_INIT(&layer3_chain);
 	ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
 	    sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	IPFW_DYN_LOCK_INIT();
-	callout_init(&ipfw_timeout, NET_CALLOUT_MPSAFE);
+	callout_init(&ipfw_timeout, CALLOUT_MPSAFE);
 
 	bzero(&default_rule, sizeof default_rule);
 
 	/* The default rule consists of a single one-word action opcode. */
 	default_rule.act_ofs = 0;
 	default_rule.rulenum = IPFW_DEFAULT_RULE;
 	default_rule.cmd_len = 1;
 	default_rule.set = RESVD_SET;
 
 	default_rule.cmd[0].len = 1;
 	/*
 	 * With IPFIREWALL_DEFAULT_TO_ACCEPT defined the expression below
 	 * reads "1 ? O_ACCEPT : O_DENY"; without it, just "O_DENY".
 	 */
 	default_rule.cmd[0].opcode =
 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
 				1 ? O_ACCEPT :
 #endif
 				O_DENY;
 
 	error = add_rule(&layer3_chain, &default_rule);
 	if (error != 0) {
 		printf("ipfw2: error %u initializing default rule "
 			"(support disabled)\n", error);
 		IPFW_DYN_LOCK_DESTROY();
 		IPFW_LOCK_DESTROY(&layer3_chain);
 		uma_zdestroy(ipfw_dyn_rule_zone);
 		return (error);
 	}
 
 	/* The chain held no rules before, so its head is the default rule. */
 	ip_fw_default_rule = layer3_chain.rules;
 	/*
 	 * Banner: the first %s is the divert status, the second the default
 	 * policy; the logging status is appended by the printf calls below.
 	 */
 	printf("ipfw2 "
 #ifdef INET6
 		"(+ipv6) "
 #endif
 		"initialized, divert %s, "
 		"rule-based forwarding "
 #ifdef IPFIREWALL_FORWARD
 		"enabled, "
 #else
 		"disabled, "
 #endif
 		"default to %s, logging ",
 #ifdef IPDIVERT
 		"enabled",
 #else
 		"loadable",
 #endif
 		default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny");
 
 #ifdef IPFIREWALL_VERBOSE
 	fw_verbose = 1;
 #endif
 #ifdef IPFIREWALL_VERBOSE_LIMIT
 	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
 #endif
 	if (fw_verbose == 0)
 		printf("disabled\n");
 	else if (verbose_limit == 0)
 		printf("unlimited\n");
 	else
 		printf("limited to %d packets/entry by default\n",
 		    verbose_limit);
 
 	error = init_tables(&layer3_chain);
 	if (error) {
 		IPFW_DYN_LOCK_DESTROY();
 		IPFW_LOCK_DESTROY(&layer3_chain);
 		uma_zdestroy(ipfw_dyn_rule_zone);
 		return (error);
 	}
 	/* Publish the entry points and schedule the first ipfw_tick(). */
 	ip_fw_ctl_ptr = ipfw_ctl;
 	ip_fw_chk_ptr = ipfw_chk;
 	callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL);	
 #ifdef IPFIREWALL_NAT
 	LIST_INIT(&layer3_chain.nat);
 	ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, 
 	    NULL, EVENTHANDLER_PRI_ANY);
 #endif
 	return (0);
 }
 
 /*
  * Tear the firewall down: clear the ip_fw_chk_ptr/ip_fw_ctl_ptr hooks,
  * drain the keepalive callout, then flush the tables, the NAT instances
  * (when compiled in) and every rule including the default one, and
  * finally destroy the locks, the dynamic rule zone and the INET6 sysctl
  * context.
  */
 void
 ipfw_destroy(void)
 {
 	struct ip_fw *reap;
 #ifdef IPFIREWALL_NAT
 	struct cfg_nat *ptr, *ptr_temp;
 #endif
 
 	/* Clear the hooks first, then wait for the timer to finish. */
 	ip_fw_chk_ptr = NULL;
 	ip_fw_ctl_ptr = NULL;
 	callout_drain(&ipfw_timeout);
 	IPFW_WLOCK(&layer3_chain);
 	flush_tables(&layer3_chain);
 #ifdef IPFIREWALL_NAT
 	/* The _SAFE variant is needed because each entry is freed. */
 	LIST_FOREACH_SAFE(ptr, &layer3_chain.nat, _next, ptr_temp) {
 		LIST_REMOVE(ptr, _next);
 		del_redir_spool_cfg(ptr, &ptr->redir_chain);
 		LibAliasUninit(ptr->lib);
 		free(ptr, M_IPFW);
 	}
 	EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag);
 #endif
 	layer3_chain.reap = NULL;
 	free_chain(&layer3_chain, 1 /* kill default rule */);
 	/* Take over the reap list; it is released after unlocking. */
 	reap = layer3_chain.reap, layer3_chain.reap = NULL;
 	IPFW_WUNLOCK(&layer3_chain);
 	if (reap != NULL)
 		reap_rules(reap);
 	IPFW_DYN_LOCK_DESTROY();
 	uma_zdestroy(ipfw_dyn_rule_zone);
 	IPFW_LOCK_DESTROY(&layer3_chain);
 
 #ifdef INET6
 	/* Free IPv6 fw sysctl tree. */
 	sysctl_ctx_free(&ip6_fw_sysctl_ctx);
 #endif
 
 	printf("IP firewall unloaded\n");
 }
Index: head/sys/netinet/ip_mroute.c
===================================================================
--- head/sys/netinet/ip_mroute.c	(revision 171636)
+++ head/sys/netinet/ip_mroute.c	(revision 171637)
@@ -1,3156 +1,3156 @@
 /*-
  * Copyright (c) 1989 Stephen Deering
  * Copyright (c) 1992, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
  */
 
 /*
  * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  * Modified by Van Jacobson, LBL, January 1993
  * Modified by Ajit Thyagarajan, PARC, August 1993
  * Modified by Bill Fenner, PARC, April 1995
  * Modified by Ahmed Helmy, SGI, June 1996
  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
  * Modified by Hitoshi Asaeda, WIDE, August 2000
  * Modified by Pavlin Radoslavov, ICSI, October 2002
  *
  * MROUTING Revision: 3.5
  * and PIM-SMv2 and PIM-DM support, advanced API support,
  * bandwidth metering and signaling
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mac.h"
 #include "opt_mrouting.h"
 
 #define _PIM_VT 1
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <net/if.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <netinet/in.h>
 #include <netinet/igmp.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/pim.h>
 #include <netinet/pim_var.h>
 #include <netinet/udp.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/ip6_var.h>
 #endif
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Control debugging code for rsvp and multicast routing code.
  * Can only set them with the debugger.
  */
 static u_int    rsvpdebug;		/* non-zero enables debugging	*/
 
 static u_int	mrtdebug;		/* any set of the flags below	*/
 #define		DEBUG_MFC	0x02
 #define		DEBUG_FORWARD	0x04
 #define		DEBUG_EXPIRE	0x08
 #define		DEBUG_XMIT	0x10
 #define		DEBUG_PIM	0x20
 
 /* Sentinel vif index: -1 converted to vifi_t. */
 #define		VIFI_INVALID	((vifi_t) -1)
 
 /* Non-zero when mbuf `m' has M_EXT set in its m_flags. */
 #define M_HASCL(m)	((m)->m_flags & M_EXT)
 
 /* malloc(9) type used for the multicast routing tables. */
 static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables");
 
 /*
  * Locking.  We use two locks: one for the virtual interface table and
  * one for the forwarding table.  These locks may be nested in which case
  * the VIF lock must always be taken first.  Note that each lock is used
  * to cover not only the specific data structure but also related data
  * structures.  It may be better to add more fine-grained locking later;
  * it's not clear how performance-critical this code is.
  *
  * NOTE(review): three mutexes are declared below (mrouter_mtx, mfc_mtx
  * and vif_mtx); the "two locks" above appears to leave out mrouter_mtx
  * -- confirm how it nests with the other two.
  *
  * XXX: This module could particularly benefit from being cleaned
  *      up to use the <sys/queue.h> macros.
  *
  */
 
 /* Statistics, exported read-write as the net.inet.ip "mrtstat" sysctl. */
 static struct mrtstat	mrtstat;
 SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
     &mrtstat, mrtstat,
     "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)");
 
 /* Forwarding table: MFCTBLSIZ pointers to struct mfc, exported read-only. */
 static struct mfc	*mfctable[MFCTBLSIZ];
 SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
     &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]",
     "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)");
 
 /* Mutex named "IPv4 multicast forwarding"; its assert also checks Giant. */
 static struct mtx mrouter_mtx;
 #define	MROUTER_LOCK()		mtx_lock(&mrouter_mtx)
 #define	MROUTER_UNLOCK()	mtx_unlock(&mrouter_mtx)
 #define	MROUTER_LOCK_ASSERT()	do {					\
 	mtx_assert(&mrouter_mtx, MA_OWNED);				\
 	NET_ASSERT_GIANT();						\
 } while (0)
 #define	MROUTER_LOCK_INIT()	\
 	mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
 #define	MROUTER_LOCK_DESTROY()	mtx_destroy(&mrouter_mtx)
 
 /* Mutex named "mroute mfc table"; its assert also checks Giant. */
 static struct mtx mfc_mtx;
 #define	MFC_LOCK()	mtx_lock(&mfc_mtx)
 #define	MFC_UNLOCK()	mtx_unlock(&mfc_mtx)
 #define	MFC_LOCK_ASSERT()	do {					\
 	mtx_assert(&mfc_mtx, MA_OWNED);					\
 	NET_ASSERT_GIANT();						\
 } while (0)
 #define	MFC_LOCK_INIT()	mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF)
 #define	MFC_LOCK_DESTROY()	mtx_destroy(&mfc_mtx)
 
 /* Virtual interface table: MAXVIFS entries, exported read-only. */
 static struct vif	viftable[MAXVIFS];
 SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
     &viftable, sizeof(viftable), "S,vif[MAXVIFS]",
     "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
 
 /*
  * Mutex named "mroute vif table".  Unlike the two asserts above,
  * VIF_LOCK_ASSERT() only checks mutex ownership.
  */
 static struct mtx vif_mtx;
 #define	VIF_LOCK()	mtx_lock(&vif_mtx)
 #define	VIF_UNLOCK()	mtx_unlock(&vif_mtx)
 #define	VIF_LOCK_ASSERT()	mtx_assert(&vif_mtx, MA_OWNED)
 #define	VIF_LOCK_INIT()	mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF)
 #define	VIF_LOCK_DESTROY()	mtx_destroy(&vif_mtx)
 
 /*
  * One counter per mfctable bucket (same MFCTBLSIZ dimension).
  * NOTE(review): presumably the number of entries awaiting expiry in
  * that bucket -- confirm against expire_upcalls().
  */
 static u_char		nexpire[MFCTBLSIZ];
 
 /* NOTE(review): presumably the registration tag for if_detached_event(). */
 static eventhandler_tag if_detach_event_tag = NULL;
 
 /* NOTE(review): presumably the callout that runs expire_upcalls(). */
 static struct callout expire_upcalls_ch;
 
 #define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
 #define		UPCALL_EXPIRE	6		/* number of timeouts	*/
 
 /* TTL placed in the pim_encap_iphdr template. */
 #define ENCAP_TTL 64
 
 /*
  * Bandwidth meter variables and constants
  */
 static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
 /*
  * Pending timeouts are stored in a hash table, the key being the
  * expiration time. Periodically, the entries are analysed and processed.
  */
 #define BW_METER_BUCKETS	1024
 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
 static struct callout bw_meter_ch;
 #define BW_METER_PERIOD (hz)		/* periodical handling of bw meters */
 
 /*
  * Pending upcalls are stored in a vector which is flushed when
  * full, or periodically
  */
 static struct bw_upcall	bw_upcalls[BW_UPCALLS_MAX];
 static u_int	bw_upcalls_n; /* # of pending upcalls */
 static struct callout bw_upcalls_ch;
 #define BW_UPCALLS_PERIOD (hz)		/* periodical flush of bw upcalls */
 
 /* PIM statistics, exported read-only as the net.inet.pim "stats" sysctl. */
 static struct pimstat pimstat;
 
 SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
 SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
     &pimstat, pimstat,
     "PIM Statistics (struct pimstat, netinet/pim_var.h)");
 
 /* Read-write tunable, off (0) by default; see the sysctl description. */
 static u_long	pim_squelch_wholepkt = 0;
 SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
     &pim_squelch_wholepkt, 0,
     "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
 
 /*
  * Raw-socket protocol switch entry for IPPROTO_PIM in the inet domain:
  * input is handled by pim_input(), while output, socket options and
  * user requests use the rip_* handlers.
  */
 extern  struct domain inetdomain;
 struct protosw in_pim_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_PIM,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		pim_input,
 	.pr_output =		(pr_output_t*)rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 /* NOTE(review): presumably the handle for the PIM encap registration. */
 static const struct encaptab *pim_encap_cookie;
 
 #ifdef INET6
 /* ip6_mroute.c glue */
 extern struct in6_protosw in6_pim_protosw;
 /* NOTE(review): presumably the IPv6 counterpart of pim_encap_cookie. */
 static const struct encaptab *pim6_encap_cookie;
 
 /* External IPv6 multicast routing entry points, declared here by hand. */
 extern int X_ip6_mrouter_set(struct socket *, struct sockopt *);
 extern int X_ip6_mrouter_get(struct socket *, struct sockopt *);
 extern int X_ip6_mrouter_done(void);
 extern int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
 extern int X_mrt6_ioctl(int, caddr_t);
 #endif
 
 static int pim_encapcheck(const struct mbuf *, int, int, void *);
 
 /*
  * Note: the PIM Register encapsulation adds the following in front of a
  * data packet:
  *
  * struct pim_encap_hdr {
  *    struct ip ip;
  *    struct pim_encap_pimhdr  pim;
  * }
  *
  */
 
 /* PIM part of the encapsulation: a struct pim and a 32-bit flags word. */
 struct pim_encap_pimhdr {
 	struct pim pim;
 	uint32_t   flags;
 };
 
 /*
  * Template IP header for the encapsulation.  The initializers are
  * positional; the first two are swapped according to BYTE_ORDER, and
  * members after the checksum are not listed, so they are zero.
  */
 static struct ip pim_encap_iphdr = {
 #if BYTE_ORDER == LITTLE_ENDIAN
 	sizeof(struct ip) >> 2,
 	IPVERSION,
 #else
 	IPVERSION,
 	sizeof(struct ip) >> 2,
 #endif
 	0,			/* tos */
 	sizeof(struct ip),	/* total length */
 	0,			/* id */
 	0,			/* frag offset */
 	ENCAP_TTL,
 	IPPROTO_PIM,
 	0,			/* checksum */
 };
 
 /* Template PIM header: PIM_VERSION / PIM_REGISTER, everything else zero. */
 static struct pim_encap_pimhdr pim_encap_pimhdr = {
     {
 	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
 	0,			/* reserved */
 	0,			/* checksum */
     },
     0				/* flags */
 };
 
 /*
  * NOTE(review): presumably the pseudo-interface backing the PIM register
  * vif, with reg_vif_num holding its vif index (VIFI_INVALID while unset)
  * -- confirm against add_vif().
  */
 static struct ifnet multicast_register_if;
 static vifi_t reg_vif_num = VIFI_INVALID;
 
 /*
  * Private variables.
  */
 static vifi_t	   numvifs;
 
 /* Forward declarations of the X_* entry points. */
 static u_long	X_ip_mcast_src(int vifi);
 static int	X_ip_mforward(struct ip *ip, struct ifnet *ifp,
 			struct mbuf *m, struct ip_moptions *imo);
 static int	X_ip_mrouter_done(void);
 static int	X_ip_mrouter_get(struct socket *so, struct sockopt *m);
 static int	X_ip_mrouter_set(struct socket *so, struct sockopt *m);
 static int	X_legal_vif_num(int vif);
 static int	X_mrt_ioctl(int cmd, caddr_t data);
 
 /* Forward declarations of the file-local helpers. */
 static int get_sg_cnt(struct sioc_sg_req *);
 static int get_vif_cnt(struct sioc_vif_req *);
 static void if_detached_event(void *arg __unused, struct ifnet *);
 static int ip_mrouter_init(struct socket *, int);
 static int add_vif(struct vifctl *);
 static int del_vif_locked(vifi_t);
 static int del_vif(vifi_t);
 static int add_mfc(struct mfcctl2 *);
 static int del_mfc(struct mfcctl2 *);
 static int set_api_config(uint32_t *); /* choose API capabilities */
 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
 static int set_assert(int);
 static void expire_upcalls(void *);
 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
 static void send_packet(struct vif *, struct mbuf *);
 
 /*
  * Bandwidth monitoring
  */
 static void free_bw_list(struct bw_meter *list);
 static int add_bw_upcall(struct bw_upcall *);
 static int del_bw_upcall(struct bw_upcall *);
 static void bw_meter_receive_packet(struct bw_meter *x, int plen,
 		struct timeval *nowp);
 static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp);
 static void bw_upcalls_send(void);
 static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp);
 static void unschedule_bw_meter(struct bw_meter *x);
 static void bw_meter_process(void);
 static void expire_bw_upcalls_send(void *);
 static void expire_bw_meter_process(void *);
 
 /*
  * PIM register transmission
  */
 static int pim_register_send(struct ip *, struct vif *,
 		struct mbuf *, struct mfc *);
 static int pim_register_send_rp(struct ip *, struct vif *,
 		struct mbuf *, struct mfc *);
 static int pim_register_send_upcall(struct ip *, struct vif *,
 		struct mbuf *, struct mfc *);
 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
 
 /*
  * whether or not special PIM assert processing is enabled.
  */
 static int pim_assert;
 /*
  * Rate limit for assert notification messages, in usec
  */
 #define ASSERT_MSG_TIME		3000000
 
 /*
  * Kernel multicast routing API capabilities and setup.
  * If more API capabilities are added to the kernel, they should be
  * recorded in `mrt_api_support'.
  */
 static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
 					 MRT_MFC_FLAGS_BORDER_VIF |
 					 MRT_MFC_RP |
 					 MRT_MFC_BW_UPCALL);
 static uint32_t mrt_api_config = 0;
 
 /*
  * Hash function for a (source, group) entry: XORs each address with itself
  * shifted right by 20 and by 10 bits, XORs the two results together and
  * reduces the value with MFCHASHMOD.  The result indexes mfctable[] and
  * nexpire[].
  */
 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
 			((g) >> 20) ^ ((g) >> 10) ^ (g))
 
 /*
  * Look up the forwarding cache entry for origin address `o' and multicast
  * group `g'.  Entries that still have packets queued on mfc_stall are
  * skipped.  Returns the entry, or NULL if there is none.
  * The caller must hold the MFC lock and is responsible for updating
  * mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses if needed.
  */
 static struct mfc *
 mfc_find(in_addr_t o, in_addr_t g)
 {
     struct mfc *entry;
 
     MFC_LOCK_ASSERT();
 
     entry = mfctable[MFCHASH(o,g)];
     while (entry != NULL) {
 	if (entry->mfc_stall == NULL &&
 	    entry->mfc_origin.s_addr == o &&
 	    entry->mfc_mcastgrp.s_addr == g)
 	    return entry;
 	entry = entry->mfc_next;
     }
     return NULL;
 }
 
 /*
  * Macros to compute elapsed time efficiently
  * Borrowed from Van Jacobson's scheduling code
  *
  * TV_DELTA(a, b, delta) stores (a - b) in microseconds into `delta'.
  * Differences of exactly one or two seconds are handled by additions;
  * any other non-zero difference in seconds uses a multiplication.
  * The expansion is a brace-enclosed block that declares a local `xxs',
  * and `a', `b' and `delta' are each evaluated more than once.
  */
 #define TV_DELTA(a, b, delta) {					\
 	int xxs;						\
 	delta = (a).tv_usec - (b).tv_usec;			\
 	if ((xxs = (a).tv_sec - (b).tv_sec)) {			\
 		switch (xxs) {					\
 		case 2:						\
 		      delta += 1000000;				\
 		      /* FALLTHROUGH */				\
 		case 1:						\
 		      delta += 1000000;				\
 		      break;					\
 		default:					\
 		      delta += (1000000 * xxs);			\
 		}						\
 	}							\
 }
 
 /*
  * TV_LT(a, b) is true when timeval `a' is strictly earlier than `b':
  * either the seconds are smaller, or the seconds are not larger and the
  * microseconds are smaller.
  */
 #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
 	      (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
 
 /*
  * Handle MRT setsockopt commands to modify the multicast routing tables.
  */
 static int
 X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
 {
     int	error, optval;
     vifi_t	vifi;
     struct	vifctl vifc;
     struct	mfcctl2 mfc;
     struct	bw_upcall bw_upcall;
     uint32_t	i;
 
     if (so != ip_mrouter && sopt->sopt_name != MRT_INIT)
 	return EPERM;
 
     error = 0;
     switch (sopt->sopt_name) {
     case MRT_INIT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	error = ip_mrouter_init(so, optval);
 	break;
 
     case MRT_DONE:
 	error = ip_mrouter_done();
 	break;
 
     case MRT_ADD_VIF:
 	error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
 	if (error)
 	    break;
 	error = add_vif(&vifc);
 	break;
 
     case MRT_DEL_VIF:
 	error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
 	if (error)
 	    break;
 	error = del_vif(vifi);
 	break;
 
     case MRT_ADD_MFC:
     case MRT_DEL_MFC:
 	/*
 	 * select data size depending on API version.
 	 */
 	if (sopt->sopt_name == MRT_ADD_MFC &&
 		mrt_api_config & MRT_API_FLAGS_ALL) {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
 				sizeof(struct mfcctl2));
 	} else {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
 				sizeof(struct mfcctl));
 	    bzero((caddr_t)&mfc + sizeof(struct mfcctl),
 			sizeof(mfc) - sizeof(struct mfcctl));
 	}
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_MFC)
 	    error = add_mfc(&mfc);
 	else
 	    error = del_mfc(&mfc);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	set_assert(optval);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
 	if (!error)
 	    error = set_api_config(&i);
 	if (!error)
 	    error = sooptcopyout(sopt, &i, sizeof i);
 	break;
 
     case MRT_ADD_BW_UPCALL:
     case MRT_DEL_BW_UPCALL:
 	error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
 				sizeof bw_upcall);
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
 	    error = add_bw_upcall(&bw_upcall);
 	else
 	    error = del_bw_upcall(&bw_upcall);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle MRT getsockopt commands
  */
 static int
 X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
 {
     int error;
     static int version = 0x0305; /* !!! why is this here? XXX */
 
     switch (sopt->sopt_name) {
     case MRT_VERSION:
 	error = sooptcopyout(sopt, &version, sizeof version);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert);
 	break;
 
     case MRT_API_SUPPORT:
 	error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle ioctl commands to obtain information from the cache.
  *
  * SIOCGETVIFCNT and SIOCGETSGCNT are dispatched to get_vif_cnt() and
  * get_sg_cnt(); any other command returns EINVAL.
  */
 static int
 X_mrt_ioctl(int cmd, caddr_t data)
 {
     int rc;
 
     /*
      * Currently the only function calling this ioctl routine is rtioctl().
      * Typically, only root can create the raw socket in order to execute
      * this ioctl method, however the request might be coming from a prison
      */
     rc = priv_check(curthread, PRIV_NETINET_MROUTE);
     if (rc != 0)
 	return (rc);
 
     switch (cmd) {
     case (SIOCGETVIFCNT):
 	return get_vif_cnt((struct sioc_vif_req *)data);
 
     case (SIOCGETSGCNT):
 	return get_sg_cnt((struct sioc_sg_req *)data);
 
     default:
 	return EINVAL;
     }
 }
 
 /*
  * Report the packet, byte and wrong-interface counters of the forwarding
  * cache entry for the (source, group) pair in `req'.  When no entry exists
  * all three counters are set to 0xffffffff and EADDRNOTAVAIL is returned.
  */
 static int
 get_sg_cnt(struct sioc_sg_req *req)
 {
     struct mfc *entry;
 
     MFC_LOCK();
     entry = mfc_find(req->src.s_addr, req->grp.s_addr);
     if (entry != NULL) {
 	req->pktcnt = entry->mfc_pkt_cnt;
 	req->bytecnt = entry->mfc_byte_cnt;
 	req->wrong_if = entry->mfc_wrong_if;
     }
     MFC_UNLOCK();
 
     if (entry == NULL) {
 	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
 	return EADDRNOTAVAIL;
     }
     return 0;
 }
 
 /*
  * returns the input and output packet and byte counts on the vif provided
  */
 static int
 get_vif_cnt(struct sioc_vif_req *req)
 {
     vifi_t vifi = req->vifi;
 
     VIF_LOCK();
     if (vifi >= numvifs) {
 	VIF_UNLOCK();
 	return EINVAL;
     }
 
     req->icount = viftable[vifi].v_pkt_in;
     req->ocount = viftable[vifi].v_pkt_out;
     req->ibytes = viftable[vifi].v_bytes_in;
     req->obytes = viftable[vifi].v_bytes_out;
     VIF_UNLOCK();
 
     return 0;
 }
 
 /*
  * Put the module-level forwarding state back to its initial values: clear
  * the forwarding cache hash table and the per-bucket expiry counters, turn
  * off PIM assert processing and the API configuration, clear the bandwidth
  * upcall count and meter timers, and initialize the three callouts.
  */
 static void
 ip_mrouter_reset(void)
 {
     bzero((caddr_t)mfctable, sizeof(mfctable));
     bzero((caddr_t)nexpire, sizeof(nexpire));
 
     pim_assert = 0;
     mrt_api_config = 0;
 
-    callout_init(&expire_upcalls_ch, NET_CALLOUT_MPSAFE);
+    callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE);
 
     bw_upcalls_n = 0;
     bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers));
-    callout_init(&bw_upcalls_ch, NET_CALLOUT_MPSAFE);
-    callout_init(&bw_meter_ch, NET_CALLOUT_MPSAFE);
+    callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE);
+    callout_init(&bw_meter_ch, CALLOUT_MPSAFE);
 }
 
 /*
  * ifnet departure handler (registered by ip_mrouter_init()).
  *
  * Removes every vif bound to the departing interface `ifp', together with
  * the forwarding cache entries whose parent is one of those vifs.  Does
  * nothing when no multicast router socket is registered.
  */
 static void
 if_detached_event(void *arg __unused, struct ifnet *ifp)
 {
     vifi_t vifi;
     int i;
     struct mfc *mfc;
     struct mfc *nmfc;
     struct mfc **ppmfc;	/* Pointer to previous node's next-pointer */
     struct rtdetq *pq;
     struct rtdetq *npq;
 
     MROUTER_LOCK();
     if (ip_mrouter == NULL) {
 	/*
 	 * Multicast routing is not active: there is nothing to tear down.
 	 * Return here so the router lock is not released a second time
 	 * at the bottom of the function.
 	 */
 	MROUTER_UNLOCK();
 	return;
     }
 
     /*
      * Tear down multicast forwarder state associated with this ifnet.
      * 1. Walk the vif list, matching vifs against this ifnet.
      * 2. Walk the multicast forwarding cache (mfc) looking for
      *    inner matches with this vif's index.
      * 3. Free any pending mbufs for this mfc.
      * 4. Free the associated mfc entry and state associated with this vif.
      *    Be very careful about unlinking from a singly-linked list whose
      *    "head node" is a pointer in a simple array.
      * 5. Free vif state. This should disable ALLMULTI on the interface.
      */
     VIF_LOCK();
     MFC_LOCK();
     for (vifi = 0; vifi < numvifs; vifi++) {
 	if (viftable[vifi].v_ifp != ifp)
 		continue;
 	for (i = 0; i < MFCTBLSIZ; i++) {
 	    ppmfc = &mfctable[i];
 	    for (mfc = mfctable[i]; mfc != NULL; ) {
 		/* Remember the successor: mfc may be freed below. */
 		nmfc = mfc->mfc_next;
 		if (mfc->mfc_parent == vifi) {
 		    for (pq = mfc->mfc_stall; pq != NULL; ) {
 			npq = pq->next;
 			m_freem(pq->m);
 			free(pq, M_MRTABLE);
 			pq = npq;
 		    }
 		    free_bw_list(mfc->mfc_bw_meter);
 		    free(mfc, M_MRTABLE);
 		    *ppmfc = nmfc;
 		} else {
 		    ppmfc = &mfc->mfc_next;
 		}
 		mfc = nmfc;
 	    }
 	}
 	del_vif_locked(vifi);
     }
     MFC_UNLOCK();
     VIF_UNLOCK();
 
     MROUTER_UNLOCK();
 }
                         
 /*
  * Enable multicast routing
  */
 static int
 ip_mrouter_init(struct socket *so, int version)
 {
     if (mrtdebug)
 	log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
 	    so->so_type, so->so_proto->pr_protocol);
 
     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
 	return EOPNOTSUPP;
 
     if (version != 1)
 	return ENOPROTOOPT;
 
     MROUTER_LOCK();
 
     if (ip_mrouter != NULL) {
 	MROUTER_UNLOCK();
 	return EADDRINUSE;
     }
 
     if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, 
         if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
     if (if_detach_event_tag == NULL) {
 	MROUTER_UNLOCK();
 	return (ENOMEM);
     }
 
     callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
 
     callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
 	expire_bw_upcalls_send, NULL);
     callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL);
 
     ip_mrouter = so;
 
     MROUTER_UNLOCK();
 
     if (mrtdebug)
 	log(LOG_DEBUG, "ip_mrouter_init\n");
 
     return 0;
 }
 
 /*
  * Disable multicast routing
  */
 static int
 X_ip_mrouter_done(void)
 {
     vifi_t vifi;
     int i;
     struct ifnet *ifp;
     struct ifreq ifr;
     struct mfc *rt;
     struct rtdetq *rte;
 
     MROUTER_LOCK();
 
     if (ip_mrouter == NULL) {
 	MROUTER_UNLOCK();
 	return EINVAL;
     }
 
     /*
      * Detach/disable hooks to the reset of the system.
      */
     ip_mrouter = NULL;
     mrt_api_config = 0;
 
     VIF_LOCK();
     /*
      * For each phyint in use, disable promiscuous reception of all IP
      * multicasts.
      */
     for (vifi = 0; vifi < numvifs; vifi++) {
 	if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
 		!(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
 	    struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr);
 
 	    so->sin_len = sizeof(struct sockaddr_in);
 	    so->sin_family = AF_INET;
 	    so->sin_addr.s_addr = INADDR_ANY;
 	    ifp = viftable[vifi].v_ifp;
 	    if_allmulti(ifp, 0);
 	}
     }
     bzero((caddr_t)viftable, sizeof(viftable));
     numvifs = 0;
     pim_assert = 0;
     VIF_UNLOCK();
     EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
 
     /*
      * Free all multicast forwarding cache entries.
      */
     callout_stop(&expire_upcalls_ch);
     callout_stop(&bw_upcalls_ch);
     callout_stop(&bw_meter_ch);
 
     MFC_LOCK();
     for (i = 0; i < MFCTBLSIZ; i++) {
 	for (rt = mfctable[i]; rt != NULL; ) {
 	    struct mfc *nr = rt->mfc_next;
 
 	    for (rte = rt->mfc_stall; rte != NULL; ) {
 		struct rtdetq *n = rte->next;
 
 		m_freem(rte->m);
 		free(rte, M_MRTABLE);
 		rte = n;
 	    }
 	    free_bw_list(rt->mfc_bw_meter);
 	    free(rt, M_MRTABLE);
 	    rt = nr;
 	}
     }
     bzero((caddr_t)mfctable, sizeof(mfctable));
     bzero((caddr_t)nexpire, sizeof(nexpire));
     bw_upcalls_n = 0;
     bzero(bw_meter_timers, sizeof(bw_meter_timers));
     MFC_UNLOCK();
 
     reg_vif_num = VIFI_INVALID;
 
     MROUTER_UNLOCK();
 
     if (mrtdebug)
 	log(LOG_DEBUG, "ip_mrouter_done\n");
 
     return 0;
 }
 
 /*
  * Set the global PIM assert processing flag.
  * Only 0 and 1 are accepted; any other value returns EINVAL and leaves
  * the flag unchanged.
  */
 static int
 set_assert(int i)
 {
     switch (i) {
     case 0:
     case 1:
 	pim_assert = i;
 	return 0;
     default:
 	return EINVAL;
     }
 }
 
 /*
  * Configure API capabilities.
  *
  * On entry *apival holds the capabilities the caller asks for.  On success
  * the subset supported by the kernel (mrt_api_support) is stored in
  * mrt_api_config and written back to *apival, and 0 is returned.  If the
  * request comes too late, *apival is set to 0 and EPERM is returned.
  */
 static int
 set_api_config(uint32_t *apival)
 {
     int i;
 
     /*
      * We can set the API capabilities only if it is the first operation
      * after MRT_INIT. I.e.:
      *  - there are no vifs installed
      *  - pim_assert is not enabled
      *  - the MFC table is empty
      */
     if (numvifs > 0) {
 	*apival = 0;
 	return EPERM;
     }
     if (pim_assert) {
 	*apival = 0;
 	return EPERM;
     }
     for (i = 0; i < MFCTBLSIZ; i++) {
 	if (mfctable[i] != NULL) {
 	    *apival = 0;
 	    return EPERM;
 	}
     }
 
     mrt_api_config = *apival & mrt_api_support;
     *apival = mrt_api_config;
 
     return 0;
 }
 
 /*
  * Add a vif to the vif table.
  *
  * Returns EINVAL for an out-of-range index or a non-zero rate limit,
  * EADDRINUSE if the slot is already occupied, EADDRNOTAVAIL if the local
  * address is INADDR_ANY or (for non-register vifs) no interface has that
  * address, EOPNOTSUPP for tunnel vifs or interfaces without IFF_MULTICAST,
  * or the error from if_allmulti().  Returns 0 on success.
  */
 static int
 add_vif(struct vifctl *vifcp)
 {
     struct vif *vifp;
     struct sockaddr_in sin = {sizeof sin, AF_INET};
     struct ifaddr *ifa;
     struct ifnet *ifp;
     int error;
 
     VIF_LOCK();
     if (vifcp->vifc_vifi >= MAXVIFS) {
 	VIF_UNLOCK();
 	return EINVAL;
     }
     /* Only form the table pointer once the index is known to be valid. */
     vifp = viftable + vifcp->vifc_vifi;
 
     /* rate limiting is no longer supported by this code */
     if (vifcp->vifc_rate_limit != 0) {
 	log(LOG_ERR, "rate limiting is no longer supported\n");
 	VIF_UNLOCK();
 	return EINVAL;
     }
     if (vifp->v_lcl_addr.s_addr != INADDR_ANY) {
 	VIF_UNLOCK();
 	return EADDRINUSE;
     }
     if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) {
 	VIF_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     /* Find the interface with an address in AF_INET family */
     if (vifcp->vifc_flags & VIFF_REGISTER) {
 	/*
 	 * XXX: Because VIFF_REGISTER does not really need a valid
 	 * local interface (e.g. it could be 127.0.0.2), we don't
 	 * check its address.
 	 */
 	ifp = NULL;
     } else {
 	sin.sin_addr = vifcp->vifc_lcl_addr;
 	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
 	if (ifa == NULL) {
 	    VIF_UNLOCK();
 	    return EADDRNOTAVAIL;
 	}
 	ifp = ifa->ifa_ifp;
     }
 
     if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
 	log(LOG_ERR, "tunnels are no longer supported\n");
 	VIF_UNLOCK();
 	return EOPNOTSUPP;
     } else if (vifcp->vifc_flags & VIFF_REGISTER) {
 	ifp = &multicast_register_if;
 	if (mrtdebug)
 	    log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
 		    (void *)&multicast_register_if);
 	/* The pseudo-interface is set up only for the first register vif. */
 	if (reg_vif_num == VIFI_INVALID) {
 	    if_initname(&multicast_register_if, "register_vif", 0);
 	    multicast_register_if.if_flags = IFF_LOOPBACK;
 	    reg_vif_num = vifcp->vifc_vifi;
 	}
     } else {		/* Make sure the interface supports multicast */
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 	    VIF_UNLOCK();
 	    return EOPNOTSUPP;
 	}
 
 	/* Enable promiscuous reception of all IP multicasts from the if */
 	error = if_allmulti(ifp, 1);
 	if (error) {
 	    VIF_UNLOCK();
 	    return error;
 	}
     }
 
     vifp->v_flags     = vifcp->vifc_flags;
     vifp->v_threshold = vifcp->vifc_threshold;
     vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
     vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
     vifp->v_ifp       = ifp;
     vifp->v_rsvp_on   = 0;
     vifp->v_rsvpd     = NULL;
     /* initialize per vif pkt counters */
     vifp->v_pkt_in    = 0;
     vifp->v_pkt_out   = 0;
     vifp->v_bytes_in  = 0;
     vifp->v_bytes_out = 0;
     bzero(&vifp->v_route, sizeof(vifp->v_route));
 
     /* Adjust numvifs up if the vifi is higher than numvifs */
     if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;
 
     VIF_UNLOCK();
 
     if (mrtdebug)
 	log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x\n",
 	    vifcp->vifc_vifi,
 	    (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr),
 	    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
 	    (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr),
 	    vifcp->vifc_threshold);
 
     return 0;
 }
 
 /*
  * Delete a vif from the vif table; the caller must hold the VIF lock.
  *
  * Returns EINVAL if `vifi' is not below numvifs and EADDRNOTAVAIL if the
  * slot is unused.  Otherwise all-multicast mode is dropped on a physical
  * interface, the register vif index is invalidated for a register vif,
  * the slot is zeroed and numvifs is shrunk past trailing unused slots.
  */
 static int
 del_vif_locked(vifi_t vifi)
 {
     struct vif *victim;
     vifi_t top;
 
     VIF_LOCK_ASSERT();
 
     if (vifi >= numvifs)
 	return EINVAL;
 
     victim = viftable + vifi;
     if (victim->v_lcl_addr.s_addr == INADDR_ANY)
 	return EADDRNOTAVAIL;
 
     if ((victim->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)) == 0)
 	if_allmulti(victim->v_ifp, 0);
 
     if ((victim->v_flags & VIFF_REGISTER) != 0)
 	reg_vif_num = VIFI_INVALID;
 
     bzero((caddr_t)victim, sizeof (*victim));
 
     if (mrtdebug)
 	log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs);
 
     /* Adjust numvifs down to one past the highest slot still in use */
     top = numvifs;
     while (top > 0 && viftable[top - 1].v_lcl_addr.s_addr == INADDR_ANY)
 	top--;
     numvifs = top;
 
     return 0;
 }
 
 /*
  * Locking wrapper around del_vif_locked(): takes the VIF lock, deletes
  * the vif and returns del_vif_locked()'s result.
  */
 static int
 del_vif(vifi_t vifi)
 {
     int error;
 
     VIF_LOCK();
     error = del_vif_locked(vifi);
     VIF_UNLOCK();
 
     return (error);
 }
 
 /*
  * Refresh the forwarding parameters of an mfc entry from `mfccp' without
  * touching its counters or its (S,G) addresses: parent vif, per-vif TTLs
  * and flags for the first numvifs vifs, and the RP address.
  */
 static void
 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
     int vif;
 
     rt->mfc_parent = mfccp->mfcc_parent;
 
     vif = 0;
     while (vif < numvifs) {
 	rt->mfc_ttls[vif] = mfccp->mfcc_ttls[vif];
 	/* Keep only flags that are both known and enabled via the API. */
 	rt->mfc_flags[vif] = mfccp->mfcc_flags[vif] & mrt_api_config &
 	    MRT_MFC_FLAGS_ALL;
 	vif++;
     }
 
     /* The RP address is honoured only when MRT_MFC_RP is configured. */
     if ((mrt_api_config & MRT_MFC_RP) == 0)
 	rt->mfc_rp.s_addr = INADDR_ANY;
     else
 	rt->mfc_rp = mfccp->mfcc_rp;
 }
 
 /*
  * Fully initialize an mfc entry from `mfccp': set the (S,G) addresses,
  * zero the per-(S,G) counters and the last-assert timestamp, and load the
  * forwarding parameters through update_mfc_params().
  */
 static void
 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
     /* identity of the entry */
     rt->mfc_origin = mfccp->mfcc_origin;
     rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
 
     /* per src-grp packet counters start from zero */
     rt->mfc_pkt_cnt = 0;
     rt->mfc_byte_cnt = 0;
     rt->mfc_wrong_if = 0;
     rt->mfc_last_assert.tv_usec = 0;
     rt->mfc_last_assert.tv_sec = 0;
 
     update_mfc_params(rt, mfccp);
 }
 
 
 /*
  * Add an mfc entry
  */
 static int
 add_mfc(struct mfcctl2 *mfccp)
 {
     struct mfc *rt;
     u_long hash;
     struct rtdetq *rte;
     u_short nstl;
 
     VIF_LOCK();
     MFC_LOCK();
 
     rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
 
     /* If an entry already exists, just update the fields */
     if (rt) {
 	if (mrtdebug & DEBUG_MFC)
 	    log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
 		(u_long)ntohl(mfccp->mfcc_origin.s_addr),
 		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		mfccp->mfcc_parent);
 
 	update_mfc_params(rt, mfccp);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return 0;
     }
 
     /*
      * Find the entry for which the upcall was made and update
      */
     hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
     for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {
 
 	if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
 		(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
 		(rt->mfc_stall != NULL)) {
 
 	    if (nstl++)
 		log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
 		    "multiple kernel entries",
 		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		    mfccp->mfcc_parent, (void *)rt->mfc_stall);
 
 	    if (mrtdebug & DEBUG_MFC)
 		log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
 		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		    mfccp->mfcc_parent, (void *)rt->mfc_stall);
 
 	    init_mfc_params(rt, mfccp);
 
 	    rt->mfc_expire = 0;	/* Don't clean this guy up */
 	    nexpire[hash]--;
 
 	    /* free packets Qed at the end of this entry */
 	    for (rte = rt->mfc_stall; rte != NULL; ) {
 		struct rtdetq *n = rte->next;
 
 		ip_mdq(rte->m, rte->ifp, rt, -1);
 		m_freem(rte->m);
 		free(rte, M_MRTABLE);
 		rte = n;
 	    }
 	    rt->mfc_stall = NULL;
 	}
     }
 
     /*
      * It is possible that an entry is being inserted without an upcall
      */
     if (nstl == 0) {
 	if (mrtdebug & DEBUG_MFC)
 	    log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
 		hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
 		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		mfccp->mfcc_parent);
 
 	for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
 	    if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
 		init_mfc_params(rt, mfccp);
 		if (rt->mfc_expire)
 		    nexpire[hash]--;
 		rt->mfc_expire = 0;
 		break; /* XXX */
 	    }
 	}
 	if (rt == NULL) {		/* no upcall, so make a new entry */
 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 	    if (rt == NULL) {
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return ENOBUFS;
 	    }
 
 	    init_mfc_params(rt, mfccp);
 	    rt->mfc_expire     = 0;
 	    rt->mfc_stall      = NULL;
 
 	    rt->mfc_bw_meter = NULL;
 	    /* insert new entry at head of hash chain */
 	    rt->mfc_next = mfctable[hash];
 	    mfctable[hash] = rt;
 	}
     }
     MFC_UNLOCK();
     VIF_UNLOCK();
     return 0;
 }
 
 /*
  * Delete an mfc entry
  */
 static int
 del_mfc(struct mfcctl2 *mfccp)
 {
     struct in_addr	origin;
     struct in_addr	mcastgrp;
     struct mfc		*rt;
     struct mfc		**nptr;
     u_long		hash;
     struct bw_meter	*list;
 
     origin = mfccp->mfcc_origin;
     mcastgrp = mfccp->mfcc_mcastgrp;
 
     if (mrtdebug & DEBUG_MFC)
 	log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
 	    (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
 
     MFC_LOCK();
 
     hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
     for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
 	if (origin.s_addr == rt->mfc_origin.s_addr &&
 		mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
 		rt->mfc_stall == NULL)
 	    break;
     if (rt == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     *nptr = rt->mfc_next;
 
     /*
      * free the bw_meter entries
      */
     list = rt->mfc_bw_meter;
     rt->mfc_bw_meter = NULL;
 
     free(rt, M_MRTABLE);
 
     free_bw_list(list);
 
     MFC_UNLOCK();
 
     return 0;
 }
 
 /*
  * Send a message to the routing daemon on the multicast routing socket.
  *
  * Appends `mm' with source address `src' to the receive buffer of `s' and
  * wakes up readers.  Returns 0 on success.  If `s' is NULL or the append
  * fails, `mm' is freed and -1 is returned.
  */
 static int
 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 {
     if (s == NULL) {
 	m_freem(mm);
 	return -1;
     }
 
     SOCKBUF_LOCK(&s->so_rcv);
     if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
 	NULL) == 0) {
 	SOCKBUF_UNLOCK(&s->so_rcv);
 	m_freem(mm);
 	return -1;
     }
     /* No explicit unlock follows the wakeup on this path. */
     sorwakeup_locked(s);
     return 0;
 }
 
 /*
  * IP multicast forwarding function. This function assumes that the packet
  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
  * pointed to by "ifp", and the packet is to be relayed to other networks
  * that have members of the packet's destination IP multicast group.
  *
  * The packet is returned unscathed to the caller, unless it is
  * erroneous, in which case a non-zero return value tells the caller to
  * discard it.
  */
 
 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 
 static int
 X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
     struct ip_moptions *imo)
 {
     struct mfc *rt;
     int error;
     vifi_t vifi;
 
     if (mrtdebug & DEBUG_FORWARD)
 	log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
 	    (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
 	    (void *)ifp);
 
     if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
 		((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
 	/*
 	 * Packet arrived via a physical interface or
 	 * an encapsulated tunnel or a register_vif.
 	 */
     } else {
 	/*
 	 * Packet arrived through a source-route tunnel.
 	 * Source-route tunnels are no longer supported.
 	 */
 	static int last_log;
 	if (last_log != time_uptime) {
 	    last_log = time_uptime;
 	    log(LOG_ERR,
 		"ip_mforward: received source-routed packet from %lx\n",
 		(u_long)ntohl(ip->ip_src.s_addr));
 	}
 	return 1;
     }
 
     VIF_LOCK();
     MFC_LOCK();
     if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
 	if (ip->ip_ttl < MAXTTL)
 	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
 	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 	    struct vif *vifp = viftable + vifi;
 
 	    printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
 		(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
 		vifi,
 		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
 		vifp->v_ifp->if_xname);
 	}
 	error = ip_mdq(m, ifp, NULL, vifi);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return error;
     }
     if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 	printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
 	    (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
 	if (!imo)
 	    printf("In fact, no options were specified at all\n");
     }
 
     /*
      * Don't forward a packet with time-to-live of zero or one,
      * or a packet destined to a local-only group.
      */
     if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return 0;
     }
 
     /*
      * Determine forwarding vifs from the forwarding cache table
      */
     ++mrtstat.mrts_mfc_lookups;
     rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);
 
     /* Entry exists, so forward if necessary */
     if (rt != NULL) {
 	error = ip_mdq(m, ifp, rt, -1);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return error;
     } else {
 	/*
 	 * If we don't have a route for packet's origin,
 	 * Make a copy of the packet & send message to routing daemon
 	 */
 
 	struct mbuf *mb0;
 	struct rtdetq *rte;
 	u_long hash;
 	int hlen = ip->ip_hl << 2;
 
 	++mrtstat.mrts_mfc_misses;
 
 	mrtstat.mrts_no_route++;
 	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 	    log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
 		(u_long)ntohl(ip->ip_src.s_addr),
 		(u_long)ntohl(ip->ip_dst.s_addr));
 
 	/*
 	 * Allocate mbufs early so that we don't do extra work if we are
 	 * just going to fail anyway.  Make sure to pullup the header so
 	 * that other people can't step on it.
 	 */
 	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
 	if (rte == NULL) {
 	    MFC_UNLOCK();
 	    VIF_UNLOCK();
 	    return ENOBUFS;
 	}
 	mb0 = m_copypacket(m, M_DONTWAIT);
 	if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
 	    mb0 = m_pullup(mb0, hlen);
 	if (mb0 == NULL) {
 	    free(rte, M_MRTABLE);
 	    MFC_UNLOCK();
 	    VIF_UNLOCK();
 	    return ENOBUFS;
 	}
 
 	/* is there an upcall waiting for this flow ? */
 	hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
 	for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
 	    if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
 		    (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
 		    (rt->mfc_stall != NULL))
 		break;
 	}
 
 	if (rt == NULL) {
 	    int i;
 	    struct igmpmsg *im;
 	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 	    struct mbuf *mm;
 
 	    /*
 	     * Locate the vifi for the incoming interface for this packet.
 	     * If none found, drop packet.
 	     */
 	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
 		;
 	    if (vifi >= numvifs)	/* vif not found, drop packet */
 		goto non_fatal;
 
 	    /* no upcall, so make a new entry */
 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 	    if (rt == NULL)
 		goto fail;
 	    /* Make a copy of the header to send to the user level process */
 	    mm = m_copy(mb0, 0, hlen);
 	    if (mm == NULL)
 		goto fail1;
 
 	    /*
 	     * Send message to routing daemon to install
 	     * a route into the kernel table
 	     */
 
 	    im = mtod(mm, struct igmpmsg *);
 	    im->im_msgtype = IGMPMSG_NOCACHE;
 	    im->im_mbz = 0;
 	    im->im_vif = vifi;
 
 	    mrtstat.mrts_upcalls++;
 
 	    k_igmpsrc.sin_addr = ip->ip_src;
 	    if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
 		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
 		++mrtstat.mrts_upq_sockfull;
 fail1:
 		free(rt, M_MRTABLE);
 fail:
 		free(rte, M_MRTABLE);
 		m_freem(mb0);
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return ENOBUFS;
 	    }
 
 	    /* insert new entry at head of hash chain */
 	    rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
 	    rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
 	    rt->mfc_expire	      = UPCALL_EXPIRE;
 	    nexpire[hash]++;
 	    for (i = 0; i < numvifs; i++) {
 		rt->mfc_ttls[i] = 0;
 		rt->mfc_flags[i] = 0;
 	    }
 	    rt->mfc_parent = -1;
 
 	    rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */
 
 	    rt->mfc_bw_meter = NULL;
 
 	    /* link into table */
 	    rt->mfc_next   = mfctable[hash];
 	    mfctable[hash] = rt;
 	    rt->mfc_stall = rte;
 
 	} else {
 	    /* determine if q has overflowed */
 	    int npkts = 0;
 	    struct rtdetq **p;
 
 	    /*
 	     * XXX ouch! we need to append to the list, but we
 	     * only have a pointer to the front, so we have to
 	     * scan the entire list every time.
 	     */
 	    for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 		npkts++;
 
 	    if (npkts > MAX_UPQ) {
 		mrtstat.mrts_upq_ovflw++;
 non_fatal:
 		free(rte, M_MRTABLE);
 		m_freem(mb0);
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return 0;
 	    }
 
 	    /* Add this entry to the end of the queue */
 	    *p = rte;
 	}
 
 	rte->m			= mb0;
 	rte->ifp		= ifp;
 	rte->next		= NULL;
 
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 
 	return 0;
     }
 }
 
 /*
  * Clean up the cache entry if upcall is not serviced
  *
  * Callout handler: walks every hash bucket that has a non-zero nexpire[]
  * count, decrements mfc_expire on stalled entries and frees those that
  * reach zero together with their queued packets and bandwidth meters.
  * Re-arms itself with EXPIRE_TIMEOUT before returning.  The argument is
  * not used.
  */
 static void
 expire_upcalls(void *unused)
 {
     struct rtdetq *rte;
     struct mfc *mfc, **nptr;
     int i;
 
     MFC_LOCK();
     for (i = 0; i < MFCTBLSIZ; i++) {
 	if (nexpire[i] == 0)
 	    continue;
 	/* nptr always points at the link that leads to the current entry. */
 	nptr = &mfctable[i];
 	for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
 	    /*
 	     * Skip real cache entries
 	     * Make sure it wasn't marked to not expire (shouldn't happen)
 	     * If it expires now
 	     */
 	    if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
 		    --mfc->mfc_expire == 0) {
 		if (mrtdebug & DEBUG_EXPIRE)
 		    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
 			(u_long)ntohl(mfc->mfc_origin.s_addr),
 			(u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
 		/*
 		 * drop all the packets
 		 * free the mbuf with the pkt, if, timing info
 		 */
 		for (rte = mfc->mfc_stall; rte; ) {
 		    struct rtdetq *n = rte->next;
 
 		    m_freem(rte->m);
 		    free(rte, M_MRTABLE);
 		    rte = n;
 		}
 		++mrtstat.mrts_cache_cleanups;
 		nexpire[i]--;
 
 		/*
 		 * free the bw_meter entries
 		 */
 		while (mfc->mfc_bw_meter != NULL) {
 		    struct bw_meter *x = mfc->mfc_bw_meter;
 
 		    mfc->mfc_bw_meter = x->bm_mfc_next;
 		    free(x, M_BWMETER);
 		}
 
 		/* Unlink; nptr stays put so the successor is visited next. */
 		*nptr = mfc->mfc_next;
 		free(mfc, M_MRTABLE);
 	    } else {
 		nptr = &mfc->mfc_next;
 	    }
 	}
     }
     MFC_UNLOCK();
 
     callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
 }
 
 /*
  * Packet forwarding routine once entry in the cache is made.
  *
  * m is the packet, ifp the interface it arrived on and rt the matching
  * forwarding cache entry.  When xmt_vif names a configured vif the packet
  * is sent on that vif only; otherwise it is checked against the parent
  * vif of rt and replicated on every vif whose ttl threshold it exceeds.
  *
  * Returns 1 after sending on an explicitly requested xmt_vif, ENOBUFS
  * when a WRONGVIF upcall could not be built or queued, and 0 otherwise.
  * The VIF lock must be held (asserted below); the MFC lock is asserted
  * on the bw_meter path.
  */
 static int
 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 {
     struct ip  *ip = mtod(m, struct ip *);
     vifi_t vifi;
     int plen = ip->ip_len;
 
     VIF_LOCK_ASSERT();
 
     /*
      * If xmt_vif is not -1, send on only the requested vif.
      *
      * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
      */
     if (xmt_vif < numvifs) {
 	if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
 		pim_register_send(ip, viftable + xmt_vif, m, rt);
 	else
 		phyint_send(ip, viftable + xmt_vif, m);
 	return 1;
     }
 
     /*
      * Don't forward if it didn't arrive from the parent vif for its origin.
      */
     vifi = rt->mfc_parent;
     if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 	/* came in the wrong interface */
 	/*
 	 * vifi may be >= numvifs in this branch, so only index viftable
 	 * for the debug message when the index is known to be valid.
 	 */
 	if (mrtdebug & DEBUG_FORWARD)
 	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 		(void *)ifp, vifi,
 		(void *)((vifi < numvifs) ? viftable[vifi].v_ifp : NULL));
 	++mrtstat.mrts_wrong_if;
 	++rt->mfc_wrong_if;
 	/*
 	 * If we are doing PIM assert processing, send a message
 	 * to the routing daemon.
 	 *
 	 * XXX: A PIM-SM router needs the WRONGVIF detection so it
 	 * can complete the SPT switch, regardless of the type
 	 * of the iif (broadcast media, GRE tunnel, etc).
 	 */
 	if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 	    struct timeval now;
 	    u_long delta;
 
 	    if (ifp == &multicast_register_if)
 		pimstat.pims_rcv_registers_wrongiif++;
 
 	    /* Get vifi for the incoming packet */
 	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
 		;
 	    if (vifi >= numvifs)
 		return 0;	/* The iif is not found: ignore the packet. */
 
 	    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
 		return 0;	/* WRONGVIF disabled: ignore the packet */
 
 	    GET_TIME(now);
 
 	    TV_DELTA(now, rt->mfc_last_assert, delta);
 
 	    /* Rate-limit: one WRONGVIF upcall per ASSERT_MSG_TIME per entry. */
 	    if (delta > ASSERT_MSG_TIME) {
 		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 		struct igmpmsg *im;
 		int hlen = ip->ip_hl << 2;
 		struct mbuf *mm = m_copy(m, 0, hlen);
 
 		/* Make sure the copied header is contiguous and writable. */
 		if (mm && (M_HASCL(mm) || mm->m_len < hlen))
 		    mm = m_pullup(mm, hlen);
 		if (mm == NULL)
 		    return ENOBUFS;
 
 		rt->mfc_last_assert = now;
 
 		/* The IP header copy is reinterpreted as the upcall message. */
 		im = mtod(mm, struct igmpmsg *);
 		im->im_msgtype	= IGMPMSG_WRONGVIF;
 		im->im_mbz		= 0;
 		im->im_vif		= vifi;
 
 		mrtstat.mrts_upcalls++;
 
 		k_igmpsrc.sin_addr = im->im_src;
 		if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
 		    log(LOG_WARNING,
 			"ip_mforward: ip_mrouter socket queue full\n");
 		    ++mrtstat.mrts_upq_sockfull;
 		    return ENOBUFS;
 		}
 	    }
 	}
 	return 0;
     }
 
     /* If I sourced this packet, it counts as output, else it was input. */
     if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
 	viftable[vifi].v_pkt_out++;
 	viftable[vifi].v_bytes_out += plen;
     } else {
 	viftable[vifi].v_pkt_in++;
 	viftable[vifi].v_bytes_in += plen;
     }
     rt->mfc_pkt_cnt++;
     rt->mfc_byte_cnt += plen;
 
     /*
      * For each vif, decide if a copy of the packet should be forwarded.
      * Forward if:
      *		- the ttl exceeds the vif's threshold
      *		- there are group members downstream on interface
      */
     for (vifi = 0; vifi < numvifs; vifi++)
 	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 	    viftable[vifi].v_pkt_out++;
 	    viftable[vifi].v_bytes_out += plen;
 	    if (viftable[vifi].v_flags & VIFF_REGISTER)
 		pim_register_send(ip, viftable + vifi, m, rt);
 	    else
 		phyint_send(ip, viftable + vifi, m);
 	}
 
     /*
      * Perform upcall-related bw measuring.
      */
     if (rt->mfc_bw_meter != NULL) {
 	struct bw_meter *x;
 	struct timeval now;
 
 	GET_TIME(now);
 	MFC_LOCK_ASSERT();
 	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 	    bw_meter_receive_packet(x, plen, &now);
     }
 
     return 0;
 }
 
 /*
  * Tell whether a vif number designates a configured vif (used by
  * ip_output).  Returns 1 when 0 <= vif < numvifs, 0 otherwise.
  */
 static int
 X_legal_vif_num(int vif)
 {
     /* XXX numvifs is read without taking a lock; does it matter? */
     if (vif < 0)
 	return 0;
     return (vif < numvifs) ? 1 : 0;
 }
 
 /*
  * Look up the local address configured on a vif.
  * Returns INADDR_ANY when vifi is not a valid vif index.
  */
 static u_long
 X_ip_mcast_src(int vifi)
 {
     /* XXX unlocked, matter? */
     if (vifi < 0 || vifi >= numvifs)
 	return INADDR_ANY;
 
     return viftable[vifi].v_lcl_addr.s_addr;
 }
 
 /*
  * Send a copy of packet m out through vif vifp.  The original mbuf
  * chain is left to the caller; if the copy cannot be made the packet
  * is silently not sent on this vif.  Requires the VIF lock.
  */
 static void
 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 {
     int iphdr_len = ip->ip_hl << 2;
     struct mbuf *pkt;
 
     VIF_LOCK_ASSERT();
 
     /*
      * Take a new reference to the packet, then force the IP header
      * to be really copied (not merely referenced) so that ip_output()
      * scribbles on the copy only.
      */
     pkt = m_copypacket(m, M_DONTWAIT);
     if (pkt == NULL)
 	return;
     if (M_HASCL(pkt) || pkt->m_len < iphdr_len) {
 	pkt = m_pullup(pkt, iphdr_len);
 	if (pkt == NULL)
 	    return;
     }
 
     send_packet(vifp, pkt);
 }
 
 static void
 send_packet(struct vif *vifp, struct mbuf *m)
 {
 	struct ip_moptions imo;
 	struct in_multi *imm[2];
 	int error;
 
 	VIF_LOCK_ASSERT();
 
 	imo.imo_multicast_ifp  = vifp->v_ifp;
 	imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
 	imo.imo_multicast_loop = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_num_memberships = 0;
 	imo.imo_max_memberships = 2;
 	imo.imo_membership  = &imm[0];
 
 	/*
 	 * Re-entrancy should not be a problem here, because
 	 * the packets that we send out and are looped back at us
 	 * should get rejected because they appear to come from
 	 * the loopback interface, thus preventing looping.
 	 */
 	error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
 	if (mrtdebug & DEBUG_XMIT) {
 	    log(LOG_DEBUG, "phyint_send on vif %td err %d\n",
 		vifp - viftable, error);
 	}
 }
 
 static int
 X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt)
 {
     int error, vifi;
 
     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
 	return EOPNOTSUPP;
 
     error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
     if (error)
 	return error;
 
     VIF_LOCK();
 
     if (vifi < 0 || vifi >= numvifs) {	/* Error if vif is invalid */
 	VIF_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     if (sopt->sopt_name == IP_RSVP_VIF_ON) {
 	/* Check if socket is available. */
 	if (viftable[vifi].v_rsvpd != NULL) {
 	    VIF_UNLOCK();
 	    return EADDRINUSE;
 	}
 
 	viftable[vifi].v_rsvpd = so;
 	/* This may seem silly, but we need to be sure we don't over-increment
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (!viftable[vifi].v_rsvp_on) {
 	    viftable[vifi].v_rsvp_on = 1;
 	    rsvp_on++;
 	}
     } else { /* must be VIF_OFF */
 	/*
 	 * XXX as an additional consistency check, one could make sure
 	 * that viftable[vifi].v_rsvpd == so, otherwise passing so as
 	 * first parameter is pretty useless.
 	 */
 	viftable[vifi].v_rsvpd = NULL;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-decrement
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (viftable[vifi].v_rsvp_on) {
 	    viftable[vifi].v_rsvp_on = 0;
 	    rsvp_on--;
 	}
     }
     VIF_UNLOCK();
     return 0;
 }
 
 static void
 X_ip_rsvp_force_done(struct socket *so)
 {
     int vifi;
 
     /* Don't bother if it is not the right type of socket. */
     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
 	return;
 
     VIF_LOCK();
 
     /* The socket may be attached to more than one vif...this
      * is perfectly legal.
      */
     for (vifi = 0; vifi < numvifs; vifi++) {
 	if (viftable[vifi].v_rsvpd == so) {
 	    viftable[vifi].v_rsvpd = NULL;
 	    /* This may seem silly, but we need to be sure we don't
 	     * over-decrement the RSVP counter, in case something slips up.
 	     */
 	    if (viftable[vifi].v_rsvp_on) {
 		viftable[vifi].v_rsvp_on = 0;
 		rsvp_on--;
 	    }
 	}
     }
 
     VIF_UNLOCK();
 }
 
 /*
  * Input handler for RSVP packets.
  *
  * Drops the packet when RSVP is off.  Otherwise the vif matching the
  * receive interface is looked up; if that vif has an RSVP socket the
  * packet is passed to it with socket_send(), else it goes to the
  * old-style socket via rip_input() when ip_rsvpd is set, or is freed.
  */
 static void
 X_rsvp_input(struct mbuf *m, int off)
 {
     int vifi;
     struct ip *ip = mtod(m, struct ip *);
     struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
     struct ifnet *ifp;
 
     if (rsvpdebug)
 	printf("rsvp_input: rsvp_on %d\n",rsvp_on);
 
     /* Can still get packets with rsvp_on = 0 if there is a local member
      * of the group to which the RSVP packet is addressed.  But in this
      * case we want to throw the packet away.
      */
     if (!rsvp_on) {
 	m_freem(m);
 	return;
     }
 
     if (rsvpdebug)
 	printf("rsvp_input: check vifs\n");
 
 #ifdef DIAGNOSTIC
     M_ASSERTPKTHDR(m);
 #endif
 
     ifp = m->m_pkthdr.rcvif;
 
     VIF_LOCK();
     /* Find which vif the packet arrived on. */
     for (vifi = 0; vifi < numvifs; vifi++)
 	if (viftable[vifi].v_ifp == ifp)
 	    break;
 
     /* vifi == numvifs means the loop above found no matching vif. */
     if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
 	/*
 	 * Drop the lock here to avoid holding it across rip_input.
 	 * This could make rsvpdebug printfs wrong.  If you care,
 	 * record the state of stuff before dropping the lock.
 	 */
 	VIF_UNLOCK();
 	/*
 	 * If the old-style non-vif-associated socket is set,
 	 * then use it.  Otherwise, drop packet since there
 	 * is no specific socket for this vif.
 	 */
 	if (ip_rsvpd != NULL) {
 	    if (rsvpdebug)
 		printf("rsvp_input: Sending packet up old-style socket\n");
 	    rip_input(m, off);  /* xxx */
 	} else {
 	    /* viftable is read below without the lock (debug output only). */
 	    if (rsvpdebug && vifi == numvifs)
 		printf("rsvp_input: Can't find vif for packet.\n");
 	    else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
 		printf("rsvp_input: No socket defined for vif %d\n",vifi);
 	    m_freem(m);
 	}
 	return;
     }
     rsvp_src.sin_addr = ip->ip_src;
 
     if (rsvpdebug && m)
 	printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
 	       m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));
 
     /* The VIF lock is still held across socket_send() on this path. */
     if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
 	if (rsvpdebug)
 	    printf("rsvp_input: Failed to append to socket\n");
     } else {
 	if (rsvpdebug)
 	    printf("rsvp_input: send packet up\n");
     }
     VIF_UNLOCK();
 }
 
 /*
  * Code for bandwidth monitors
  */
 
 /*
  * Define common interface for timeval-related methods.
  * Each macro is a thin alias for the corresponding timeval*() helper,
  * so the bandwidth-meter code has a single place to change them.
  */
 /* Compare *tvp with *uvp using the relational operator cmp. */
 #define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
 /* Forwards to timevalsub(vvp, uvp). */
 #define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
 /* Forwards to timevaladd(vvp, uvp). */
 #define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
 
 /*
  * Translate the BW_UPCALL_* unit and comparison bits of a request into
  * the matching BW_METER_* bits.  Other request bits are ignored.
  */
 static uint32_t
 compute_bw_meter_flags(struct bw_upcall *req)
 {
     uint32_t meter_bits = 0;
 
     /* Measurement unit. */
     meter_bits |= (req->bu_flags & BW_UPCALL_UNIT_PACKETS) ?
 	BW_METER_UNIT_PACKETS : 0;
     meter_bits |= (req->bu_flags & BW_UPCALL_UNIT_BYTES) ?
 	BW_METER_UNIT_BYTES : 0;
 
     /* Comparison direction. */
     meter_bits |= (req->bu_flags & BW_UPCALL_GEQ) ? BW_METER_GEQ : 0;
     meter_bits |= (req->bu_flags & BW_UPCALL_LEQ) ? BW_METER_LEQ : 0;
 
     return meter_bits;
 }
 
 /*
  * Add a bw_meter entry
  */
 static int
 add_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
     struct timeval now;
     struct bw_meter *x;
     uint32_t flags;
 
     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     /* Test if the flags are valid */
     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 	return EINVAL;
     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 	return EINVAL;
     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	return EINVAL;
 
     /* Test if the threshold time interval is valid */
     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 	return EINVAL;
 
     flags = compute_bw_meter_flags(req);
 
     /*
      * Find if we have already same bw_meter entry
      */
     MFC_LOCK();
     mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
     if (mfc == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 	if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			   &req->bu_threshold.b_time, ==)) &&
 	    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 	    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 	    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 	    MFC_UNLOCK();
 	    return 0;		/* XXX Already installed */
 	}
     }
 
     /* Allocate the new bw_meter entry */
     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
     if (x == NULL) {
 	MFC_UNLOCK();
 	return ENOBUFS;
     }
 
     /* Set the new bw_meter entry */
     x->bm_threshold.b_time = req->bu_threshold.b_time;
     GET_TIME(now);
     x->bm_start_time = now;
     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
     x->bm_measured.b_packets = 0;
     x->bm_measured.b_bytes = 0;
     x->bm_flags = flags;
     x->bm_time_next = NULL;
     x->bm_time_hash = BW_METER_BUCKETS;
 
     /* Add the new bw_meter entry to the front of entries for this MFC */
     x->bm_mfc = mfc;
     x->bm_mfc_next = mfc->mfc_bw_meter;
     mfc->mfc_bw_meter = x;
     schedule_bw_meter(x, &now);
     MFC_UNLOCK();
 
     return 0;
 }
 
 /*
  * Unschedule and free every bw_meter on a list linked through
  * bm_mfc_next.  A NULL list is a no-op.
  */
 static void
 free_bw_list(struct bw_meter *list)
 {
     struct bw_meter *cur, *following;
 
     for (cur = list; cur != NULL; cur = following) {
 	/* Fetch the successor before cur is released. */
 	following = cur->bm_mfc_next;
 	unschedule_bw_meter(cur);
 	free(cur, M_BWMETER);
     }
 }
 
 /*
  * Delete one bw_meter entry, or all of them when BW_UPCALL_DELETE_ALL
  * is set, from the MFC entry selected by req.
  *
  * Returns EOPNOTSUPP when the bandwidth-upcall API is not enabled,
  * EADDRNOTAVAIL when no MFC entry matches, EINVAL when no bw_meter
  * matches the request, and 0 on success.
  */
 static int
 del_bw_upcall(struct bw_upcall *req)
 {
     struct bw_meter *victim, *before;
     struct mfc *mfc;
     uint32_t flags;
 
     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     MFC_LOCK();
 
     /* Find the corresponding MFC entry */
     mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
     if (mfc == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 	/* Detach the whole list first, then tear it down. */
 	struct bw_meter *all = mfc->mfc_bw_meter;
 
 	mfc->mfc_bw_meter = NULL;
 	free_bw_list(all);
 	MFC_UNLOCK();
 	return 0;
     }
 
     /* Single deletion: locate the entry matching the request. */
     flags = compute_bw_meter_flags(req);
     before = NULL;
     victim = mfc->mfc_bw_meter;
     while (victim != NULL) {
 	if ((BW_TIMEVALCMP(&victim->bm_threshold.b_time,
 			   &req->bu_threshold.b_time, ==)) &&
 	    (victim->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 	    (victim->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 	    (victim->bm_flags & BW_METER_USER_FLAGS) == flags)
 	    break;
 	before = victim;
 	victim = victim->bm_mfc_next;
     }
 
     if (victim == NULL) {
 	MFC_UNLOCK();
 	return EINVAL;
     }
 
     /* Unlink from the MFC's list: middle of the list or new head. */
     if (before != NULL)
 	before->bm_mfc_next = victim->bm_mfc_next;
     else
 	victim->bm_mfc->mfc_bw_meter = victim->bm_mfc_next;
 
     unschedule_bw_meter(victim);
     MFC_UNLOCK();
 
     /* The entry is unreachable now; free it outside the lock. */
     free(victim, M_BWMETER);
     return 0;
 }
 
 /*
  * Perform bandwidth measurement processing that may result in an upcall.
  *
  * Accounts one packet of plen bytes, seen at time *nowp, against meter x.
  * The handling depends on whether x is a ">=" (BW_METER_GEQ) or a "<="
  * (BW_METER_LEQ) entry; an entry with neither flag is left untouched.
  * The MFC lock must be held (asserted below).
  */
 static void
 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 {
     struct timeval delta;
 
     MFC_LOCK_ASSERT();
 
     /* delta = time elapsed since the current measuring interval began. */
     delta = *nowp;
     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
     if (x->bm_flags & BW_METER_GEQ) {
 	/*
 	 * Processing for ">=" type of bw_meter entry
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /* Reset the bw_meter entry */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should deliver an upcall
 	 */
 	/* At most one upcall per interval: the DELIVERED flag gates it. */
 	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, nowp);
 		x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 	    }
 	}
     } else if (x->bm_flags & BW_METER_LEQ) {
 	/*
 	 * Processing for "<=" type of bw_meter entry
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /*
 	     * We are behind time with the multicast forwarding table
 	     * scanning for "<=" type of bw_meter entries, so test now
 	     * if we should deliver an upcall.
 	     */
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, nowp);
 	    }
 	    /* Reschedule the bw_meter entry */
 	    unschedule_bw_meter(x);
 	    schedule_bw_meter(x, nowp);
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should restart the measuring interval
 	 */
 	/* Still at or below a threshold: the interval keeps running. */
 	if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 	     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 	    (x->bm_flags & BW_METER_UNIT_BYTES &&
 	     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 	    /* Don't restart the measuring interval */
 	} else {
 	    /* Do restart the measuring interval */
 	    /*
 	     * XXX: note that we don't unschedule and schedule, because this
 	     * might be too much overhead per packet. Instead, when we process
 	     * all entries for a given timer hash bin, we check whether it is
 	     * really a timeout. If not, we reschedule at that time.
 	     */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
     }
 }
 
 /*
  * Prepare a bandwidth-related upcall
  */
 static void
 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 {
     struct timeval delta;
     struct bw_upcall *u;
 
     MFC_LOCK_ASSERT();
 
     /*
      * Compute the measured time interval
      */
     delta = *nowp;
     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
     /*
      * If there are too many pending upcalls, deliver them now
      */
     if (bw_upcalls_n >= BW_UPCALLS_MAX)
 	bw_upcalls_send();
 
     /*
      * Set the bw_upcall entry
      */
     u = &bw_upcalls[bw_upcalls_n++];
     u->bu_src = x->bm_mfc->mfc_origin;
     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
     u->bu_threshold.b_time = x->bm_threshold.b_time;
     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
     u->bu_measured.b_time = delta;
     u->bu_measured.b_packets = x->bm_measured.b_packets;
     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
     u->bu_flags = 0;
     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
     if (x->bm_flags & BW_METER_UNIT_BYTES)
 	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
     if (x->bm_flags & BW_METER_GEQ)
 	u->bu_flags |= BW_UPCALL_GEQ;
     if (x->bm_flags & BW_METER_LEQ)
 	u->bu_flags |= BW_UPCALL_LEQ;
 }
 
 /*
  * Send the pending bandwidth-related upcalls
  */
 static void
 bw_upcalls_send(void)
 {
     struct mbuf *m;
     int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
     static struct igmpmsg igmpmsg = { 0,		/* unused1 */
 				      0,		/* unused2 */
 				      IGMPMSG_BW_UPCALL,/* im_msgtype */
 				      0,		/* im_mbz  */
 				      0,		/* im_vif  */
 				      0,		/* unused3 */
 				      { 0 },		/* im_src  */
 				      { 0 } };		/* im_dst  */
 
     MFC_LOCK_ASSERT();
 
     if (bw_upcalls_n == 0)
 	return;			/* No pending upcalls */
 
     bw_upcalls_n = 0;
 
     /*
      * Allocate a new mbuf, initialize it with the header and
      * the payload for the pending calls.
      */
     MGETHDR(m, M_DONTWAIT, MT_DATA);
     if (m == NULL) {
 	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 	return;
     }
 
     m->m_len = m->m_pkthdr.len = 0;
     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
     m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);
 
     /*
      * Send the upcalls
      * XXX do we need to set the address in k_igmpsrc ?
      */
     mrtstat.mrts_upcalls++;
     if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 	++mrtstat.mrts_upq_sockfull;
     }
 }
 
 /*
  * Compute the timeout hash value for the bw_meter entries.
  *
  * Stores in (hash) the bucket index for (bw_meter): the second at which
  * bm_start_time + bm_threshold.b_time falls, rounded up when there is a
  * fractional (usec) part, taken modulo BW_METER_BUCKETS.
  * Both arguments are evaluated more than once.
  */
 #define	BW_METER_TIMEHASH(bw_meter, hash)				\
     do {								\
 	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
 									\
 	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 	(hash) = next_timeval.tv_sec;					\
 	if (next_timeval.tv_usec)					\
 	    (hash)++; /* XXX: make sure we don't timeout early */	\
 	(hash) %= BW_METER_BUCKETS;					\
     } while (0)
 
 /*
  * Arm the periodic processing of a "<=" bw_meter entry: restart its
  * measuring interval at *nowp and link it at the head of the timer
  * bucket its deadline hashes to.  Entries of any other type are left
  * alone.  Requires the MFC lock.
  */
 static void
 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 {
     int bucket;
 
     MFC_LOCK_ASSERT();
 
     /* XXX: we schedule timers only for "<=" entries */
     if (!(x->bm_flags & BW_METER_LEQ))
 	return;
 
     /* Start a fresh interval with cleared counters. */
     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
     x->bm_measured.b_packets = 0;
     x->bm_measured.b_bytes = 0;
     x->bm_start_time = *nowp;
 
     /* Hash the new deadline and push the entry onto that bucket. */
     BW_METER_TIMEHASH(x, bucket);
     x->bm_time_hash = bucket;
     x->bm_time_next = bw_meter_timers[bucket];
     bw_meter_timers[bucket] = x;
 }
 
 /*
  * Disarm the periodic processing of a "<=" bw_meter entry by unlinking
  * it from the timer bucket recorded in bm_time_hash.  Entries of any
  * other type, and entries that are not currently scheduled, are left
  * alone.  Panics if the entry is not on the bucket it claims to be on.
  * Requires the MFC lock.
  */
 static void
 unschedule_bw_meter(struct bw_meter *x)
 {
     struct bw_meter **linkp;
     int bucket;
 
     MFC_LOCK_ASSERT();
 
     /* XXX: we schedule timers only for "<=" entries */
     if (!(x->bm_flags & BW_METER_LEQ))
 	return;
 
     /* A hash value outside the table marks "not scheduled". */
     bucket = x->bm_time_hash;
     if (bucket >= BW_METER_BUCKETS)
 	return;
 
     /* Walk the links until the one that points at x. */
     linkp = &bw_meter_timers[bucket];
     while (*linkp != NULL && *linkp != x)
 	linkp = &(*linkp)->bm_time_next;
 
     if (*linkp == NULL)
 	panic("unschedule_bw_meter: bw_meter entry not found");
 
     /* Splice x out and mark it as unscheduled. */
     *linkp = x->bm_time_next;
     x->bm_time_next = NULL;
     x->bm_time_hash = BW_METER_BUCKETS;
 }
 
 
 /*
  * Process all "<=" type of bw_meter that should be processed now,
  * and for each entry prepare an upcall if necessary. Each processed
  * entry is rescheduled again for the (periodic) processing.
  *
  * This is run periodically (once per second normally). On each round,
  * all the potentially matching entries are in the hash slot that we are
  * looking at.
  *
  * Takes and releases the MFC lock itself.  Does nothing when called
  * again within the same second as the previous run.
  */
 static void
 bw_meter_process(void)
 {
     static uint32_t last_tv_sec;	/* last time we processed this */
 
     uint32_t loops;
     int i;
     struct timeval now, process_endtime;
 
     GET_TIME(now);
     if (last_tv_sec == now.tv_sec)
 	return;		/* nothing to do */
 
     /* One bucket per elapsed second, but never more than a full cycle. */
     loops = now.tv_sec - last_tv_sec;
     last_tv_sec = now.tv_sec;
     if (loops > BW_METER_BUCKETS)
 	loops = BW_METER_BUCKETS;
 
     MFC_LOCK();
     /*
      * Process all bins of bw_meter entries from the one after the last
      * processed to the current one. On entry, i points to the last bucket
      * visited, so we need to increment i at the beginning of the loop.
      */
     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 	struct bw_meter *x, *tmp_list;
 
 	if (++i >= BW_METER_BUCKETS)
 	    i = 0;
 
 	/* Disconnect the list of bw_meter entries from the bin */
 	tmp_list = bw_meter_timers[i];
 	bw_meter_timers[i] = NULL;
 
 	/* Process the list of bw_meter entries */
 	while (tmp_list != NULL) {
 	    x = tmp_list;
 	    tmp_list = tmp_list->bm_time_next;
 
 	    /* Test if the time interval is over */
 	    process_endtime = x->bm_start_time;
 	    BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 	    if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 		/* Not yet: reschedule, but don't reset */
 		int time_hash;
 
 		BW_METER_TIMEHASH(x, time_hash);
 		if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 		    /*
 		     * XXX: somehow the bin processing is a bit ahead of time.
 		     * Put the entry in the next bin.
 		     */
 		    if (++time_hash >= BW_METER_BUCKETS)
 			time_hash = 0;
 		}
 		/* Relink by hand: schedule_bw_meter() would reset counters. */
 		x->bm_time_next = bw_meter_timers[time_hash];
 		bw_meter_timers[time_hash] = x;
 		x->bm_time_hash = time_hash;
 
 		continue;
 	    }
 
 	    /*
 	     * Test if we should deliver an upcall
 	     */
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, &now);
 	    }
 
 	    /*
 	     * Reschedule for next processing
 	     */
 	    schedule_bw_meter(x, &now);
 	}
     }
 
     /* Send all upcalls that are pending delivery */
     bw_upcalls_send();
 
     MFC_UNLOCK();
 }
 
 /*
  * Callout handler: flush the pending bandwidth upcalls under the MFC
  * lock, then re-arm itself on bw_upcalls_ch after BW_UPCALLS_PERIOD.
  */
 static void
 expire_bw_upcalls_send(void *unused)
 {
     /* Deliver whatever is queued. */
     MFC_LOCK();
     bw_upcalls_send();
     MFC_UNLOCK();
 
     /* Schedule the next run. */
     callout_reset(&bw_upcalls_ch,
 	BW_UPCALLS_PERIOD,
 	expire_bw_upcalls_send,
 	NULL);
 }
 
 /*
  * Callout handler: run the periodic scan of "<=" bw_meter entries,
  * but only while the bandwidth-upcall API is enabled, then re-arm
  * itself on bw_meter_ch after BW_METER_PERIOD.
  */
 static void
 expire_bw_meter_process(void *unused)
 {
     if ((mrt_api_config & MRT_MFC_BW_UPCALL) != 0) {
 	bw_meter_process();
     }
 
     /* Always re-arm, whether or not the scan ran. */
     callout_reset(&bw_meter_ch,
 	BW_METER_PERIOD,
 	expire_bw_meter_process,
 	NULL);
 }
 
 /*
  * End of bandwidth monitoring code
  */
 
 /*
  * Deliver a data packet on a register vif: either as an upcall to the
  * user daemon or, when a rendezvous point is configured for rt,
  * encapsulated in a PIM Register sent by the kernel.
  *
  * Returns ENOBUFS when the packet copy cannot be prepared, 0 otherwise
  * (including when the notification is deliberately squelched).
  */
 static int
 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
     struct mfc *rt)
 {
     struct mbuf *remaining, *frag;
 
     if (mrtdebug & DEBUG_PIM)
 	log(LOG_DEBUG, "pim_register_send: ");
 
     /*
      * Do not send IGMP_WHOLEPKT notifications to userland, if the
      * rendezvous point was unspecified, and we were told not to.
      */
     if (pim_squelch_wholepkt != 0 && (mrt_api_config & MRT_MFC_RP) &&
 	(rt->mfc_rp.s_addr == INADDR_ANY))
 	return 0;
 
     remaining = pim_register_prepare(ip, m);
     if (remaining == NULL)
 	return ENOBUFS;
 
     /*
      * Walk the m_nextpkt chain of fragments.  Each fragment is detached
      * and handed to the sending machinery, which frees its mbuf.
      */
     while (remaining != NULL) {
 	frag = remaining;
 	remaining = frag->m_nextpkt;
 	frag->m_nextpkt = 0;
 
 	/* Make the IP header contiguous; skip the fragment on failure. */
 	frag = m_pullup(frag, sizeof(struct ip));
 	if (frag == NULL)
 	    continue;
 
 	ip = mtod(frag, struct ip *);
 	if ((mrt_api_config & MRT_MFC_RP) &&
 	    (rt->mfc_rp.s_addr != INADDR_ANY))
 	    pim_register_send_rp(ip, vifp, frag, rt);
 	else
 	    pim_register_send_upcall(ip, vifp, frag, rt);
     }
 
     return 0;
 }
 
 /*
  * Build the copy of a data packet that will go inside a PIM Register.
  * The copy has its ttl decremented and is either given a valid IP
  * header or, when too large for the encapsulation, fragmented.
  * XXX: Note that in the returned copy the IP header is a valid one.
  *
  * Returns the copy, or NULL when copying or fragmenting fails.
  */
 static struct mbuf *
 pim_register_prepare(struct ip *ip, struct mbuf *m)
 {
     struct mbuf *copy = NULL;
     int encap_mtu;
 
     /* Finish any delayed data checksum on the original first. */
     if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 	in_delayed_cksum(m);
 	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
     }
 
     /*
      * Duplicate the packet and pull its IP header up into the new
      * mbuf so that the header can be modified.
      */
     copy = m_copypacket(m, M_DONTWAIT);
     if (copy == NULL)
 	return NULL;
     copy = m_pullup(copy, ip->ip_hl << 2);
     if (copy == NULL)
 	return NULL;
 
     /* From here on ip refers to the header of the copy. */
     ip = mtod(copy, struct ip *);
     --ip->ip_ttl;
 
     /* Largest inner packet that still fits after encapsulation. */
     encap_mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 
     if (ip->ip_len > encap_mtu) {
 	/* Too big: fragment the packet */
 	if (ip_fragment(ip, &copy, encap_mtu, 0, CSUM_DELAY_IP) != 0) {
 	    m_freem(copy);
 	    return NULL;
 	}
     } else {
 	/* Fits: turn the IP header into a valid one */
 	ip->ip_len = htons(ip->ip_len);
 	ip->ip_off = htons(ip->ip_off);
 	ip->ip_sum = 0;
 	ip->ip_sum = in_cksum(copy, ip->ip_hl << 2);
     }
 
     return copy;
 }
 
 /*
  * Send an upcall with the data packet to the user-level process.
  */
 static int
 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
     struct mbuf *mb_copy, struct mfc *rt)
 {
     struct mbuf *mb_first;
     int len = ntohs(ip->ip_len);
     struct igmpmsg *im;
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 
     VIF_LOCK_ASSERT();
 
     /*
      * Add a new mbuf with an upcall header
      */
     MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
     mb_first->m_len = sizeof(struct igmpmsg);
     mb_first->m_next = mb_copy;
 
     /* Send message to routing daemon */
     im = mtod(mb_first, struct igmpmsg *);
     im->im_msgtype	= IGMPMSG_WHOLEPKT;
     im->im_mbz		= 0;
     im->im_vif		= vifp - viftable;
     im->im_src		= ip->ip_src;
     im->im_dst		= ip->ip_dst;
 
     k_igmpsrc.sin_addr	= ip->ip_src;
 
     mrtstat.mrts_upcalls++;
 
     if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 	if (mrtdebug & DEBUG_PIM)
 	    log(LOG_WARNING,
 		"mcast: pim_register_send_upcall: ip_mrouter socket queue full");
 	++mrtstat.mrts_upq_sockfull;
 	return ENOBUFS;
     }
 
     /* Keep statistics */
     pimstat.pims_snd_registers_msgs++;
     pimstat.pims_snd_registers_bytes += len;
 
     return 0;
 }
 
 /*
  * Encapsulate the data packet in PIM Register message and send it to the RP.
  *
  * ip is the (inner) IP header of mb_copy, vifp the vif to send on and rt
  * the forwarding entry supplying the RP address and the parent vif.
  * Returns EADDRNOTAVAIL when the parent vif is invalid or has no local
  * address, ENOBUFS when the header mbuf cannot be allocated (mb_copy is
  * freed in both cases), and 0 otherwise.  Requires the VIF lock.
  */
 static int
 pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
     struct mfc *rt)
 {
     struct mbuf *mb_first;
     struct ip *ip_outer;
     struct pim_encap_pimhdr *pimhdr;
     int len = ntohs(ip->ip_len);
     vifi_t vifi = rt->mfc_parent;
 
     VIF_LOCK_ASSERT();
 
     if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) {
 	m_freem(mb_copy);
 	return EADDRNOTAVAIL;		/* The iif vif is invalid */
     }
 
     /*
      * Add a new mbuf with the encapsulating header
      */
     MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
     mb_first->m_next = mb_copy;
 
     mb_first->m_pkthdr.len = len + mb_first->m_len;
 
     /*
      * Fill in the encapsulating IP and PIM header
      */
     /* Start from the header templates and patch the per-packet fields. */
     ip_outer = mtod(mb_first, struct ip *);
     *ip_outer = pim_encap_iphdr;
     ip_outer->ip_id = ip_newid();
     ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
     ip_outer->ip_src = viftable[vifi].v_lcl_addr;
     ip_outer->ip_dst = rt->mfc_rp;
     /*
      * Copy the inner header TOS to the outer header, and take care of the
      * IP_DF bit.
      */
     ip_outer->ip_tos = ip->ip_tos;
     if (ntohs(ip->ip_off) & IP_DF)
 	ip_outer->ip_off |= IP_DF;
     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 					 + sizeof(pim_encap_iphdr));
     *pimhdr = pim_encap_pimhdr;
     /* If the iif crosses a border, set the Border-bit */
     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 
     /*
      * Temporarily advance m_data past the outer IP header so that
      * in_cksum() covers only the PIM header, then restore it.
      */
     mb_first->m_data += sizeof(pim_encap_iphdr);
     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
     mb_first->m_data -= sizeof(pim_encap_iphdr);
 
     send_packet(vifp, mb_first);
 
     /* Keep statistics */
     pimstat.pims_snd_registers_msgs++;
     pimstat.pims_snd_registers_bytes += len;
 
     return 0;
 }
 
 /*
  * Encapsulation match callback registered through encap_attach_func();
  * it lets the encap[46]_input() path decide at runtime whether a packet
  * belongs to PIM, so PIM can be loaded into the kernel dynamically.
  *
  * Returns 64 to claim a datagram whose protocol is IPPROTO_PIM and 0 to
  * reject anything else.  m, off and arg are not examined.
  */
 static int
 pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
     int match;
 
 #ifdef DIAGNOSTIC
     KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
 #endif
     /* 64 claims the datagram, 0 rejects it as not ours. */
     match = (proto == IPPROTO_PIM) ? 64 : 0;
 
     return match;
 }
 
 /*
  * PIM-SMv2 and PIM-DM messages processing.
  * Receives and verifies the PIM control messages, and passes them
  * up to the listening socket, using rip_input().
  * The only message with special processing is the PIM_REGISTER message
  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
  * is passed to if_simloop().
  *
  * m   - the received packet; m_data is read as the outer IP header.
  * off - used as the length of the outer IP header (iphlen).
  *
  * The mbuf is always consumed: it is freed on every validation failure,
  * or handed on to if_simloop() and/or rip_input().
  */
 void
 pim_input(struct mbuf *m, int off)
 {
     struct ip *ip = mtod(m, struct ip *);
     struct pim *pim;
     int minlen;
     /*
      * NOTE(review): datalen is compared directly against PIM_MINLEN and
      * used as the checksum length from the PIM header on, so ip_len is
      * presumably host-order and already excludes the IP header here --
      * confirm against the caller.
      */
     int datalen = ip->ip_len;
     int ip_tos;
     int iphlen = off;
 
     /* Keep statistics */
     pimstat.pims_rcv_total_msgs++;
     pimstat.pims_rcv_total_bytes += datalen;
 
     /*
      * Validate lengths
      */
     if (datalen < PIM_MINLEN) {
 	pimstat.pims_rcv_tooshort++;
 	log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 	    datalen, (u_long)ip->ip_src.s_addr);
 	m_freem(m);
 	return;
     }
 
     /*
      * If the packet is at least as big as a REGISTER, go ahead
      * and grab the PIM REGISTER header size, to avoid another
      * possible m_pullup() later.
      *
      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
      */
     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
     /*
      * Get the IP and PIM headers in contiguous memory, and
      * possibly the PIM REGISTER header.
      * No m_freem() on failure: m has been overwritten with the NULL
      * result of m_pullup() by then.
      */
     if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 	(m = m_pullup(m, minlen)) == 0) {
 	log(LOG_ERR, "pim_input: m_pullup failure\n");
 	return;
     }
     /* m_pullup() may have given us a new mbuf so reset ip. */
     ip = mtod(m, struct ip *);
     /* Remember the outer TOS; it is copied into the inner header below. */
     ip_tos = ip->ip_tos;
 
     /* adjust mbuf to point to the PIM header */
     m->m_data += iphlen;
     m->m_len  -= iphlen;
     pim = mtod(m, struct pim *);
 
     /*
      * Validate checksum. If PIM REGISTER, exclude the data packet.
      *
      * XXX: some older PIMv2 implementations don't make this distinction,
      * so for compatibility reason perform the checksum over part of the
      * message, and if error, then over the whole message.
      */
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 	/* do nothing, checksum okay */
     } else if (in_cksum(m, datalen)) {
 	pimstat.pims_rcv_badsum++;
 	if (mrtdebug & DEBUG_PIM)
 	    log(LOG_DEBUG, "pim_input: invalid checksum");
 	m_freem(m);
 	return;
     }
 
     /* PIM version check */
     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 	pimstat.pims_rcv_badversion++;
 	log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 	    PIM_VT_V(pim->pim_vt), PIM_VERSION);
 	m_freem(m);
 	return;
     }
 
     /* restore mbuf back to the outer IP */
     m->m_data -= iphlen;
     m->m_len  += iphlen;
 
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 	/*
 	 * Since this is a REGISTER, we'll make a copy of the register
 	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 	 * routing daemon.
 	 */
 	/* dst only supplies the address family handed to if_simloop(). */
 	struct sockaddr_in dst = { sizeof(dst), AF_INET };
 	struct mbuf *mcp;
 	struct ip *encap_ip;
 	u_int32_t *reghdr;
 	struct ifnet *vifp;
 
 	/* Look up the interface of the register vif under the vif lock. */
 	VIF_LOCK();
 	if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 	    VIF_UNLOCK();
 	    if (mrtdebug & DEBUG_PIM)
 		log(LOG_DEBUG,
 		    "pim_input: register vif not set: %d\n", reg_vif_num);
 	    m_freem(m);
 	    return;
 	}
 	/* XXX need refcnt? */
 	vifp = viftable[reg_vif_num].v_ifp;
 	VIF_UNLOCK();
 
 	/*
 	 * Validate length
 	 */
 	if (datalen < PIM_REG_MINLEN) {
 	    pimstat.pims_rcv_tooshort++;
 	    pimstat.pims_rcv_badregisters++;
 	    log(LOG_ERR,
 		"pim_input: register packet size too small %d from %lx\n",
 		datalen, (u_long)ip->ip_src.s_addr);
 	    m_freem(m);
 	    return;
 	}
 
 	/* Register header follows the PIM header; inner IP header follows it. */
 	reghdr = (u_int32_t *)(pim + 1);
 	encap_ip = (struct ip *)(reghdr + 1);
 
 	if (mrtdebug & DEBUG_PIM) {
 	    log(LOG_DEBUG,
 		"pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
 		(u_long)ntohl(encap_ip->ip_src.s_addr),
 		(u_long)ntohl(encap_ip->ip_dst.s_addr),
 		ntohs(encap_ip->ip_len));
 	}
 
 	/* verify the version number of the inner packet */
 	if (encap_ip->ip_v != IPVERSION) {
 	    pimstat.pims_rcv_badregisters++;
 	    if (mrtdebug & DEBUG_PIM) {
 		log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
 		    "of the inner packet\n", encap_ip->ip_v);
 	    }
 	    m_freem(m);
 	    return;
 	}
 
 	/* verify the inner packet is destined to a mcast group */
 	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
 	    pimstat.pims_rcv_badregisters++;
 	    if (mrtdebug & DEBUG_PIM)
 		log(LOG_DEBUG,
 		    "pim_input: inner packet of register is not "
 		    "multicast %lx\n",
 		    (u_long)ntohl(encap_ip->ip_dst.s_addr));
 	    m_freem(m);
 	    return;
 	}
 
 	/* If a NULL_REGISTER, pass it to the daemon */
 	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 	    goto pim_input_to_daemon;
 
 	/*
 	 * Copy the TOS from the outer IP header to the inner IP header.
 	 */
 	if (encap_ip->ip_tos != ip_tos) {
 	    /* Outer TOS -> inner TOS */
 	    encap_ip->ip_tos = ip_tos;
 	    /* Recompute the inner header checksum. Sigh... */
 
 	    /* adjust mbuf to point to the inner IP header */
 	    m->m_data += (iphlen + PIM_MINLEN);
 	    m->m_len  -= (iphlen + PIM_MINLEN);
 
 	    encap_ip->ip_sum = 0;
 	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 
 	    /* restore mbuf to point back to the outer IP header */
 	    m->m_data -= (iphlen + PIM_MINLEN);
 	    m->m_len  += (iphlen + PIM_MINLEN);
 	}
 
 	/*
 	 * Decapsulate the inner IP packet and loopback to forward it
 	 * as a normal multicast packet. Also, make a copy of the
 	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 	 * to pass to the daemon later, so it can take the appropriate
 	 * actions (e.g., send back PIM_REGISTER_STOP).
 	 * XXX: here m->m_data points to the outer IP header.
 	 */
 	mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
 	if (mcp == NULL) {
 	    log(LOG_ERR,
 		"pim_input: pim register: could not copy register head\n");
 	    m_freem(m);
 	    return;
 	}
 
 	/* Keep statistics */
 	/* XXX: registers_bytes include only the encap. mcast pkt */
 	pimstat.pims_rcv_registers_msgs++;
 	pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 
 	/*
 	 * forward the inner ip packet; point m_data at the inner ip.
 	 */
 	m_adj(m, iphlen + PIM_MINLEN);
 
 	if (mrtdebug & DEBUG_PIM) {
 	    log(LOG_DEBUG,
 		"pim_input: forwarding decapsulated register: "
 		"src %lx, dst %lx, vif %d\n",
 		(u_long)ntohl(encap_ip->ip_src.s_addr),
 		(u_long)ntohl(encap_ip->ip_dst.s_addr),
 		reg_vif_num);
 	}
 	/* NB: vifp was collected above; can it change on us? */
 	if_simloop(vifp, m, dst.sin_family, 0);
 
 	/* prepare the register head to send to the mrouting daemon */
 	m = mcp;
     }
 
 pim_input_to_daemon:
     /*
      * Pass the PIM message up to the daemon; if it is a Register message,
      * pass the 'head' only up to the daemon. This includes the
      * outer IP header, PIM header, PIM-Register header and the
      * inner IP header.
      * XXX: the outer IP header pkt size of a Register is not adjusted to
      * reflect the fact that the inner multicast data is truncated.
      */
     rip_input(m, iphlen);
 
     return;
 }
 
 /*
  * Module event handler for ip_mroute.
  *
  * MOD_LOAD sets up the locks, resets the router state, attaches the PIM
  * encapsulation handlers and installs the X_* entry points into the
  * kernel's multicast hook pointers.  MOD_UNLOAD refuses with EINVAL while
  * a multicast router is still registered, and otherwise undoes all of the
  * above.  Any other event type yields EOPNOTSUPP.
  *
  * XXX: This is common code for dealing with initialization for both
  * the IPv4 and IPv6 multicast forwarding paths. It could do with cleanup.
  */
 static int
 ip_mroute_modevent(module_t mod, int type, void *unused)
 {
     if (type == MOD_LOAD) {
 	MROUTER_LOCK_INIT();
 	MFC_LOCK_INIT();
 	VIF_LOCK_INIT();
 	ip_mrouter_reset();
 	TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
 	    &pim_squelch_wholepkt);
 
 	/* Hook PIM into the IPv4 encapsulation input path. */
 	pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM,
 	    pim_encapcheck, &in_pim_protosw, NULL);
 	if (pim_encap_cookie == NULL) {
 	    printf("ip_mroute: unable to attach pim encap\n");
 	    goto load_failed;
 	}
 
 #ifdef INET6
 	/* Likewise for IPv6; back out the IPv4 attachment on failure. */
 	pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM,
 	    pim_encapcheck, (struct protosw *)&in6_pim_protosw, NULL);
 	if (pim6_encap_cookie == NULL) {
 	    printf("ip_mroute: unable to attach pim6 encap\n");
 	    if (pim_encap_cookie) {
 		encap_detach(pim_encap_cookie);
 		pim_encap_cookie = NULL;
 	    }
 	    goto load_failed;
 	}
 #endif
 
 	/* Publish the IPv4 multicast routing entry points. */
 	ip_mcast_src = X_ip_mcast_src;
 	ip_mforward = X_ip_mforward;
 	ip_mrouter_done = X_ip_mrouter_done;
 	ip_mrouter_get = X_ip_mrouter_get;
 	ip_mrouter_set = X_ip_mrouter_set;
 
 #ifdef INET6
 	/* Publish the IPv6 multicast routing entry points. */
 	ip6_mforward = X_ip6_mforward;
 	ip6_mrouter_done = X_ip6_mrouter_done;
 	ip6_mrouter_get = X_ip6_mrouter_get;
 	ip6_mrouter_set = X_ip6_mrouter_set;
 	mrt6_ioctl = X_mrt6_ioctl;
 #endif
 
 	ip_rsvp_force_done = X_ip_rsvp_force_done;
 	ip_rsvp_vif = X_ip_rsvp_vif;
 
 	legal_vif_num = X_legal_vif_num;
 	mrt_ioctl = X_mrt_ioctl;
 	rsvp_input_p = X_rsvp_input;
 
 	return 0;
     }
 
     if (type == MOD_UNLOAD) {
 	/*
 	 * Typically module unload happens after the user-level
 	 * process has shutdown the kernel services (the checks
 	 * below ensure someone can't just yank the module out
 	 * from under a running process).  But if the module is
 	 * just loaded and then unloaded w/o starting up a user
 	 * process we still need to cleanup.
 	 */
 	if (ip_mrouter)
 	    return EINVAL;
 #ifdef INET6
 	if (ip6_mrouter)
 	    return EINVAL;
 #endif
 
 #ifdef INET6
 	/* Tear down the IPv6 side first. */
 	if (pim6_encap_cookie) {
 	    encap_detach(pim6_encap_cookie);
 	    pim6_encap_cookie = NULL;
 	}
 	X_ip6_mrouter_done();
 	ip6_mforward = NULL;
 	ip6_mrouter_done = NULL;
 	ip6_mrouter_get = NULL;
 	ip6_mrouter_set = NULL;
 	mrt6_ioctl = NULL;
 #endif
 
 	/* Then the IPv4 side. */
 	if (pim_encap_cookie) {
 	    encap_detach(pim_encap_cookie);
 	    pim_encap_cookie = NULL;
 	}
 	X_ip_mrouter_done();
 	ip_mcast_src = NULL;
 	ip_mforward = NULL;
 	ip_mrouter_done = NULL;
 	ip_mrouter_get = NULL;
 	ip_mrouter_set = NULL;
 
 	ip_rsvp_force_done = NULL;
 	ip_rsvp_vif = NULL;
 
 	legal_vif_num = NULL;
 	mrt_ioctl = NULL;
 	rsvp_input_p = NULL;
 
 	VIF_LOCK_DESTROY();
 	MFC_LOCK_DESTROY();
 	MROUTER_LOCK_DESTROY();
 
 	return 0;
     }
 
     return EOPNOTSUPP;
 
 load_failed:
     /* Shared MOD_LOAD failure exit: release the locks set up above. */
     VIF_LOCK_DESTROY();
     MFC_LOCK_DESTROY();
     MROUTER_LOCK_DESTROY();
     return (EINVAL);
 }
 
 /*
  * Module description: the module is named "ip_mroute" and its events are
  * delivered to ip_mroute_modevent(); the third member is left as 0.
  */
 static moduledata_t ip_mroutemod = {
     "ip_mroute",
     ip_mroute_modevent,
     0
 };
 /* Register the module at SI_SUB_PSEUDO with SI_ORDER_ANY. */
 DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c	(revision 171636)
+++ head/sys/netinet/tcp_subr.c	(revision 171637)
@@ -1,2152 +1,2148 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  * $FreeBSD$
  */
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #include <netinet6/ip6protosw.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 #include <sys/md5.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Default TCP maximum segment size, exported read-write as
  * net.inet.tcp.mssdflt.
  */
 int	tcp_mssdflt = TCP_MSS;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
     &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 /* IPv6 counterpart of tcp_mssdflt, exported as net.inet.tcp.v6mssdflt. */
 int	tcp_v6mssdflt = TCP6_MSS;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLFLAG_RW, &tcp_v6mssdflt , 0,
     "Default TCP Maximum Segment Size for IPv6");
 #endif
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculously low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  *
  * Exported read-write as net.inet.tcp.minmss.
  */
 int	tcp_minmss = TCP_MINMSS;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
     &tcp_minmss , 0, "Minimum TCP Maximum Segment Size");
 
 /* Non-zero makes tcp_newtcpcb() request window scaling and timestamps. */
 int	tcp_do_rfc1323 = 1;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
     &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions");
 
 /* Hash size chosen in tcp_init(); declared CTLFLAG_RDTUN. */
 static int	tcp_tcbhashsize = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW,
     &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 /* Read-only view of tcbinfo.ipi_count. */
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
     &tcbinfo.ipi_count, 0, "Number of active PCBs");
 
 static int	icmp_may_rst = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
     &icmp_may_rst, 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 /* Interval in seconds, per the sysctl description; defaults to 0. */
 static int	tcp_isn_reseed_interval = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
 /*
  * TCP bandwidth limiting sysctls.  Note that the default lower bound of
  * 1024 exists only for debugging.  A good production default would be
  * something like 6100.
  *
  * NOTE(review): tcp_inflight_min below is initialized to 6144, not 1024;
  * the figure in the paragraph above looks stale -- confirm.
  */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
     "TCP inflight data limiting");
 
 static int	tcp_inflight_enable = 1;
 SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
     &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
 
 static int	tcp_inflight_debug = 0;
 SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
     &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
 
 /*
  * No static initializer: tcp_init() sets this to TCPTV_INFLIGHT_RTTTHRESH.
  * The sysctl goes through the sysctl_msec_to_ticks handler.
  */
 static int	tcp_inflight_rttthresh;
 SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
     "RTT threshold below which inflight will deactivate itself");
 
 static int	tcp_inflight_min = 6144;
 SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
     &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
 
 static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
     &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
 
 static int	tcp_inflight_stab = 20;
 SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
     &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
 
 /* UMA zone for struct sackhole; created in tcp_init(). */
 uma_zone_t sack_hole_zone;
 
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static void	tcp_isn_tick(void *);
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	512
 #endif
 
 /*
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;	/* the control block itself */
 	struct	tcp_timer	tt;	/* its timers; tcb.t_timers points here */
 };
 
 /* UMA zone of struct tcpcb_mem; created in tcp_init(). */
 static uma_zone_t tcpcb_zone;
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 /* Callout initialized in tcp_init() and stopped in tcp_fini(). */
 struct callout isn_callout;
 static struct mtx isn_mtx;
 
 /* Convenience wrappers around isn_mtx. */
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 /*
  * TCP initialization.
  */
 
 /*
  * Event handler registered by tcp_init() for maxsockets_change: re-applies
  * the current maxsockets as the limit of the inpcb and tcpcb zones and
  * calls tcp_tw_zone_change().  The tag argument is not used.
  */
 static void
 tcp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(tcpcb_zone, maxsockets);
 	tcp_tw_zone_change();
 }
 
 /*
  * Init callback passed to uma_zcreate() for the "inpcb" zone in tcp_init():
  * initializes the lock of the inpcb stored at mem.  size and flags are not
  * used.  Always returns 0.
  */
 static int
 tcp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp = mem;
 
 	INP_LOCK_INIT(inp, "inp", "tcpinp");
 	return (0);
 }
 
 /*
  * One-time TCP setup: loads the timer defaults, creates the PCB hash
  * tables and UMA zones, initializes the TCP subsystems called below,
  * starts the ISN tick and registers the shutdown and maxsockets event
  * handlers.  Panics if max_linkhdr plus the minimal protocol header does
  * not fit in MHLEN.
  */
 void
 tcp_init(void)
 {
 
 	int hashsize = TCBHASHSIZE;
 	/* Load the timer variables with their TCPTV_* defaults. */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_msl = TCPTV_MSL;
 	tcp_rexmit_min = TCPTV_MIN;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
 	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
 	LIST_INIT(&tcb);
 	tcbinfo.ipi_listhead = &tcb;
 	/* The tunable may override the compile-time hash size. */
 	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
 	if (!powerof2(hashsize)) {
 		printf("WARNING: TCB hash size not a power of 2\n");
 		hashsize = 512; /* safe default */
 	}
 	tcp_tcbhashsize = hashsize;
 	tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB,
 	    &tcbinfo.ipi_hashmask);
 	tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB,
 	    &tcbinfo.ipi_porthashmask);
 	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
 	    NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
 	/* Raise max_protohdr if needed and check the headers fit in MHLEN. */
 #ifdef INET6
 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 #else /* INET6 */
 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 #endif /* INET6 */
 	if (max_protohdr < TCP_MINPROTOHDR)
 		max_protohdr = TCP_MINPROTOHDR;
 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 		panic("tcp_init");
 #undef TCP_MINPROTOHDR
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(tcpcb_zone, maxsockets);
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 	tcp_reass_init();
 	ISN_LOCK_INIT();
 	callout_init(&isn_callout, CALLOUT_MPSAFE);
 	tcp_isn_tick(NULL);
 	/* tcp_fini() stops the ISN callout at shutdown_pre_sync. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/* Keep the zone limits in step with later maxsockets changes. */
 	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * shutdown_pre_sync handler registered by tcp_init(): stops the ISN
  * callout.  The xtp argument is not used.
  */
 void
 tcp_fini(void *xtp)
 {
 
 	callout_stop(&isn_callout);
 }
 
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the tcpcb each time to conserve mbufs.
  *
  * inp     - locked inpcb supplying addresses, ports, TOS/TTL or flow info.
  * ip_ptr  - buffer receiving the IP header; written as a struct ip6_hdr
  *           when inp has INP_IPV6 set (INET6 kernels), else as a struct ip.
  * tcp_ptr - buffer receiving the struct tcphdr.
  *
  * Sequence/ack numbers, flags, window, urgent pointer and checksum are all
  * written as zero; the ports are copied as-is and th_off is 5 (no options).
  */
 void
 tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
 	INP_LOCK_ASSERT(inp);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6;
 
 		ip6 = (struct ip6_hdr *)ip_ptr;
 		/* Replace only the flow-info and version bits. */
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6->ip6_nxt = IPPROTO_TCP;
 		/*
 		 * Store the payload length in network byte order, as
 		 * tcp_respond() does; it was previously stored in host order.
 		 */
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	} else
 #endif
 	{
 		struct ip *ip;
 
 		ip = (struct ip *)ip_ptr;
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;		/* 20-byte header, no IP options */
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_x2 = 0;
 	th->th_off = 5;
 	th->th_flags = 0;
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Build a header template for sending TCP packets on a connection.
  * An mbuf is allocated and a skeletal TCP/IP header is written into its
  * data area by tcpip_fillheaders().  The only use for this function is
  * in keepalives, which use tcp_respond.
  *
  * Returns a pointer to the template inside the mbuf's data area, or NULL
  * when no mbuf could be allocated.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *tmpl;
 	struct mbuf *mb;
 
 	mb = m_get(M_DONTWAIT, MT_DATA);
 	if (mb != NULL) {
 		mb->m_len = sizeof(struct tcptemp);
 		tmpl = mtod(mb, struct tcptemp *);
 
 		tcpip_fillheaders(inp, (void *)&tmpl->tt_ipgen,
 		    (void *)&tmpl->tt_t);
 		return (tmpl);
 	}
 
 	return (NULL);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at ti and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment ti,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
  *
  * In the current signature the header referred to as "ti" above is passed
  * as two pointers: ipgen (the IP or IPv6 header) and th (the TCP header).
  * tp may be NULL when m is supplied; they must not both be NULL.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	int tlen;
 	int win = 0;
 	struct ip *ip;
 	struct tcphdr *nth;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int ipflags = 0;
 	struct inpcb *inp;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 
 #ifdef INET6
 	/* The version nibble of the supplied header selects the family. */
 	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_LOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	/*
 	 * Advertise the receive buffer space, capped at the largest window
 	 * expressible with the connection's scale; stays 0 for a RST or
 	 * when there is no tcpcb.
 	 */
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 				win = (long)TCP_MAXWIN << tp->rcv_scale;
 		}
 	}
 	if (m == NULL) {
 		/* No packet to reuse: build one from copies of the headers. */
 		m = m_gethdr(M_DONTWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		tlen = 0;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 	      {
 		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 		ip = mtod(m, struct ip *);
 		nth = (struct tcphdr *)(ip + 1);
 	      }
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		/* The caller's flags are overridden on this path. */
 		flags = TH_ACK;
 	} else {
 		/*
 		 * Reuse the received mbuf: drop everything chained behind
 		 * it, rewind to the IP header and swap the endpoints.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 		tlen = 0;
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 	      {
 		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
 		nth = (struct tcphdr *)(ip + 1);
 	      }
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, n_short);
 #undef xchg
 	}
 	/* Finish the IP-level header and compute the total length. */
 #ifdef INET6
 	if (isipv6) {
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
 						tlen));
 		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	} else
 #endif
 	{
 		tlen += sizeof (struct tcpiphdr);
 		ip->ip_len = tlen;
 		ip->ip_ttl = ip_defttl;
 		if (path_mtu_discovery)
 			ip->ip_off |= IP_DF;
 	}
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_LOCK_ASSERT(inp);
 		mac_create_mbuf_from_inpcb(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_reflect_mbuf_tcp(m);
 	}
 #endif
 	/* Fill in the TCP header proper; no options are sent. */
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_x2 = 0;
 	nth->th_off = sizeof (struct tcphdr) >> 2;
 	nth->th_flags = flags;
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 #ifdef INET6
 	if (isipv6) {
 		nth->th_sum = 0;
 		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
 					sizeof(struct ip6_hdr),
 					tlen - sizeof(struct ip6_hdr));
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	} else
 #endif /* INET6 */
 	{
 		/*
 		 * IPv4: store only the pseudo-header sum here and request
 		 * the rest via CSUM_TCP in the packet header.
 		 */
 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 	}
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	/* Hand the segment to the IP layer; the result is ignored. */
 #ifdef INET6
 	if (isipv6)
 		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
 	else
 #endif /* INET6 */
 	(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  *
  * Returns the new tcpcb, or NULL if the zeroed allocation from
  * tcpcb_zone fails.  On success inp->inp_ppcb points at the new tcpcb and
  * inp->inp_ip_ttl is set to ip_defttl.
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	/* The tcpcb and its timer block live in the same allocation. */
 	tp = &tm->tcb;
 	tp->t_timers = &tm->tt;
 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
 	/* Start from the default MSS of the connection's address family. */
 	tp->t_maxseg = tp->t_maxopd =
 #ifdef INET6
 		isipv6 ? tcp_v6mssdflt :
 #endif /* INET6 */
 		tcp_mssdflt;
 
 	/* Set up our timeouts. */
-	if (NET_CALLOUT_MPSAFE)
-		callout_init_mtx(&tp->t_timers->tt_timer, &inp->inp_mtx,
-		    CALLOUT_RETURNUNLOCKED);
-	else
-		callout_init_mtx(&tp->t_timers->tt_timer, &inp->inp_mtx,
-		    (CALLOUT_RETURNUNLOCKED|CALLOUT_NETGIANT));
+	callout_init_mtx(&tp->t_timers->tt_timer, &inp->inp_mtx,
+	    CALLOUT_RETURNUNLOCKED);
 
 	/* Option requests follow the tcp_do_rfc1323 and tcp_do_sack knobs. */
 	if (tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 	tp->t_inpcb = inp;	/* XXX */
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	/* The send-side windows all start at the maximum scaled window. */
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	tp->t_bw_rtttime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = ip_defttl;
 	inp->inp_ppcb = tp;
 	return (tp);		/* XXX */
 }
 
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 
 	INP_INFO_WLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tp->t_state = TCPS_CLOSED;
 		(void) tcp_output(tp);
 		tcpstat.tcps_drops++;
 	} else
 		tcpstat.tcps_conndrops++;
 	if (errno == ETIMEDOUT && tp->t_softerror)
 		errno = tp->t_softerror;
 	so->so_error = errno;
 	return (tcp_close(tp));
 }
 
 /*
  * Release a TCP control block.
  *
  * Stops the connection's timer callout, optionally records the measured
  * path metrics in the TCP host cache (only once at least four RTT samples
  * were taken), frees any queued reassembly segments and SACK holes, breaks
  * the inpcb <-> tcpcb linkage and returns the tcpcb to its UMA zone.
  *
  * The inpcb lock must be held by the caller.
  */
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct tseg_qent *q;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	/* Only used below to pick the per-packet header overhead. */
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we
 	 * delete the PCB.
 	 *
 	 * XXX: callout_stop() may race and a callout may already
 	 * try to obtain the INP_LOCK.  Only callout_drain() would
 	 * stop this but it would cause a LOR thus we can't use it.
 	 * The tcp_timer() function contains a lot of checks to
 	 * handle this case rather gracefully.
 	 */
 	tp->t_timers->tt_active = 0;
 	callout_stop(&tp->t_timers->tt_timer);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 */
 	if (tp->t_rttupdated >= 4) {
 		struct hc_metrics_lite metrics;
 		u_long ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occurred on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			/* Never cache a threshold below two packets. */
 			if (ssthresh < 2)
 				ssthresh = 2;
 			ssthresh *= (u_long)(tp->t_maxseg +
 #ifdef INET6
 				      (isipv6 ? sizeof (struct ip6_hdr) +
 					       sizeof (struct tcphdr) :
 #endif
 				       sizeof (struct tcpiphdr)
 #ifdef INET6
 				       )
 #endif
 				      );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
 		metrics.rmx_bandwidth = tp->snd_bandwidth;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* free the reassembly queue, if any */
 	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
 		LIST_REMOVE(q, tqe_q);
 		m_freem(q->tqe_m);
 		uma_zfree(tcp_reass_zone, q);
 		/* Keep the per-connection and global segment counters in step. */
 		tp->t_segqlen--;
 		tcp_reass_qsize--;
 	}
 	tcp_free_sackholes(tp);
 	/* Sever both directions of the inpcb/tcpcb link before freeing. */
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(tcpcb_zone, tp);
 }
 
 /*
  * Close a TCP control block: mark the inpcb as dropped and the socket as
  * disconnected.  If the protocol layer holds the socket reference
  * (INP_SOCKREF), that reference is given up, the inpcb is unlocked, the
  * socket is passed to sofree() and NULL is returned; otherwise the tcpcb
  * is returned unchanged.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *sock;
 
 	INP_INFO_WLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	in_pcbdrop(inp);
 	tcpstat.tcps_closed++;
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	sock = inp->inp_socket;
 	soisdisconnected(sock);
 
 	/* Someone else still owns the socket reference: nothing to free. */
 	if ((inp->inp_vflag & INP_SOCKREF) == 0)
 		return (tp);
 
 	KASSERT(sock->so_state & SS_PROTOREF,
 	    ("tcp_close: !SS_PROTOREF"));
 	inp->inp_vflag &= ~INP_SOCKREF;
 	/* The inpcb lock is released before the socket locks are taken. */
 	INP_UNLOCK(inp);
 	ACCEPT_LOCK();
 	SOCK_LOCK(sock);
 	sock->so_state &= ~SS_PROTOREF;
 	sofree(sock);
 	return (NULL);
 }
 
 /*
  * Reclaim memory held by TCP reassembly queues.
  *
  * When do_tcpdrain is set, walk every TCP inpcb that is not in TIME_WAIT
  * and free all segments on its reassembly queue, then clear its SACK
  * report.
  *
  * XXX: The "Net/3" implementation doesn't imply that the TCP reassembly
  *      queue should be flushed, but in a situation where we're really low
  *      on mbufs, this is potentially useful.
  */
 void
 tcp_drain(void)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tseg_qent *seg;
 
 	if (!do_tcpdrain)
 		return;
 
 	INP_INFO_RLOCK(&tcbinfo);
 	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
 		if (inp->inp_vflag & INP_TIMEWAIT)
 			continue;
 		INP_LOCK(inp);
 		tp = intotcpcb(inp);
 		if (tp != NULL) {
 			/* Pop and free queued segments until none remain. */
 			for (;;) {
 				seg = LIST_FIRST(&tp->t_segq);
 				if (seg == NULL)
 					break;
 				LIST_REMOVE(seg, tqe_q);
 				m_freem(seg->tqe_m);
 				uma_zfree(tcp_reass_zone, seg);
 				tp->t_segqlen--;
 				tcp_reass_qsize--;
 			}
 			tcp_clean_sackreport(tp);
 		}
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&tcbinfo);
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_WLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	if ((inp->inp_vflag & INP_TIMEWAIT) ||
 	    (inp->inp_vflag & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 /*
  * sysctl handler exporting the list of TCP protocol control blocks.
  *
  * The output is a leading struct xinpgen, the entries produced by
  * syncache_pcblist(), one struct xtcpcb per visible inpcb, and a trailing
  * struct xinpgen carrying the generation counts observed at the end.
  *
  * With no output buffer (req->oldptr == NULL) only a size estimate is
  * returned.  Writes (req->newptr != NULL) are rejected with EPERM.
  * Returns 0 on success or the first error encountered.
  */
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, m, n, pcb_count;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		m = syncache_pcbcount();
 		n = tcbinfo.ipi_count;
 		/* Two headers plus the entries, with n/8 extra entries of slop. */
 		req->oldidx = 2 * (sizeof xig)
 			+ ((m + n) + n/8) * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&tcbinfo);
 	gencnt = tcbinfo.ipi_gencnt;
 	n = tcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&tcbinfo);
 
 	m = syncache_pcbcount();
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ (n + m) * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n + m;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req, m, &pcb_count);
 	if (error)
 		return (error);
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == NULL)
 		return (ENOMEM);
 
 	/*
 	 * First pass, under the pcbinfo lock: collect up to n inpcbs that
 	 * are no newer than the generation sampled above and that the
 	 * requesting credential is allowed to see.
 	 */
 	INP_INFO_RLOCK(&tcbinfo);
 	for (inp = LIST_FIRST(tcbinfo.ipi_listhead), i = 0;
 	    inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			/*
 			 * XXX: This use of cr_cansee(), introduced with
 			 * TCP state changes, is not quite right, but for
 			 * now, better than nothing.
 			 */
 			if (inp->inp_vflag & INP_TIMEWAIT) {
 				if (intotw(inp) != NULL)
 					error = cr_cansee(req->td->td_ucred,
 					    intotw(inp)->tw_cred);
 				else
 					error = EINVAL;	/* Skip this inp. */
 			} else
 				error = cr_canseesocket(req->td->td_ucred,
 				    inp->inp_socket);
 			if (error == 0)
 				inp_list[i++] = inp;
 		}
 		INP_UNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&tcbinfo);
 	n = i;
 
 	/*
 	 * Second pass: copy each collected inpcb out to the caller.  Stop
 	 * at the first SYSCTL_OUT() failure so that it is reported instead
 	 * of being overwritten by the result of a later entry.
 	 */
 	error = 0;
 	for (i = 0; error == 0 && i < n; i++) {
 		inp = inp_list[i];
 		INP_LOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xtcpcb xt;
 			void *inp_ppcb;
 
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof xt;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xt.xt_inp, sizeof *inp);
 			inp_ppcb = inp->inp_ppcb;
 			if (inp_ppcb == NULL)
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 			else if (inp->inp_vflag & INP_TIMEWAIT) {
 				/* TIME_WAIT entries export only their state. */
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 				xt.xt_tp.t_state = TCPS_TIME_WAIT;
 			} else
 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
 			if (inp->inp_socket != NULL)
 				sotoxsocket(inp->inp_socket, &xt.xt_socket);
 			else {
 				bzero(&xt.xt_socket, sizeof xt.xt_socket);
 				xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			}
 			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
 			/* Drop the inpcb lock before copying out. */
 			INP_UNLOCK(inp);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 		} else
 			INP_UNLOCK(inp);
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&tcbinfo);
 		xig.xig_gen = tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = tcbinfo.ipi_count + pcb_count;
 		INP_INFO_RUNLOCK(&tcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
 
 /*
  * sysctl handler returning the credentials (struct xucred) of the socket
  * that owns an IPv4 TCP connection.
  *
  * The caller supplies two struct sockaddr_in values identifying the
  * connection.  addrs[1] is passed to in_pcblookup_hash() ahead of addrs[0];
  * NOTE(review): judging by tcp_ctlinput()'s use of the same lookup, that
  * makes addrs[1] the foreign and addrs[0] the local endpoint - confirm.
  *
  * Requires PRIV_NETINET_GETCRED.  Returns ENOENT when no matching inpcb
  * exists or it has no socket, the cr_canseesocket() error when the caller
  * may not see the socket, or the result of copying the xucred out.
  */
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	INP_INFO_RLOCK(&tcbinfo);
 	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
 	if (inp == NULL) {
 		error = ENOENT;
 		goto outunlocked;
 	}
 	INP_LOCK(inp);
 	if (inp->inp_socket == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
 	if (error)
 		goto out;
 	/* xuc is only filled in on this success path. */
 	cru2x(inp->inp_socket->so_cred, &xuc);
 out:
 	INP_UNLOCK(inp);
 outunlocked:
 	INP_INFO_RUNLOCK(&tcbinfo);
 	/* Copy out after all locks are dropped, and only if xuc is valid. */
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
 
 #ifdef INET6
 /*
  * IPv6 counterpart of the getcred sysctl handler: returns the credentials
  * (struct xucred) of the socket that owns a TCP connection identified by
  * two struct sockaddr_in6 values supplied by the caller.
  *
  * If both addresses are IPv4-mapped, the lookup is done in the IPv4 hash
  * using the embedded IPv4 addresses; if only addrs[0] is mapped the request
  * is rejected with EINVAL.
  *
  * Requires PRIV_NETINET_GETCRED.  Returns ENOENT when no matching inpcb
  * exists or it has no socket, the cr_canseesocket() error when the caller
  * may not see the socket, or the result of copying the xucred out.
  */
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error, mapped = 0;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	/* Fold the scope information into both addresses before lookup. */
 	if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 			return (EINVAL);
 	}
 
 	INP_INFO_RLOCK(&tcbinfo);
 	if (mapped == 1)
 		/* The IPv4 address occupies the last four bytes. */
 		inp = in_pcblookup_hash(&tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port,
 			0, NULL);
 	else
 		inp = in6_pcblookup_hash(&tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
 	if (inp == NULL) {
 		error = ENOENT;
 		goto outunlocked;
 	}
 	INP_LOCK(inp);
 	if (inp->inp_socket == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
 	if (error)
 		goto out;
 	/* xuc is only filled in on this success path. */
 	cru2x(inp->inp_socket->so_cred, &xuc);
 out:
 	INP_UNLOCK(inp);
 outunlocked:
 	INP_INFO_RUNLOCK(&tcbinfo);
 	/* Copy out after all locks are dropped, and only if xuc is valid. */
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
 #endif
 
 
 /*
  * Control input for IPv4 TCP.
  *
  * cmd is a PRC_* code, sa the address the report refers to, and vip (may
  * be NULL) the IP header of the offending packet as quoted inside the
  * ICMP message.  Depending on cmd the matching connection is notified of
  * a soft error (tcp_notify), has its MSS recomputed (tcp_mtudisc) or, if
  * icmp_may_rst is set, is dropped while in SYN-SENT (tcp_drop_syn_sent).
  * If no inpcb matches the quoted header, the syncache is told about the
  * unreachable instead.  Without a quoted header every connection to the
  * address is notified.
  */
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct ip *ip = vip;
 	struct tcphdr *th;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct icmp *icp;
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int mtu;
 
 	/* Validate the family before interpreting sa as a sockaddr_in. */
 	if (sa->sa_family != AF_INET)
 		return;
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc;
 	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
 		notify = tcp_drop_syn_sent;
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	else if (PRC_IS_REDIRECT(cmd))
 		return;
 	/*
 	 * Source quench is deprecated.
 	 */
 	else if (cmd == PRC_QUENCH)
 		return;
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		/* Recover the enclosing ICMP header and the quoted TCP header. */
 		icp = (struct icmp *)((caddr_t)ip
 				      - offsetof(struct icmp, icmp_ip));
 		th = (struct tcphdr *)((caddr_t)ip
 				       + (ip->ip_hl << 2));
 		INP_INFO_WLOCK(&tcbinfo);
 		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
 		    ip->ip_src, th->th_sport, 0, NULL);
 		if (inp != NULL)  {
 			INP_LOCK(inp);
 			if (!(inp->inp_vflag & INP_TIMEWAIT) &&
 			    !(inp->inp_vflag & INP_DROPPED) &&
 			    !(inp->inp_socket == NULL)) {
 				/*
 				 * The quoted sequence number is in network
 				 * byte order; htonl() performs the same byte
 				 * swap as ntohl() would.  Only act when it
 				 * falls inside the window of unacknowledged
 				 * data.
 				 */
 				icmp_tcp_seq = htonl(th->th_seq);
 				tp = intotcpcb(inp);
 				if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
 				    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
 					if (cmd == PRC_MSGSIZE) {
 					    /*
 					     * MTU discovery:
 					     * If we got a needfrag set the MTU
 					     * in the route to the suggested new
 					     * value (if given) and then notify.
 					     */
 					    bzero(&inc, sizeof(inc));
 					    inc.inc_flags = 0;	/* IPv4 */
 					    inc.inc_faddr = faddr;
 
 					    mtu = ntohs(icp->icmp_nextmtu);
 					    /*
 					     * If no alternative MTU was
 					     * proposed, try the next smaller
 					     * one.  ip->ip_len has already
 					     * been swapped in icmp_input().
 					     */
 					    if (!mtu)
 						mtu = ip_next_mtu(ip->ip_len,
 						 1);
 					    if (mtu < max(296, (tcp_minmss)
 						 + sizeof(struct tcpiphdr)))
 						mtu = 0;
 					    if (!mtu)
 						mtu = tcp_mssdflt
 						 + sizeof(struct tcpiphdr);
 					    /*
 					     * Only cache the MTU if it
 					     * is smaller than the interface
 					     * or route MTU.  tcp_mtudisc()
 					     * will do right thing by itself.
 					     */
 					    if (mtu <= tcp_maxmtu(&inc, NULL))
 						tcp_hc_updatemtu(&inc, mtu);
 					}
 
 					/* notify may drop the pcb and return NULL. */
 					inp = (*notify)(inp, inetctlerrmap[cmd]);
 				}
 			}
 			if (inp != NULL)
 				INP_UNLOCK(inp);
 		} else {
 			/*
 			 * No established connection: hand the report to the
 			 * syncache.  Zero the whole structure first so that
 			 * no uninitialized stack bytes take part in the
 			 * syncache lookup.
 			 */
 			bzero(&inc, sizeof(inc));
 			inc.inc_fport = th->th_dport;
 			inc.inc_lport = th->th_sport;
 			inc.inc_faddr = faddr;
 			inc.inc_laddr = ip->ip_src;
 #ifdef INET6
 			inc.inc_isipv6 = 0;
 #endif
 			syncache_unreach(&inc, th);
 		}
 		INP_INFO_WUNLOCK(&tcbinfo);
 	} else
 		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
 }
 
 #ifdef INET6
 /*
  * Control input for IPv6 TCP.
  *
  * cmd is a PRC_* code, sa the IPv6 address the report refers to, and d
  * (may be NULL) a struct ip6ctlparam describing the offending packet when
  * the report originates from ICMPv6.  With a quoted packet, the ports are
  * extracted from it, matching inpcbs are notified through in6_pcbnotify()
  * and the syncache is told about the unreachable.  Without one, the
  * notification is issued with wildcard ports and source.
  */
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	struct tcphdr th;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	int off;
 	/* Only the two port fields of the TCP header are read from the mbuf. */
 	struct tcp_portonly {
 		u_int16_t th_sport;
 		u_int16_t th_dport;
 	} *thp;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc;
 	else if (!PRC_IS_REDIRECT(cmd) &&
 		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
 		return;
 	/* Source quench is deprecated. */
 	else if (cmd == PRC_QUENCH)
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* fool gcc */
 		sa6_src = &sa6_any;
 	}
 
 	if (ip6 != NULL) {
 		struct in_conninfo inc;
 		/*
 		 * XXX: We assume that when IPV6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		/* check if we can safely examine src and dst ports */
 		if (m->m_pkthdr.len < off + sizeof(*thp))
 			return;
 
 		/* Zero the header, then fill in just the two ports. */
 		bzero(&th, sizeof(th));
 		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
 
 		in6_pcbnotify(&tcbinfo, sa, th.th_dport,
 		    (struct sockaddr *)ip6cp->ip6c_src,
 		    th.th_sport, cmd, NULL, notify);
 
 		/*
 		 * Start from an all-zero connection descriptor so that the
 		 * syncache never sees uninitialized stack contents in the
 		 * fields that are not set explicitly below.
 		 */
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th.th_dport;
 		inc.inc_lport = th.th_sport;
 		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
 		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
 		inc.inc_isipv6 = 1;
 		INP_INFO_WLOCK(&tcbinfo);
 		syncache_unreach(&inc, &th);
 		INP_INFO_WUNLOCK(&tcbinfo);
 	} else
 		in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
 			      0, cmd, NULL, notify);
 }
 #endif /* INET6 */
 
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 /* Target rate at which the ISN time offset advances (one megabyte/second). */
 #define ISN_BYTES_PER_SECOND 1048576
 /* Fixed amount added to isn_offset for every ISN handed out. */
 #define ISN_STATIC_INCREMENT 4096
 /* Mask applied to arc4random() for the additional per-ISN increment. */
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 
 /* Secret mixed into the MD5 hash; filled by read_random() in tcp_new_isn(). */
 static u_char isn_secret[32];
 /* Value of `ticks' at the last (re)seed; 0 means "never seeded". */
 static int isn_last_reseed;
 /* Current ISN offset, and its value as of the last tcp_isn_tick() run. */
 static u_int32_t isn_offset, isn_offset_old;
 /* Scratch MD5 context shared by all tcp_new_isn() calls. */
 static MD5_CTX isn_ctx;
 
 /*
  * Generate the initial sequence number for a new outgoing connection.
  *
  * The result is the first 32-bit word of an MD5 hash over the connection's
  * foreign port, local port, foreign address, local address and the secret,
  * plus the running isn_offset.  isn_offset itself is advanced by
  * ISN_STATIC_INCREMENT plus a random amount on every call.
  *
  * The inpcb lock must be held; the ISN globals are protected by ISN_LOCK().
  */
 tcp_seq
 tcp_new_isn(struct tcpcb *tp)
 {
 	u_int32_t md5_buffer[4];
 	tcp_seq new_isn;
 
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
 	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		read_random(&isn_secret, sizeof(isn_secret));
 		isn_last_reseed = ticks;
 	}
 
 	/* Compute the md5 hash and return the ISN. */
 	MD5Init(&isn_ctx);
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
 #ifdef INET6
 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
 			  sizeof(struct in6_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
 			  sizeof(struct in6_addr));
 	} else
 #endif
 	{
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
 			  sizeof(struct in_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
 			  sizeof(struct in_addr));
 	}
 	/* The secret goes in last, after the connection identifiers. */
 	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
 	/* Only the first 32 bits of the digest are used. */
 	new_isn = (tcp_seq) md5_buffer[0];
 	/* Advance the shared offset so no two calls yield the same value. */
 	isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	new_isn += isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * Increment the offset to the next ISN_BYTES_PER_SECOND / hz boundary
  * to keep time flowing at a relatively constant rate.  If the random
  * increments have already pushed us past the projected offset, do nothing.
  *
  * Runs as a callout and re-arms itself on every invocation.  The xtp
  * argument is unused.
  */
 static void
 tcp_isn_tick(void *xtp)
 {
 	u_int32_t projected_offset;
 
 	ISN_LOCK();
 	projected_offset = isn_offset_old + ISN_BYTES_PER_SECOND / 100;
 
 	/*
 	 * isn_offset is a free-running 32-bit counter, so "ahead of" has to
 	 * be decided on the signed difference.  A plain '>' stops advancing
 	 * the offset once the projection wraps past zero while isn_offset
 	 * has not wrapped yet.
 	 */
 	if ((int32_t)(projected_offset - isn_offset) > 0)
 		isn_offset = projected_offset;
 
 	/* Remember where this tick left off, then schedule the next one. */
 	isn_offset_old = isn_offset;
 	callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
 	ISN_UNLOCK();
 }
 
 /*
  * Notification handler used when a specific ICMP unreachable message is
  * received: if the connection is still in SYN-SENT, drop it with the
  * given error.  Whether this handler is installed is controlled by the
  * icmp_may_rst sysctl.
  *
  * Returns the inpcb, or NULL when tcp_drop() reported that the control
  * block is gone.
  */
 struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_WLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(inp);
 
 	/* Nothing to do for TIME_WAIT or already dropped connections. */
 	if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_SYN_SENT)
 		return (inp);
 
 	tp = tcp_drop(tp, errno);
 	return (tp != NULL ? inp : NULL);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value in the route.  Also nudge TCP to send something,
  * since we know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  *
  * The errno argument is not used.  Always returns inp.
  */
 struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 	struct socket *so = inp->inp_socket;
 	u_int maxmtu;
 	u_int romtu;
 	int mss;
 #ifdef INET6
 	int isipv6;
 #endif /* INET6 */
 
 	INP_LOCK_ASSERT(inp);
 	if ((inp->inp_vflag & INP_TIMEWAIT) ||
 	    (inp->inp_vflag & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 #ifdef INET6
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 	/* Use the smaller of the host cache MTU and the route/interface MTU. */
 	maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
 	romtu =
 #ifdef INET6
 	    isipv6 ? tcp_maxmtu6(&inp->inp_inc, NULL) :
 #endif /* INET6 */
 	    tcp_maxmtu(&inp->inp_inc, NULL);
 	if (!maxmtu)
 		maxmtu = romtu;
 	else
 		maxmtu = min(maxmtu, romtu);
 	/* No MTU known at all: fall back to the default MSS and stop. */
 	if (!maxmtu) {
 		tp->t_maxopd = tp->t_maxseg =
 #ifdef INET6
 			isipv6 ? tcp_v6mssdflt :
 #endif /* INET6 */
 			tcp_mssdflt;
 		return (inp);
 	}
 	/* MSS is the MTU minus the fixed IP and TCP header sizes. */
 	mss = maxmtu -
 #ifdef INET6
 		(isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
 #endif /* INET6 */
 		 sizeof(struct tcpiphdr)
 #ifdef INET6
 		 )
 #endif /* INET6 */
 		;
 
 	/*
 	 * XXX - The above conditional probably violates the TCP
 	 * spec.  The problem is that, since we don't know the
 	 * other end's MSS, we are supposed to use a conservative
 	 * default.  But, if we do that, then MTU discovery will
 	 * never actually take place, because the conservative
 	 * default is much less than the MTUs typically seen
 	 * on the Internet today.  For the moment, we'll sweep
 	 * this under the carpet.
 	 *
 	 * The conservative default might not actually be a problem
 	 * if the only case this occurs is when sending an initial
 	 * SYN with options and data to a host we've never talked
 	 * to before.  Then, they will reply with an MSS value which
 	 * will get recorded and the new parameters should get
 	 * recomputed.  For Further Study.
 	 */
 	/* The current limit is already no larger: nothing to change. */
 	if (tp->t_maxopd <= mss)
 		return (inp);
 	tp->t_maxopd = mss;
 
 	/* Leave room for the timestamp option when it is in use. */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
 		mss -= TCPOLEN_TSTAMP_APPA;
 	/* Round an MSS above MCLBYTES down to a multiple of MCLBYTES. */
 #if	(MCLBYTES & (MCLBYTES - 1)) == 0
 	if (mss > MCLBYTES)
 		mss &= ~(MCLBYTES-1);
 #else
 	if (mss > MCLBYTES)
 		mss = mss / MCLBYTES * MCLBYTES;
 #endif
 	/* Never exceed the send buffer's high-water mark. */
 	if (so->so_snd.sb_hiwat < mss)
 		mss = so->so_snd.sb_hiwat;
 
 	tp->t_maxseg = mss;
 
 	/*
 	 * Rewind the send pointer to the oldest unacknowledged byte, reset
 	 * the RTT timing and SACK/recovery state, and call tcp_output().
 	 */
 	tcpstat.tcps_mturesent++;
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp);
 	tcp_output(tp);
 	return (inp);
 }
 
 /*
  * Look-up the routing entry to the peer of this inpcb.  If no route
  * is found and it cannot be allocated, then return NULL.  This routine
  * is called by TCP routines that access the rmx structure and by tcp_mss
  * to get the interface MTU.
  */
 u_long
 tcp_maxmtu(struct in_conninfo *inc, int *flags)
 {
 	struct route sro;
 	struct sockaddr_in *dst;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	bzero(&sro, sizeof(sro));
 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
 	        dst = (struct sockaddr_in *)&sro.ro_dst;
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = inc->inc_faddr;
 		rtalloc_ign(&sro, RTF_CLONING);
 	}
 	if (sro.ro_rt != NULL) {
 		ifp = sro.ro_rt->rt_ifp;
 		if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
 			maxmtu = ifp->if_mtu;
 		else
 			maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
 
 		/* Report additional interface capabilities. */
 		if (flags != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO)
 				*flags |= CSUM_TSO;
 		}
 		RTFREE(sro.ro_rt);
 	}
 	return (maxmtu);
 }
 
 #ifdef INET6
 /*
  * IPv6 variant of tcp_maxmtu(): look up the route to the foreign IPv6
  * address in `inc' and return the route MTU if one is set (bounded by the
  * interface's IPv6 link MTU), otherwise the link MTU.  Returns 0 when the
  * foreign address is unspecified or no route is found.
  *
  * If `flags' is non-NULL, CSUM_TSO is OR-ed into *flags when the outgoing
  * interface has IPv6 TSO enabled.
  */
 u_long
 tcp_maxmtu6(struct in_conninfo *inc, int *flags)
 {
 	struct route_in6 ro6;
 	struct ifnet *ifp;
 	u_long mtu;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	bzero(&ro6, sizeof(ro6));
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		ro6.ro_dst.sin6_family = AF_INET6;
 		ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
 		ro6.ro_dst.sin6_addr = inc->inc6_faddr;
 		rtalloc_ign((struct route *)&ro6, RTF_CLONING);
 	}
 	/* No route (or no lookup attempted): report "unknown". */
 	if (ro6.ro_rt == NULL)
 		return (0);
 
 	ifp = ro6.ro_rt->rt_ifp;
 	if (ro6.ro_rt->rt_rmx.rmx_mtu != 0)
 		mtu = min(ro6.ro_rt->rt_rmx.rmx_mtu,
 		    IN6_LINKMTU(ro6.ro_rt->rt_ifp));
 	else
 		mtu = IN6_LINKMTU(ro6.ro_rt->rt_ifp);
 
 	/* Report additional interface capabilities. */
 	if (flags != NULL && (ifp->if_capenable & IFCAP_TSO6) &&
 	    (ifp->if_hwassist & CSUM_TSO))
 		*flags |= CSUM_TSO;
 
 	RTFREE(ro6.ro_rt);
 	return (mtu);
 }
 #endif /* INET6 */
 
 #ifdef IPSEC
 /*
  * compute ESP/AH header size for TCP, including outer IP header.
  *
  * A temporary mbuf is filled with a template IP(v6)+TCP header for the
  * connection and handed to ipsec4_hdrsiz()/ipsec6_hdrsiz().  Returns 0 if
  * tp or its inpcb is NULL, or if no mbuf could be allocated.
  */
 size_t
 ipsec_hdrsiz_tcp(struct tcpcb *tp)
 {
 	struct inpcb *inp;
 	struct mbuf *m;
 	size_t hdrsiz;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct tcphdr *th;
 
 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
 		return (0);
 	/* Non-blocking allocation; failure is reported as size 0. */
 	MGETHDR(m, M_DONTWAIT, MT_DATA);
 	if (!m)
 		return (0);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		/* The TCP header directly follows the IPv6 header. */
 		th = (struct tcphdr *)(ip6 + 1);
 		m->m_pkthdr.len = m->m_len =
 			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		tcpip_fillheaders(inp, ip6, th);
 		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		/* The TCP header directly follows the IPv4 header. */
 		th = (struct tcphdr *)(ip + 1);
 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
 		tcpip_fillheaders(inp, ip, th);
 		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	}
 
 	/* The template mbuf was only needed for the size computation. */
 	m_free(m);
 	return (hdrsiz);
 }
 #endif /* IPSEC */
 
 /*
  * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
  *
  * This code attempts to calculate the bandwidth-delay product as a
  * means of determining the optimal window size to maximize bandwidth,
  * minimize RTT, and avoid the over-allocation of buffers on interfaces and
  * routers.  This code also does a fairly good job keeping RTTs in check
  * across slow links like modems.  We implement an algorithm which is very
  * similar (but not meant to be) TCP/Vegas.  The code operates on the
  * transmitter side of a TCP connection and so only affects the transmit
  * side of the connection.
  *
  * BACKGROUND:  TCP makes no provision for the management of buffer space
  * at the end points or at the intermediate routers and switches.  A TCP
  * stream, whether using NewReno or not, will eventually buffer as
  * many packets as it is able and the only reason this typically works is
  * due to the fairly small default buffers made available for a connection
  * (typically 16K or 32K).  As machines use larger windows and/or window
  * scaling it is now fairly easy for even a single TCP connection to blow-out
  * all available buffer space not only on the local interface, but on
  * intermediate routers and switches as well.  NewReno makes a misguided
  * attempt to 'solve' this problem by waiting for an actual failure to occur,
  * then backing off, then steadily increasing the window again until another
  * failure occurs, ad-infinitum.  This results in terrible oscillation that
  * is only made worse as network loads increase and the idea of intentionally
  * blowing out network buffers is, frankly, a terrible way to manage network
  * resources.
  *
  * It is far better to limit the transmit window prior to the failure
  * condition being achieved.  There are two general ways to do this:  First
  * you can 'scan' through different transmit window sizes and locate the
  * point where the RTT stops increasing, indicating that you have filled the
  * pipe, then scan backwards until you note that RTT stops decreasing, then
  * repeat ad-infinitum.  This method works in principle but has severe
  * implementation issues due to RTT variances, timer granularity, and
  * instability in the algorithm which can lead to many false positives and
  * create oscillations as well as interact badly with other TCP streams
  * implementing the same algorithm.
  *
  * The second method is to limit the window to the bandwidth delay product
  * of the link.  This is the method we implement.  RTT variances and our
  * own manipulation of the congestion window, bwnd, can potentially
  * destabilize the algorithm.  For this reason we have to stabilize the
  * elements used to calculate the window.  We do this by using the minimum
  * observed RTT, the long term average of the observed bandwidth, and
  * by adding two segments worth of slop.  It isn't perfect but it is able
  * to react to changing conditions and gives us a very stable basis on
  * which to extend the algorithm.
  */
 void
 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
 {
 	u_long bw;
 	u_long bwnd;
 	int save_ticks;
 
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * If inflight_enable is disabled in the middle of a tcp connection,
 	 * make sure snd_bwnd is effectively disabled.  The same path is taken
 	 * when the lowest RTT seen (t_rttlow) is below tcp_inflight_rttthresh.
 	 */
 	if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) {
 		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 		tp->snd_bandwidth = 0;
 		return;
 	}
 
 	/*
 	 * Figure out the bandwidth.  Due to the tick granularity this
 	 * is a very rough number and it MUST be averaged over a fairly
 	 * long period of time.  XXX we need to take into account a link
 	 * that is not using all available bandwidth, but for now our
 	 * slop will ramp us up if this case occurs and the bandwidth later
 	 * increases.
 	 *
 	 * Note: if ticks rollover 'bw' may wind up negative.  We must
 	 * effectively reset t_bw_rtttime for this case.
 	 */
 	save_ticks = ticks;
 	/* No tick has elapsed since the last sample: nothing to measure. */
 	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
 		return;
 
 	/* Sequence space acked since the last sample, scaled by hz/ticks. */
 	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
 	    (save_ticks - tp->t_bw_rtttime);
 	/*
 	 * The sample point is advanced before the sanity check below, so a
 	 * rejected sample still restarts the measurement interval.
 	 */
 	tp->t_bw_rtttime = save_ticks;
 	tp->t_bw_rtseq = ack_seq;
 	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
 		return;
 	/* Weighted average: 15/16 of the old value plus 1/16 of the sample. */
 	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
 
 	tp->snd_bandwidth = bw;
 
 	/*
 	 * Calculate the semi-static bandwidth delay product, plus two maximal
 	 * segments.  The additional slop puts us squarely in the sweet
 	 * spot and also handles the bandwidth run-up case and stabilization.
 	 * Without the slop we could be locking ourselves into a lower
 	 * bandwidth.
 	 *
 	 * Situations Handled:
 	 *	(1) Prevents over-queueing of packets on LANs, especially on
 	 *	    high speed LANs, allowing larger TCP buffers to be
 	 *	    specified, and also does a good job preventing
 	 *	    over-queueing of packets over choke points like modems
 	 *	    (at least for the transmit side).
 	 *
 	 *	(2) Is able to handle changing network loads (bandwidth
 	 *	    drops so bwnd drops, bandwidth increases so bwnd
 	 *	    increases).
 	 *
 	 *	(3) Theoretically should stabilize in the face of multiple
 	 *	    connections implementing the same algorithm (this may need
 	 *	    a little work).
 	 *
 	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
 	 *	    be adjusted with a sysctl but typically only needs to be
 	 *	    on very slow connections.  A value no smaller than 5
 	 *	    should be used, but only reduce this default if you have
 	 *	    no other choice.
 	 */
 	/* USERTT is the midpoint of the smoothed RTT and the best RTT seen. */
 #define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
 	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
 #undef USERTT
 
 	if (tcp_inflight_debug > 0) {
 		/*
 		 * 'ltime' is a function-level static, so the rate limit on
 		 * this printf is shared by every connection, not per-tcpcb.
 		 */
 		static int ltime;
 		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
 			ltime = ticks;
 			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
 			    tp,
 			    bw,
 			    tp->t_rttbest,
 			    tp->t_srtt,
 			    bwnd
 			);
 		}
 	}
 	/*
 	 * Clamp to [tcp_inflight_min, tcp_inflight_max], then enforce a floor
 	 * of two segments.  The floor is applied last, so it wins over
 	 * tcp_inflight_max when the two conflict.
 	 */
 	if ((long)bwnd < tcp_inflight_min)
 		bwnd = tcp_inflight_min;
 	if (bwnd > tcp_inflight_max)
 		bwnd = tcp_inflight_max;
 	if ((long)bwnd < tp->t_maxseg * 2)
 		bwnd = tp->t_maxseg * 2;
 	tp->snd_bwnd = bwnd;
 }
 
 #ifdef TCP_SIGNATURE
 /*
  * m_apply() callback: fold one contiguous run of segment bytes into the
  * MD5 context passed through 'fstate'.  Always reports success.
  */
 static int
 tcp_signature_apply(void *fstate, void *data, u_int len)
 {
 	int status;
 
 	status = 0;
 	MD5Update(fstate, (u_char *)data, len);
 	return (status);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * off0		offset to TCP header within the mbuf chain
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
  *
  * We do this over ip, tcphdr, segment data, and the key in the SADB.
  * When called from tcp_input(), we can be sure that th_sum has been
  * zeroed out and verified already.
  *
  * This function is for IPv4 use only. Calling this function with an
  * IPv6 packet in the mbuf chain will yield undefined results.
  *
  * Return 0 if successful, or EINVAL if the SADB lookup fails.
  *
  * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
  * search with the destination IP address, and a 'magic SPI' to be
  * determined by the application. This is hardcoded elsewhere to 1179
  * right now. Another branch of this code exists which uses the SPD to
  * specify per-application flows but it is unstable.
  */
 int
 tcp_signature_compute(struct mbuf *m, int off0, int len, int optlen,
     u_char *buf, u_int direction)
 {
 	union sockaddr_union peer;
 	struct ippseudo pseudo;
 	MD5_CTX md5;
 	struct secasvar *sav;
 	struct ipovly *ipovly;
 	struct tcphdr *th;
 	struct ip *ip;
 	u_short orig_sum;
 	int data_off;
 
 	KASSERT(m != NULL, ("NULL mbuf chain"));
 	KASSERT(buf != NULL, ("NULL signature pointer"));
 
 	/*
 	 * Build the lookup key from the IP header: the source address for
 	 * inbound segments, the destination address otherwise.
 	 */
 	ip = mtod(m, struct ip *);
 	bzero(&peer, sizeof(peer));
 	peer.sa.sa_len = sizeof(struct sockaddr_in);
 	peer.sa.sa_family = AF_INET;
 	if (direction == IPSEC_DIR_INBOUND)
 		peer.sin.sin_addr = ip->ip_src;
 	else
 		peer.sin.sin_addr = ip->ip_dst;
 
 	/* Find the SADB entry for that address; without one we cannot sign. */
 	sav = KEY_ALLOCSA(&peer, IPPROTO_TCP, htonl(TCP_SIG_SPI));
 	if (sav == NULL) {
 		printf("%s: SADB lookup failed for %s\n", __func__,
 		    inet_ntoa(peer.sin.sin_addr));
 		return (EINVAL);
 	}
 
 	MD5Init(&md5);
 	ipovly = (struct ipovly *)ip;
 	th = (struct tcphdr *)((u_char *)ip + off0);
 	data_off = off0 + sizeof(struct tcphdr) + optlen;
 
 	/*
 	 * Digest part 1: the IP pseudo-header.
 	 *
 	 * XXX The pseudo-header must be hashed in network byte order.  The
 	 * length is computed here rather than read from ipovly->ih_len,
 	 * because ip_len is not yet filled in when tcp_output() calls us.
 	 */
 	pseudo.ippseudo_src = ipovly->ih_src;
 	pseudo.ippseudo_dst = ipovly->ih_dst;
 	pseudo.ippseudo_pad = 0;
 	pseudo.ippseudo_p = IPPROTO_TCP;
 	pseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen);
 	MD5Update(&md5, (char *)&pseudo, sizeof(pseudo));
 
 	/*
 	 * Digest part 2: the fixed TCP header (no options), hashed with a
 	 * zero checksum field.  The real checksum is put back afterwards.
 	 */
 	orig_sum = th->th_sum;
 	th->th_sum = 0;
 	MD5Update(&md5, (char *)th, sizeof(struct tcphdr));
 	th->th_sum = orig_sum;
 
 	/*
 	 * Digest part 3: the segment payload, walked in place through
 	 * m_apply() so that no early m_pullup() is needed.
 	 */
 	if (len > 0)
 		m_apply(m, data_off, len, tcp_signature_apply, &md5);
 
 	/* Digest part 4: the shared secret, then emit the final hash. */
 	MD5Update(&md5, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
 	MD5Final(buf, &md5);
 
 	key_sa_recordxfer(sav, m);
 	KEY_FREESAV(&sav);
 	return (0);
 }
 #endif /* TCP_SIGNATURE */
 
 /*
  * Sysctl handler that drops one TCP connection.
  *
  * The new value must contain two struct sockaddr_storage entries: the
  * foreign address followed by the local address.  Both must be of the same
  * family (AF_INET, or AF_INET6 when INET6 is configured); IPv4-mapped IPv6
  * pairs are converted and looked up as IPv4.
  *
  * Returns 0 if a matching inpcb was found, EINVAL if an old value was
  * requested or the addresses are malformed, EPERM if no new value was
  * supplied, ENOMEM if the new value is too short, ESRCH if no inpcb
  * matches, or the error from SYSCTL_IN()/sa6_embedscope().
  */
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcptw *tw;
 	struct sockaddr_in *fin, *lin;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 	fin = lin = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 			break;
 		}
 		/*
 		 * sa6_embedscope() is applied to the sockaddrs themselves, so
 		 * the lookup below must use fin6/lin6->sin6_addr.
 		 */
 		error = sa6_embedscope(fin6, ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 	INP_INFO_WLOCK(&tcbinfo);
 	/* Unknown families were rejected above, so no default is needed. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		/*
 		 * Look up with the addresses supplied by the caller.  The
 		 * previous code passed two struct in6_addr locals that were
 		 * never assigned, i.e. uninitialized stack contents.
 		 */
 		inp = in6_pcblookup_hash(&tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    0, NULL);
 		break;
 #endif
 	case AF_INET:
 		inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, 0, NULL);
 		break;
 	}
 	if (inp != NULL) {
 		INP_LOCK(inp);
 		/*
 		 * NOTE(review): confirm whether tcp_twclose() or tcp_drop()
 		 * can unlock or free the inpcb; if so, the unconditional
 		 * INP_UNLOCK() below needs revisiting.
 		 */
 		if (inp->inp_vflag & INP_TIMEWAIT) {
 			/*
 			 * XXXRW: There currently exists a state where an
 			 * inpcb is present, but its timewait state has been
 			 * discarded.  For now, don't allow dropping of this
 			 * type of inpcb.
 			 */
 			tw = intotw(inp);
 			if (tw != NULL)
 				tcp_twclose(tw, 0);
 		} else if (!(inp->inp_vflag & INP_DROPPED) &&
 			   !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
 			/* Skip already-dropped pcbs and listening sockets. */
 			tp = intotcpcb(inp);
 			tcp_drop(tp, ECONNABORTED);
 		}
 		INP_UNLOCK(inp);
 	} else
 		error = ESRCH;
 	INP_INFO_WUNLOCK(&tcbinfo);
 	return (error);
 }
 
 /*
  * Register sysctl_drop() as the handler for the "drop" node under
  * _net_inet_tcp (id TCPCTL_DROP).  The node is struct-typed and
  * write-only (CTLFLAG_WR), and is flagged CTLFLAG_SKIP.
  */
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
     0, sysctl_drop, "", "Drop TCP connection");
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 	char *line, *cur;
 	size_t linelen;
 	struct ip *ip = (struct ip *)ip4hdr;
 #ifdef INET6
 	const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 
 	/*
 	 * Budget for a line of the form
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>":
 	 * the fixed template, the flag names, one spare byte and two
 	 * printable addresses of the widest configured family.
 	 */
 	linelen = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>");
 	linelen += sizeof(PRINT_TH_FLAGS) + 1;
 #ifdef INET6
 	linelen += 2 * INET6_ADDRSTRLEN;
 #else
 	linelen += 2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	/* Zero-filled, so the buffer starts out as an empty string. */
 	line = malloc(linelen, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (line == NULL)
 		return (NULL);
 
 	strcat(line, "TCP: [");
 	cur = line + strlen(line);
 
 	/*
 	 * Pick the address source: the connection info if given, otherwise
 	 * the raw IPv6 or IPv4 header together with the TCP header.  After
 	 * every piece, 'cur' is moved up to the terminating NUL.
 	 */
 	if (inc != NULL && inc->inc_isipv6 == 0) {
 		inet_ntoa_r(inc->inc_faddr, cur);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i to [", ntohs(inc->inc_fport));
 		cur += strlen(cur);
 		inet_ntoa_r(inc->inc_laddr, cur);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc != NULL) {
 		ip6_sprintf(cur, &inc->inc6_faddr);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i to [", ntohs(inc->inc_fport));
 		cur += strlen(cur);
 		ip6_sprintf(cur, &inc->inc6_laddr);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 != NULL && th != NULL) {
 		ip6_sprintf(cur, &ip6->ip6_src);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i to [", ntohs(th->th_sport));
 		cur += strlen(cur);
 		ip6_sprintf(cur, &ip6->ip6_dst);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 	} else if (ip != NULL && th != NULL) {
 		inet_ntoa_r(ip->ip_src, cur);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i to [", ntohs(th->th_sport));
 		cur += strlen(cur);
 		inet_ntoa_r(ip->ip_dst, cur);
 		cur += strlen(cur);
 		sprintf(cur, "]:%i", ntohs(th->th_dport));
 	} else {
 		/* Nothing to describe the endpoints with. */
 		free(line, M_TCPLOG);
 		return (NULL);
 	}
 	cur += strlen(cur);
 	if (th != NULL)
 		sprintf(cur, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
 	/* The last byte must still be the NUL left by M_ZERO. */
 	if (line[linelen - 1] != '\0')
 		panic("%s: string too long", __func__);
 	return (line);
 }
Index: head/sys/sys/mutex.h
===================================================================
--- head/sys/sys/mutex.h	(revision 171636)
+++ head/sys/sys/mutex.h	(revision 171637)
@@ -1,462 +1,461 @@
 /*-
  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $
  * $FreeBSD$
  */
 
 #ifndef _SYS_MUTEX_H_
 #define _SYS_MUTEX_H_
 
 #ifndef LOCORE
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 #ifdef _KERNEL
 #include <sys/pcpu.h>
 #include <sys/lock_profile.h>
 #include <machine/atomic.h>
 #include <machine/cpufunc.h>
 #endif	/* _KERNEL */
 #endif	/* !LOCORE */
 
 #include <machine/mutex.h>
 
 #ifdef _KERNEL
 
 /*
  * Mutex types and options passed to mtx_init().  MTX_QUIET and MTX_DUPOK
  * can also be passed in.
  */
 #define	MTX_DEF		0x00000000	/* DEFAULT (sleep) lock */ 
 #define MTX_SPIN	0x00000001	/* Spin lock (disables interrupts) */
 #define MTX_RECURSE	0x00000004	/* Option: lock allowed to recurse */
 #define	MTX_NOWITNESS	0x00000008	/* Don't do any witness checking. */
 #define MTX_NOPROFILE   0x00000020	/* Don't profile this lock */
 
 /*
  * Option flags passed to certain lock/unlock routines, through the use
  * of corresponding mtx_{lock,unlock}_flags() interface macros.
  */
 #define	MTX_QUIET	LOP_QUIET	/* Don't log a mutex event */
 #define	MTX_DUPOK	LOP_DUPOK	/* Don't log a duplicate acquire */
 
 /*
  * State bits kept in mutex->mtx_lock, for the DEFAULT lock type. None of this,
  * with the exception of MTX_UNOWNED, applies to spin locks.
  */
 #define	MTX_RECURSED	0x00000001	/* lock recursed (for MTX_DEF only) */
 #define	MTX_CONTESTED	0x00000002	/* lock contested (for MTX_DEF only) */
 #define MTX_UNOWNED	0x00000004	/* Cookie for free mutex */
 #define	MTX_FLAGMASK	(MTX_RECURSED | MTX_CONTESTED | MTX_UNOWNED)
 
 /*
  * Value stored in mutex->mtx_lock to denote a destroyed mutex.
  */
 #define	MTX_DESTROYED	(MTX_CONTESTED | MTX_UNOWNED)
 
 #endif	/* _KERNEL */
 
 #ifndef LOCORE
 
 /*
  * XXX: Friendly reminder to fix things in MP code that is presently being
  * XXX: worked on.
  */
 #define mp_fixme(string)
 
 #ifdef _KERNEL
 
 /*
  * Prototypes
  *
  * NOTE: Functions prepended with `_' (underscore) are exported to other parts
  *	 of the kernel via macros, thus allowing us to use the cpp LOCK_FILE
  *	 and LOCK_LINE. These functions should not be called directly by any
  *	 code using the API. Their macros cover their functionality.
  *
  * [See below for descriptions]
  *
  */
 void	mtx_init(struct mtx *m, const char *name, const char *type, int opts);
 void	mtx_destroy(struct mtx *m);
 void	mtx_sysinit(void *arg);
 void	mutex_init(void);
 void	_mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts,
 	    const char *file, int line);
 void	_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line);
 #ifdef SMP
 void	_mtx_lock_spin(struct mtx *m, uintptr_t tid, int opts,
 	    const char *file, int line);
 #endif
 void	_mtx_unlock_spin(struct mtx *m, int opts, const char *file, int line);
 int	_mtx_trylock(struct mtx *m, int opts, const char *file, int line);
 void	_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line);
 void	_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line);
 void	_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file,
 	     int line);
 void	_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file,
 	     int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	_mtx_assert(struct mtx *m, int what, const char *file, int line);
 #endif
 void	_thread_lock_flags(struct thread *, int, const char *, int);
 
 /*
  * Thread lock wrappers.  thread_lock() and thread_lock_flags() call
  * _thread_lock_flags() with the caller's __FILE__ and __LINE__ (and no
  * option flags in the thread_lock() case).  thread_unlock() releases the
  * spin mutex that (tdp)->td_lock points to.
  */
 #define	thread_lock(tdp)						\
     _thread_lock_flags((tdp), 0, __FILE__, __LINE__)
 #define	thread_lock_flags(tdp, opt)					\
     _thread_lock_flags((tdp), (opt), __FILE__, __LINE__)
 #define	thread_unlock(tdp)						\
        mtx_unlock_spin((tdp)->td_lock)
 
 /*
  * We define our machine-independent (unoptimized) mutex micro-operations
  * here, if they are not already defined in the machine-dependent mutex.h 
  */
 
 /* Try to obtain mtx_lock once. */
 #ifndef _obtain_lock
 #define _obtain_lock(mp, tid)						\
 	atomic_cmpset_acq_ptr(&(mp)->mtx_lock, MTX_UNOWNED, (tid))
 #endif
 
 /* Try to release mtx_lock if it is unrecursed and uncontested. */
 #ifndef _release_lock
 #define _release_lock(mp, tid)						\
 	atomic_cmpset_rel_ptr(&(mp)->mtx_lock, (tid), MTX_UNOWNED)
 #endif
 
 /* Release mtx_lock quickly, assuming we own it. */
 #ifndef _release_lock_quick
 #define _release_lock_quick(mp)						\
 	atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED)
 #endif
 
 /*
  * Obtain a sleep lock inline, or call the "hard" function if we can't get it
  * easy.
  */
 #ifndef _get_sleep_lock
 #define _get_sleep_lock(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 	if (!_obtain_lock((mp), _tid)) {				\
 		_mtx_lock_sleep((mp), _tid, (opts), (file), (line));	\
 	} else 								\
               	lock_profile_obtain_lock_success(&(mp)->lock_object, 0,	\
 		    0, (file), (line));					\
 } while (0)
 #endif
 
 /*
  * Obtain a spin lock inline, or call the "hard" function if we can't get it
  * easy. For spinlocks, we handle recursion inline (it turns out that function
  * calls can be significantly expensive on some architectures).
  * Since spin locks are not _too_ common, inlining this code is not too big 
  * a deal.
  */
 #ifndef _get_spin_lock
 #ifdef SMP
 #define _get_spin_lock(mp, tid, opts, file, line) do {	\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 	spinlock_enter();						\
 	if (!_obtain_lock((mp), _tid)) {				\
 		if ((mp)->mtx_lock == _tid)				\
 			(mp)->mtx_recurse++;				\
 		else {							\
 			_mtx_lock_spin((mp), _tid, (opts), (file), (line)); \
 		}							\
 	} else 								\
               	lock_profile_obtain_lock_success(&(mp)->lock_object, 0,	\
 		    0, (file), (line));					\
 } while (0)
 #else /* SMP */
 #define _get_spin_lock(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	spinlock_enter();						\
 	if ((mp)->mtx_lock == _tid)					\
 		(mp)->mtx_recurse++;					\
 	else {								\
 		KASSERT((mp)->mtx_lock == MTX_UNOWNED, ("corrupt spinlock")); \
 		(mp)->mtx_lock = _tid;				\
 	}								\
 } while (0)
 #endif /* SMP */
 #endif
 
 /*
  * Release a sleep lock inline, or call the "hard" function if we can't do it
  * easy.
  */
 #ifndef _rel_sleep_lock
 #define _rel_sleep_lock(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	if (!_release_lock((mp), _tid))					\
 		_mtx_unlock_sleep((mp), (opts), (file), (line));	\
 } while (0)
 #endif
 
 /*
  * For spinlocks, we can handle everything inline, as it's pretty simple and
  * a function call would be too expensive (at least on some architectures).
  * Since spin locks are not _too_ common, inlining this code is not too big 
  * a deal.
  *
  * Since we always perform a spinlock_enter() when attempting to acquire a
  * spin lock, we need to always perform a matching spinlock_exit() when
  * releasing a spin lock.  This includes the recursion cases.
  */
 #ifndef _rel_spin_lock
 #ifdef SMP
 #define _rel_spin_lock(mp) do {						\
 	if (mtx_recursed((mp)))						\
 		(mp)->mtx_recurse--;					\
 	else {								\
 		lock_profile_release_lock(&(mp)->lock_object);          \
 		_release_lock_quick((mp));				\
 	}                                                               \
 	spinlock_exit();				                \
 } while (0)
 #else /* SMP */
 #define _rel_spin_lock(mp) do {						\
 	if (mtx_recursed((mp)))						\
 		(mp)->mtx_recurse--;					\
 	else								\
 		(mp)->mtx_lock = MTX_UNOWNED;				\
 	spinlock_exit();						\
 } while (0)
 #endif /* SMP */
 #endif
 
 /*
  * Exported lock manipulation interface.
  *
  * mtx_lock(m) locks MTX_DEF mutex `m'
  *
  * mtx_lock_spin(m) locks MTX_SPIN mutex `m'
  *
  * mtx_unlock(m) unlocks MTX_DEF mutex `m'
  *
  * mtx_unlock_spin(m) unlocks MTX_SPIN mutex `m'
  *
  * mtx_lock_spin_flags(m, opts) and mtx_lock_flags(m, opts) locks mutex `m'
  *     and passes option flags `opts' to the "hard" function, if required.
  *     With these routines, it is possible to pass flags such as MTX_QUIET
  *     to the appropriate lock manipulation routines.
  *
  * mtx_trylock(m) attempts to acquire MTX_DEF mutex `m' but doesn't sleep if
  *     it cannot. Rather, it returns 0 on failure and non-zero on success.
  *     It does NOT handle recursion as we assume that if a caller is properly
  *     using this part of the interface, he will know that the lock in question
  *     is _not_ recursed.
  *
  * mtx_trylock_flags(m, opts) is used the same way as mtx_trylock() but accepts
  *     relevant option flags `opts.'
  *
  * mtx_initialized(m) returns non-zero if the lock `m' has been initialized.
  *
  * mtx_owned(m) returns non-zero if the current thread owns the lock `m'
  *
  * mtx_recursed(m) returns non-zero if the lock `m' is presently recursed.
  */ 
 #define mtx_lock(m)		mtx_lock_flags((m), 0)
 #define mtx_lock_spin(m)	mtx_lock_spin_flags((m), 0)
 #define mtx_trylock(m)		mtx_trylock_flags((m), 0)
 #define mtx_unlock(m)		mtx_unlock_flags((m), 0)
 #define mtx_unlock_spin(m)	mtx_unlock_spin_flags((m), 0)
 
 struct mtx_pool;
 
 struct mtx_pool *mtx_pool_create(const char *mtx_name, int pool_size, int opts);
 void mtx_pool_destroy(struct mtx_pool **poolp);
 struct mtx *mtx_pool_find(struct mtx_pool *pool, void *ptr);
 struct mtx *mtx_pool_alloc(struct mtx_pool *pool);
 /*
  * Pool lock/unlock shorthands: each one resolves the mutex for `ptr' with
  * mtx_pool_find() and then applies the matching sleep or spin mutex
  * operation.  mtx_pool_find() is evaluated on every use, for the lock and
  * again for the unlock.
  */
 #define mtx_pool_lock(pool, ptr)					\
 	mtx_lock(mtx_pool_find((pool), (ptr)))
 #define mtx_pool_lock_spin(pool, ptr)					\
 	mtx_lock_spin(mtx_pool_find((pool), (ptr)))
 #define mtx_pool_unlock(pool, ptr)					\
 	mtx_unlock(mtx_pool_find((pool), (ptr)))
 #define mtx_pool_unlock_spin(pool, ptr)					\
 	mtx_unlock_spin(mtx_pool_find((pool), (ptr)))
 
 /*
  * mtxpool_lockbuilder is a pool of sleep locks that is not witness
  * checked and should only be used for building higher level locks.
  *
  * mtxpool_sleep is a general purpose pool of sleep mutexes.
  */
 extern struct mtx_pool *mtxpool_lockbuilder;
 extern struct mtx_pool *mtxpool_sleep;
 
 #ifndef LOCK_DEBUG
 #error LOCK_DEBUG not defined, include <sys/lock.h> before <sys/mutex.h>
 #endif
 #if LOCK_DEBUG > 0 || defined(MUTEX_NOINLINE)
 #define	mtx_lock_flags(m, opts)						\
 	_mtx_lock_flags((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_unlock_flags(m, opts)					\
 	_mtx_unlock_flags((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_lock_spin_flags(m, opts)					\
 	_mtx_lock_spin_flags((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_unlock_spin_flags(m, opts)					\
 	_mtx_unlock_spin_flags((m), (opts), LOCK_FILE, LOCK_LINE)
 #else	/* LOCK_DEBUG == 0 && !MUTEX_NOINLINE */
 #define	mtx_lock_flags(m, opts)						\
 	_get_sleep_lock((m), curthread, (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_unlock_flags(m, opts)					\
 	_rel_sleep_lock((m), curthread, (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_lock_spin_flags(m, opts)					\
 	_get_spin_lock((m), curthread, (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_unlock_spin_flags(m, opts)					\
 	_rel_spin_lock((m))
 #endif	/* LOCK_DEBUG > 0 || MUTEX_NOINLINE */
 
 #define mtx_trylock_flags(m, opts)					\
 	_mtx_trylock((m), (opts), LOCK_FILE, LOCK_LINE)
 
 /* Call _sleep() on `chan', passing the mutex's embedded lock_object. */
 #define	mtx_sleep(chan, mtx, pri, wmesg, timo)				\
 	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (timo))
 
 /* Defers to lock_initalized() on the mutex's lock_object. */
 #define	mtx_initialized(m)	lock_initalized(&(m)->lock_object)
 
 /* Strip the MTX_FLAGMASK state bits from mtx_lock, compare to curthread. */
 #define mtx_owned(m)	(((m)->mtx_lock & ~MTX_FLAGMASK) == (uintptr_t)curthread)
 
 /* Non-zero while the recursion count is non-zero. */
 #define mtx_recursed(m)	((m)->mtx_recurse != 0)
 
 /* The name stored in the mutex's lock_object. */
 #define mtx_name(m)	((m)->lock_object.lo_name)
 
 /*
  * Global locks.
  */
 extern struct mtx Giant;
 extern struct mtx blocked_lock;
 
 /*
  * Giant lock manipulation and clean exit macros.
  * Used to replace return with an exit Giant and return.
  *
  * Note that DROP_GIANT*() needs to be paired with PICKUP_GIANT() 
  * The #ifndef is to allow lint-like tools to redefine DROP_GIANT.
  */
 #ifndef DROP_GIANT
 #define DROP_GIANT()							\
 do {									\
 	int _giantcnt = 0;						\
 	WITNESS_SAVE_DECL(Giant);					\
 									\
 	if (mtx_owned(&Giant)) {					\
 		WITNESS_SAVE(&Giant.lock_object, Giant);		\
 		for (_giantcnt = 0; mtx_owned(&Giant); _giantcnt++)	\
 			mtx_unlock(&Giant);				\
 	}
 
 #define PICKUP_GIANT()							\
 	PARTIAL_PICKUP_GIANT();						\
 } while (0)
 
 #define PARTIAL_PICKUP_GIANT()						\
 	mtx_assert(&Giant, MA_NOTOWNED);				\
 	if (_giantcnt > 0) {						\
 		while (_giantcnt--)					\
 			mtx_lock(&Giant);				\
 		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
 	}
 #endif
 
 /*
  * Evaluate `rval', release Giant, then return the saved value from the
  * enclosing function.  `rval' is evaluated while Giant is still held.
  */
 #define	UGAR(rval) do {							\
 	int _val = (rval);						\
 	mtx_unlock(&Giant);						\
 	return (_val);							\
 } while (0)
 
 /*
  * With the advent of fine-grained locking, the Giant lock is no longer
  * required around the network stack.  These macros exist for historical
  * reasons, allowing conditional acquisition of Giant based on a debugging
  * setting, and will be removed.
  */
 #define	NET_LOCK_GIANT() do {						\
 } while (0)
 #define	NET_UNLOCK_GIANT() do {						\
 } while (0)
 #define	NET_ASSERT_GIANT() do {						\
 } while (0)
-#define	NET_CALLOUT_MPSAFE	CALLOUT_MPSAFE
 
 /*
  * Argument bundle that MTX_SYSINIT() hands to mtx_sysinit(): the mutex,
  * plus a description string and option flags for it.
  */
 struct mtx_args {
 	struct mtx	*ma_mtx;
 	const char 	*ma_desc;
 	int		 ma_opts;
 };
 
 /*
  * Declare a static struct mtx_args named <name>_args and register two
  * hooks at SI_SUB_LOCK / SI_ORDER_MIDDLE: a SYSINIT that calls
  * mtx_sysinit() with that bundle, and a SYSUNINIT that calls
  * mtx_destroy() on the mutex.
  */
 #define	MTX_SYSINIT(name, mtx, desc, opts)				\
 	static struct mtx_args name##_args = {				\
 		(mtx),							\
 		(desc),							\
 		(opts)							\
 	};								\
 	SYSINIT(name##_mtx_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    mtx_sysinit, &name##_args);					\
 	SYSUNINIT(name##_mtx_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    mtx_destroy, (mtx))
 
 /*
  * The INVARIANTS-enabled mtx_assert() functionality.
  *
  * The constants need to be defined for INVARIANT_SUPPORT infrastructure
  * support as _mtx_assert() itself uses them and the latter implies that
  * _mtx_assert() must build.
  */
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define MA_OWNED	0x01
 #define MA_NOTOWNED	0x02
 #define MA_RECURSED	0x04
 #define MA_NOTRECURSED	0x08
 #endif
 
 #ifdef INVARIANTS
 #define	mtx_assert(m, what)						\
 	_mtx_assert((m), (what), __FILE__, __LINE__)
 
 #define GIANT_REQUIRED	mtx_assert(&Giant, MA_OWNED)
 
 #else	/* INVARIANTS */
 #define mtx_assert(m, what)
 #define GIANT_REQUIRED
 #endif	/* INVARIANTS */
 
 /*
  * Common lock type names.
  */
 #define	MTX_NETWORK_LOCK	"network driver"
 
 #endif	/* _KERNEL */
 #endif	/* !LOCORE */
 #endif	/* _SYS_MUTEX_H_ */