Index: projects/vnet/sys/contrib/ipfilter/netinet/mlfk_ipl.c =================================================================== --- projects/vnet/sys/contrib/ipfilter/netinet/mlfk_ipl.c (revision 294565) +++ projects/vnet/sys/contrib/ipfilter/netinet/mlfk_ipl.c (revision 294566) @@ -1,568 +1,568 @@ /* $FreeBSD$ */ /* * Copyright (C) 2012 by Darren Reed. * * $FreeBSD$ * See the IPFILTER.LICENCE file for details on licencing. */ #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 500000 # include #endif #include #include #include #include "netinet/ipl.h" #include "netinet/ip_compat.h" #include "netinet/ip_fil.h" #include "netinet/ip_state.h" #include "netinet/ip_nat.h" #include "netinet/ip_auth.h" #include "netinet/ip_frag.h" #include "netinet/ip_sync.h" extern ipf_main_softc_t ipfmain; #if __FreeBSD_version >= 502116 static struct cdev *ipf_devs[IPL_LOGSIZE]; #else static dev_t ipf_devs[IPL_LOGSIZE]; #endif static int sysctl_ipf_int ( SYSCTL_HANDLER_ARGS ); static int ipf_modload(void); static int ipf_modunload(void); static int ipf_fbsd_sysctl_create(ipf_main_softc_t*); static int ipf_fbsd_sysctl_destroy(ipf_main_softc_t*); #if (__FreeBSD_version >= 500024) # if (__FreeBSD_version >= 502116) static int ipfopen __P((struct cdev*, int, int, struct thread *)); static int ipfclose __P((struct cdev*, int, int, struct thread *)); # else static int ipfopen __P((dev_t, int, int, struct thread *)); static int ipfclose __P((dev_t, int, int, struct thread *)); # endif /* __FreeBSD_version >= 502116 */ #else static int ipfopen __P((dev_t, int, int, struct proc *)); static int ipfclose __P((dev_t, int, int, struct proc *)); #endif #if (__FreeBSD_version >= 502116) static int ipfread __P((struct cdev*, struct uio *, int)); static int ipfwrite __P((struct cdev*, struct uio *, int)); #else static int ipfread __P((dev_t, struct uio *, int)); static int ipfwrite __P((dev_t, struct uio *, int)); #endif /* __FreeBSD_version >= 502116 */ SYSCTL_DECL(_net_inet); #define SYSCTL_IPF(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ ptr, val, sysctl_ipf_int, "I", descr); #define SYSCTL_DYN_IPF(parent, nbr, name, access,ptr, val, descr) \ SYSCTL_ADD_OID(&ipf_clist, SYSCTL_STATIC_CHILDREN(parent), nbr, name, \ CTLFLAG_DYN|CTLTYPE_INT|access, ptr, val, sysctl_ipf_int, "I", descr) static struct sysctl_ctx_list ipf_clist; #define CTLFLAG_OFF 0x00800000 /* IPFilter must be disabled */ #define CTLFLAG_RWO (CTLFLAG_RW|CTLFLAG_OFF) SYSCTL_NODE(_net_inet, OID_AUTO, ipf, CTLFLAG_RW, 0, "IPF"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_flags, CTLFLAG_RW, &ipfmain.ipf_flags, 0, "IPF flags"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, ipf_pass, CTLFLAG_RW, &ipfmain.ipf_pass, 0, "default pass/block"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_active, CTLFLAG_RD, &ipfmain.ipf_active, 0, "IPF is active"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_tcpidletimeout, CTLFLAG_RWO, &ipfmain.ipf_tcpidletimeout, 0, "TCP idle timeout in seconds"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_tcphalfclosed, CTLFLAG_RWO, &ipfmain.ipf_tcphalfclosed, 0, "timeout for half closed TCP sessions"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_tcpclosewait, CTLFLAG_RWO, &ipfmain.ipf_tcpclosewait, 0, "timeout for TCP sessions in closewait status"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_tcplastack, CTLFLAG_RWO, &ipfmain.ipf_tcplastack, 0, "timeout for TCP sessions in last ack status"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_tcptimeout, CTLFLAG_RWO, &ipfmain.ipf_tcptimeout, 0, ""); 
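/*
 * For illustration only (not part of this commit): a further tunable in
 * this table would be declared with the same macro; the name and variable
 * below are hypothetical. Each SYSCTL_IPF() line expands to a SYSCTL_OID
 * with sysctl_ipf_int (declared above) as its handler, and CTLFLAG_RWO
 * makes that handler reject writes while ipfilter is running.
 */
static int ipf_example_value;
SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_example, CTLFLAG_RWO,
    &ipf_example_value, 0, "hypothetical example tunable");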
SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_tcpclosed, CTLFLAG_RWO, &ipfmain.ipf_tcpclosed, 0, ""); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_udptimeout, CTLFLAG_RWO, &ipfmain.ipf_udptimeout, 0, "UDP timeout"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_udpacktimeout, CTLFLAG_RWO, &ipfmain.ipf_udpacktimeout, 0, ""); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_icmptimeout, CTLFLAG_RWO, &ipfmain.ipf_icmptimeout, 0, "ICMP timeout"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_running, CTLFLAG_RD, &ipfmain.ipf_running, 0, "IPF is running"); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_chksrc, CTLFLAG_RW, &ipfmain.ipf_chksrc, 0, ""); SYSCTL_IPF(_net_inet_ipf, OID_AUTO, fr_minttl, CTLFLAG_RW, &ipfmain.ipf_minttl, 0, ""); #define CDEV_MAJOR 79 #include #if __FreeBSD_version >= 500043 # include static int ipfpoll(struct cdev *dev, int events, struct thread *td); static struct cdevsw ipf_cdevsw = { #if __FreeBSD_version >= 502103 .d_version = D_VERSION, .d_flags = 0, /* D_NEEDGIANT - Should be SMP safe */ #endif .d_open = ipfopen, .d_close = ipfclose, .d_read = ipfread, .d_write = ipfwrite, .d_ioctl = ipfioctl, .d_poll = ipfpoll, .d_name = "ipf", #if __FreeBSD_version < 600000 .d_maj = CDEV_MAJOR, #endif }; #else static int ipfpoll(dev_t dev, int events, struct proc *td); static struct cdevsw ipf_cdevsw = { /* open */ ipfopen, /* close */ ipfclose, /* read */ ipfread, /* write */ ipfwrite, /* ioctl */ ipfioctl, /* poll */ ipfpoll, /* mmap */ nommap, /* strategy */ nostrategy, /* name */ "ipf", /* maj */ CDEV_MAJOR, /* dump */ nodump, /* psize */ nopsize, /* flags */ 0, # if (__FreeBSD_version < 500043) /* bmaj */ -1, # endif # if (__FreeBSD_version >= 430000) /* kqfilter */ NULL # endif }; #endif static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME, IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME, IPLOOKUP_NAME, NULL }; static int ipfilter_modevent(module_t mod, int type, void *unused) { int error = 0; switch (type) { case MOD_LOAD : error = ipf_modload(); break; case MOD_UNLOAD : error = ipf_modunload(); break; default: error = EINVAL; break; } return error; } static int ipf_modload() { char *defpass, *c, *str; int i, j, error; if (ipf_load_all() != 0) return EIO; if (ipf_create_all(&ipfmain) == NULL) return EIO; if (ipf_fbsd_sysctl_create(&ipfmain) != 0) return EIO; error = ipfattach(&ipfmain); if (error) return error; for (i = 0; i < IPL_LOGSIZE; i++) ipf_devs[i] = NULL; for (i = 0; (str = ipf_devfiles[i]); i++) { c = NULL; for(j = strlen(str); j > 0; j--) if (str[j] == '/') { c = str + j + 1; break; } if (!c) c = str; ipf_devs[i] = make_dev(&ipf_cdevsw, i, 0, 0, 0600, "%s", c); } error = ipf_pfil_hook(); if (error != 0) return error; ipf_event_reg(); if (FR_ISPASS(ipfmain.ipf_pass)) defpass = "pass"; else if (FR_ISBLOCK(ipfmain.ipf_pass)) defpass = "block"; else defpass = "no-match -> block"; printf("%s initialized. 
Default = %s all, Logging = %s%s\n", ipfilter_version, defpass, #ifdef IPFILTER_LOG "enabled", #else "disabled", #endif #ifdef IPFILTER_COMPILED " (COMPILED)" #else "" #endif ); return 0; } static int ipf_modunload() { int error, i; if (ipfmain.ipf_refcnt) return EBUSY; if (ipf_fbsd_sysctl_destroy(&ipfmain) != 0) return EIO; error = ipf_pfil_unhook(); if (error != 0) return error; if (ipfmain.ipf_running >= 0) { error = ipfdetach(&ipfmain); if (error != 0) return error; ipf_fbsd_sysctl_destroy(&ipfmain); ipf_destroy_all(&ipfmain); ipf_unload_all(); } else error = 0; ipfmain.ipf_running = -2; for (i = 0; ipf_devfiles[i]; i++) { if (ipf_devs[i] != NULL) destroy_dev(ipf_devs[i]); } printf("%s unloaded\n", ipfilter_version); return error; } static moduledata_t ipfiltermod = { "ipfilter", ipfilter_modevent, 0 }; -DECLARE_MODULE(ipfilter, ipfiltermod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); +DECLARE_MODULE(ipfilter, ipfiltermod, SI_SUB_FW, SI_ORDER_ANY); #ifdef MODULE_VERSION MODULE_VERSION(ipfilter, 1); #endif #ifdef SYSCTL_IPF int sysctl_ipf_int ( SYSCTL_HANDLER_ARGS ) { int error = 0; if (arg1) error = SYSCTL_OUT(req, arg1, sizeof(int)); else error = SYSCTL_OUT(req, &arg2, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else { if ((oidp->oid_kind & CTLFLAG_OFF) && (ipfmain.ipf_running > 0)) error = EBUSY; else error = SYSCTL_IN(req, arg1, sizeof(int)); } return (error); } #endif static int #if __FreeBSD_version >= 500043 ipfpoll(struct cdev *dev, int events, struct thread *td) #else ipfpoll(dev_t dev, int events, struct proc *td) #endif { int unit = GET_MINOR(dev); int revents; if (unit < 0 || unit > IPL_LOGMAX) return 0; revents = 0; switch (unit) { case IPL_LOGIPF : case IPL_LOGNAT : case IPL_LOGSTATE : #ifdef IPFILTER_LOG if ((events & (POLLIN | POLLRDNORM)) && ipf_log_canread(&ipfmain, unit)) revents |= events & (POLLIN | POLLRDNORM); #endif break; case IPL_LOGAUTH : if ((events & (POLLIN | POLLRDNORM)) && ipf_auth_waiting(&ipfmain)) revents |= events & (POLLIN | POLLRDNORM); break; case IPL_LOGSYNC : if ((events & (POLLIN | POLLRDNORM)) && ipf_sync_canread(&ipfmain)) revents |= events & (POLLIN | POLLRDNORM); if ((events & (POLLOUT | POLLWRNORM)) && ipf_sync_canwrite(&ipfmain)) revents |= events & (POLLOUT | POLLWRNORM); break; case IPL_LOGSCAN : case IPL_LOGLOOKUP : default : break; } if ((revents == 0) && ((events & (POLLIN|POLLRDNORM)) != 0)) selrecord(td, &ipfmain.ipf_selwait[unit]); return revents; } /* * routines below for saving IP headers to buffer */ static int ipfopen(dev, flags #if ((BSD >= 199506) || (__FreeBSD_version >= 220000)) , devtype, p) int devtype; # if (__FreeBSD_version >= 500024) struct thread *p; # else struct proc *p; # endif /* __FreeBSD_version >= 500024 */ #else ) #endif #if (__FreeBSD_version >= 502116) struct cdev *dev; #else dev_t dev; #endif int flags; { int unit = GET_MINOR(dev); int error; if (IPL_LOGMAX < unit) error = ENXIO; else { switch (unit) { case IPL_LOGIPF : case IPL_LOGNAT : case IPL_LOGSTATE : case IPL_LOGAUTH : case IPL_LOGLOOKUP : case IPL_LOGSYNC : #ifdef IPFILTER_SCAN case IPL_LOGSCAN : #endif error = 0; break; default : error = ENXIO; break; } } return error; } static int ipfclose(dev, flags #if ((BSD >= 199506) || (__FreeBSD_version >= 220000)) , devtype, p) int devtype; # if (__FreeBSD_version >= 500024) struct thread *p; # else struct proc *p; # endif /* __FreeBSD_version >= 500024 */ #else ) #endif #if (__FreeBSD_version >= 502116) struct cdev *dev; #else dev_t dev; #endif int flags; { int unit = 
	GET_MINOR(dev);

	if (IPL_LOGMAX < unit)
		unit = ENXIO;
	else
		unit = 0;
	return unit;
}

/*
 * ipfread/ipflog
 * both of these must operate with at least splnet() lest they be
 * called during packet processing and cause an inconsistency to appear in
 * the filter lists.
 */
#if (BSD >= 199306)
static int ipfread(dev, uio, ioflag)
	int ioflag;
#else
static int ipfread(dev, uio)
#endif
#if (__FreeBSD_version >= 502116)
	struct cdev *dev;
#else
	dev_t dev;
#endif
	struct uio *uio;
{
	int unit = GET_MINOR(dev);

	if (unit < 0)
		return ENXIO;

	if (ipfmain.ipf_running < 1)
		return EIO;

	if (unit == IPL_LOGSYNC)
		return ipf_sync_read(&ipfmain, uio);

#ifdef IPFILTER_LOG
	return ipf_log_read(&ipfmain, unit, uio);
#else
	return ENXIO;
#endif
}

/*
 * ipfwrite
 * both of these must operate with at least splnet() lest they be
 * called during packet processing and cause an inconsistency to appear in
 * the filter lists.
 */
#if (BSD >= 199306)
static int ipfwrite(dev, uio, ioflag)
	int ioflag;
#else
static int ipfwrite(dev, uio)
#endif
#if (__FreeBSD_version >= 502116)
	struct cdev *dev;
#else
	dev_t dev;
#endif
	struct uio *uio;
{

	if (ipfmain.ipf_running < 1)
		return EIO;

	if (GET_MINOR(dev) == IPL_LOGSYNC)
		return ipf_sync_write(&ipfmain, uio);
	return ENXIO;
}

static int
ipf_fbsd_sysctl_create(main_softc)
	ipf_main_softc_t *main_softc;
{
	ipf_nat_softc_t *nat_softc;
	ipf_state_softc_t *state_softc;
	ipf_auth_softc_t *auth_softc;
	ipf_frag_softc_t *frag_softc;

	nat_softc = main_softc->ipf_nat_soft;
	state_softc = main_softc->ipf_state_soft;
	auth_softc = main_softc->ipf_auth_soft;
	frag_softc = main_softc->ipf_frag_soft;

	sysctl_ctx_init(&ipf_clist);

	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_defnatage", CTLFLAG_RWO,
	    &nat_softc->ipf_nat_defage, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_statesize", CTLFLAG_RWO,
	    &state_softc->ipf_state_size, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_statemax", CTLFLAG_RWO,
	    &state_softc->ipf_state_max, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "ipf_nattable_max", CTLFLAG_RWO,
	    &nat_softc->ipf_nat_table_max, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "ipf_nattable_sz", CTLFLAG_RWO,
	    &nat_softc->ipf_nat_table_sz, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "ipf_natrules_sz", CTLFLAG_RWO,
	    &nat_softc->ipf_nat_maprules_sz, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "ipf_rdrrules_sz", CTLFLAG_RWO,
	    &nat_softc->ipf_nat_rdrrules_sz, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "ipf_hostmap_sz", CTLFLAG_RWO,
	    &nat_softc->ipf_nat_hostmap_sz, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_authsize", CTLFLAG_RWO,
	    &auth_softc->ipf_auth_size, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_authused", CTLFLAG_RD,
	    &auth_softc->ipf_auth_used, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_defaultauthage", CTLFLAG_RW,
	    &auth_softc->ipf_auth_defaultage, 0, "");
	SYSCTL_DYN_IPF(_net_inet_ipf, OID_AUTO, "fr_ipfrttl", CTLFLAG_RW,
	    &frag_softc->ipfr_ttl, 0, "");
	return 0;
}

static int
ipf_fbsd_sysctl_destroy(main_softc)
	ipf_main_softc_t *main_softc;
{
	if (sysctl_ctx_free(&ipf_clist)) {
		printf("sysctl_ctx_free failed");
		return(ENOTEMPTY);
	}
	return 0;
}

Index: projects/vnet/sys/dev/usb/net/usb_ethernet.c
===================================================================
--- projects/vnet/sys/dev/usb/net/usb_ethernet.c	(revision 294565)
+++ projects/vnet/sys/dev/usb/net/usb_ethernet.c	(revision 294566)
@@ -1,645 +1,649 @@
/* $FreeBSD$ */
/*-
 * Copyright (c) 2009 Andrew Thompson (thompsa@FreeBSD.org)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are
permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_net, OID_AUTO, ue, CTLFLAG_RD, 0, "USB Ethernet parameters"); #define UE_LOCK(_ue) mtx_lock((_ue)->ue_mtx) #define UE_UNLOCK(_ue) mtx_unlock((_ue)->ue_mtx) #define UE_LOCK_ASSERT(_ue, t) mtx_assert((_ue)->ue_mtx, t) MODULE_DEPEND(uether, usb, 1, 1, 1); MODULE_DEPEND(uether, miibus, 1, 1, 1); static struct unrhdr *ueunit; static usb_proc_callback_t ue_attach_post_task; static usb_proc_callback_t ue_promisc_task; static usb_proc_callback_t ue_setmulti_task; static usb_proc_callback_t ue_ifmedia_task; static usb_proc_callback_t ue_tick_task; static usb_proc_callback_t ue_start_task; static usb_proc_callback_t ue_stop_task; static void ue_init(void *); static void ue_start(struct ifnet *); static int ue_ifmedia_upd(struct ifnet *); static void ue_watchdog(void *); /* * Return values: * 0: success * Else: device has been detached */ uint8_t uether_pause(struct usb_ether *ue, unsigned int _ticks) { if (usb_proc_is_gone(&ue->ue_tq)) { /* nothing to do */ return (1); } usb_pause_mtx(ue->ue_mtx, _ticks); return (0); } static void ue_queue_command(struct usb_ether *ue, usb_proc_callback_t *fn, struct usb_proc_msg *t0, struct usb_proc_msg *t1) { struct usb_ether_cfg_task *task; UE_LOCK_ASSERT(ue, MA_OWNED); if (usb_proc_is_gone(&ue->ue_tq)) { return; /* nothing to do */ } /* * NOTE: The task cannot get executed before we drop the * "sc_mtx" mutex. It is safe to update fields in the message * structure after that the message got queued. */ task = (struct usb_ether_cfg_task *) usb_proc_msignal(&ue->ue_tq, t0, t1); /* Setup callback and self pointers */ task->hdr.pm_callback = fn; task->ue = ue; /* * Start and stop must be synchronous! 
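	 * The start and stop commands are therefore waited for below via
	 * usb_proc_mwait(), which blocks until the queued message has been
	 * executed by the taskqueue thread.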
*/ if ((fn == ue_start_task) || (fn == ue_stop_task)) usb_proc_mwait(&ue->ue_tq, t0, t1); } struct ifnet * uether_getifp(struct usb_ether *ue) { return (ue->ue_ifp); } struct mii_data * uether_getmii(struct usb_ether *ue) { return (device_get_softc(ue->ue_miibus)); } void * uether_getsc(struct usb_ether *ue) { return (ue->ue_sc); } static int ue_sysctl_parent(SYSCTL_HANDLER_ARGS) { struct usb_ether *ue = arg1; const char *name; name = device_get_nameunit(ue->ue_dev); return SYSCTL_OUT_STR(req, name); } int uether_ifattach(struct usb_ether *ue) { int error; /* check some critical parameters */ if ((ue->ue_dev == NULL) || (ue->ue_udev == NULL) || (ue->ue_mtx == NULL) || (ue->ue_methods == NULL)) return (EINVAL); error = usb_proc_create(&ue->ue_tq, ue->ue_mtx, device_get_nameunit(ue->ue_dev), USB_PRI_MED); if (error) { device_printf(ue->ue_dev, "could not setup taskqueue\n"); goto error; } /* fork rest of the attach code */ UE_LOCK(ue); ue_queue_command(ue, ue_attach_post_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); UE_UNLOCK(ue); error: return (error); } static void ue_attach_post_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp; int error; char num[14]; /* sufficient for 32 bits */ /* first call driver's post attach routine */ ue->ue_methods->ue_attach_post(ue); UE_UNLOCK(ue); ue->ue_unit = alloc_unr(ueunit); usb_callout_init_mtx(&ue->ue_watchdog, ue->ue_mtx, 0); sysctl_ctx_init(&ue->ue_sysctl_ctx); error = 0; CURVNET_SET_QUIET(vnet0); ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(ue->ue_dev, "could not allocate ifnet\n"); goto fail; } ifp->if_softc = ue; if_initname(ifp, "ue", ue->ue_unit); if (ue->ue_methods->ue_attach_post_sub != NULL) { ue->ue_ifp = ifp; error = ue->ue_methods->ue_attach_post_sub(ue); } else { ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; if (ue->ue_methods->ue_ioctl != NULL) ifp->if_ioctl = ue->ue_methods->ue_ioctl; else ifp->if_ioctl = uether_ioctl; ifp->if_start = ue_start; ifp->if_init = ue_init; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); ue->ue_ifp = ifp; if (ue->ue_methods->ue_mii_upd != NULL && ue->ue_methods->ue_mii_sts != NULL) { /* device_xxx() depends on this */ mtx_lock(&Giant); error = mii_attach(ue->ue_dev, &ue->ue_miibus, ifp, ue_ifmedia_upd, ue->ue_methods->ue_mii_sts, BMSR_DEFCAPMASK, MII_PHY_ANY, MII_OFFSET_ANY, 0); mtx_unlock(&Giant); } } if (error) { device_printf(ue->ue_dev, "attaching PHYs failed\n"); goto fail; } if_printf(ifp, " on %s\n", device_get_nameunit(ue->ue_dev)); ether_ifattach(ifp, ue->ue_eaddr); /* Tell upper layer we support VLAN oversized frames. 
*/ if (ifp->if_capabilities & IFCAP_VLAN_MTU) ifp->if_hdrlen = sizeof(struct ether_vlan_header); CURVNET_RESTORE(); snprintf(num, sizeof(num), "%u", ue->ue_unit); ue->ue_sysctl_oid = SYSCTL_ADD_NODE(&ue->ue_sysctl_ctx, &SYSCTL_NODE_CHILDREN(_net, ue), OID_AUTO, num, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&ue->ue_sysctl_ctx, SYSCTL_CHILDREN(ue->ue_sysctl_oid), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, ue, 0, ue_sysctl_parent, "A", "parent device"); UE_LOCK(ue); return; fail: CURVNET_RESTORE(); free_unr(ueunit, ue->ue_unit); if (ue->ue_ifp != NULL) { if_free(ue->ue_ifp); ue->ue_ifp = NULL; } UE_LOCK(ue); return; } void uether_ifdetach(struct usb_ether *ue) { struct ifnet *ifp; /* wait for any post attach or other command to complete */ usb_proc_drain(&ue->ue_tq); /* read "ifnet" pointer after taskqueue drain */ ifp = ue->ue_ifp; if (ifp != NULL) { /* we are not running any more */ UE_LOCK(ue); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; UE_UNLOCK(ue); /* drain any callouts */ usb_callout_drain(&ue->ue_watchdog); /* detach miibus */ if (ue->ue_miibus != NULL) { mtx_lock(&Giant); /* device_xxx() depends on this */ device_delete_child(ue->ue_dev, ue->ue_miibus); mtx_unlock(&Giant); } /* detach ethernet */ ether_ifdetach(ifp); /* free interface instance */ if_free(ifp); /* free sysctl */ sysctl_ctx_free(&ue->ue_sysctl_ctx); /* free unit */ free_unr(ueunit, ue->ue_unit); } /* free taskqueue, if any */ usb_proc_free(&ue->ue_tq); } uint8_t uether_is_gone(struct usb_ether *ue) { return (usb_proc_is_gone(&ue->ue_tq)); } void uether_init(void *arg) { ue_init(arg); } static void ue_init(void *arg) { struct usb_ether *ue = arg; UE_LOCK(ue); ue_queue_command(ue, ue_start_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); UE_UNLOCK(ue); } static void ue_start_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp = ue->ue_ifp; UE_LOCK_ASSERT(ue, MA_OWNED); ue->ue_methods->ue_init(ue); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if (ue->ue_methods->ue_tick != NULL) usb_callout_reset(&ue->ue_watchdog, hz, ue_watchdog, ue); } static void ue_stop_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; UE_LOCK_ASSERT(ue, MA_OWNED); usb_callout_stop(&ue->ue_watchdog); ue->ue_methods->ue_stop(ue); } void uether_start(struct ifnet *ifp) { ue_start(ifp); } static void ue_start(struct ifnet *ifp) { struct usb_ether *ue = ifp->if_softc; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; UE_LOCK(ue); ue->ue_methods->ue_start(ue); UE_UNLOCK(ue); } static void ue_promisc_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; ue->ue_methods->ue_setpromisc(ue); } static void ue_setmulti_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; ue->ue_methods->ue_setmulti(ue); } int uether_ifmedia_upd(struct ifnet *ifp) { return (ue_ifmedia_upd(ifp)); } static int ue_ifmedia_upd(struct ifnet *ifp) { struct usb_ether *ue = ifp->if_softc; /* Defer to process context */ UE_LOCK(ue); ue_queue_command(ue, ue_ifmedia_task, &ue->ue_media_task[0].hdr, &ue->ue_media_task[1].hdr); UE_UNLOCK(ue); return (0); } static void ue_ifmedia_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = 
task->ue; struct ifnet *ifp = ue->ue_ifp; ue->ue_methods->ue_mii_upd(ifp); } static void ue_watchdog(void *arg) { struct usb_ether *ue = arg; struct ifnet *ifp = ue->ue_ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ue_queue_command(ue, ue_tick_task, &ue->ue_tick_task[0].hdr, &ue->ue_tick_task[1].hdr); usb_callout_reset(&ue->ue_watchdog, hz, ue_watchdog, ue); } static void ue_tick_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp = ue->ue_ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ue->ue_methods->ue_tick(ue); } int uether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct usb_ether *ue = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct mii_data *mii; int error = 0; switch (command) { case SIOCSIFFLAGS: UE_LOCK(ue); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) ue_queue_command(ue, ue_promisc_task, &ue->ue_promisc_task[0].hdr, &ue->ue_promisc_task[1].hdr); else ue_queue_command(ue, ue_start_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); } else { ue_queue_command(ue, ue_stop_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); } UE_UNLOCK(ue); break; case SIOCADDMULTI: case SIOCDELMULTI: UE_LOCK(ue); ue_queue_command(ue, ue_setmulti_task, &ue->ue_multi_task[0].hdr, &ue->ue_multi_task[1].hdr); UE_UNLOCK(ue); break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: if (ue->ue_miibus != NULL) { mii = device_get_softc(ue->ue_miibus); error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, command); } else error = ether_ioctl(ifp, command, data); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int uether_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ueunit = new_unrhdr(0, INT_MAX, NULL); break; case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t uether_mod = { "uether", uether_modevent, 0 }; struct mbuf * uether_newbuf(void) { struct mbuf *m_new; m_new = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m_new == NULL) return (NULL); m_new->m_len = m_new->m_pkthdr.len = MCLBYTES; m_adj(m_new, ETHER_ALIGN); return (m_new); } int uether_rxmbuf(struct usb_ether *ue, struct mbuf *m, unsigned int len) { struct ifnet *ifp = ue->ue_ifp; UE_LOCK_ASSERT(ue, MA_OWNED); /* finalize mbuf */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = len; /* enqueue for later when the lock can be released */ _IF_ENQUEUE(&ue->ue_rxq, m); return (0); } int uether_rxbuf(struct usb_ether *ue, struct usb_page_cache *pc, unsigned int offset, unsigned int len) { struct ifnet *ifp = ue->ue_ifp; struct mbuf *m; UE_LOCK_ASSERT(ue, MA_OWNED); if (len < ETHER_HDR_LEN || len > MCLBYTES - ETHER_ALIGN) return (1); m = uether_newbuf(); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (ENOMEM); } usbd_copy_out(pc, offset, mtod(m, uint8_t *), len); /* finalize mbuf */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = len; /* enqueue for later when the lock can be released */ _IF_ENQUEUE(&ue->ue_rxq, m); return (0); } void uether_rxflush(struct usb_ether *ue) { struct ifnet *ifp = ue->ue_ifp; struct mbuf *m; UE_LOCK_ASSERT(ue, MA_OWNED); for (;;) { _IF_DEQUEUE(&ue->ue_rxq, m); if (m == NULL) break; /* * The USB xfer has been resubmitted so its safe to unlock now. 
*/ UE_UNLOCK(ue); ifp->if_input(ifp, m); UE_LOCK(ue); } } -DECLARE_MODULE(uether, uether_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +/* + * USB net drivers are run by DRIVER_MODULE() thus SI_SUB_DRIVERS, + * SI_ORDER_MIDDLE. Run uether after that. + */ +DECLARE_MODULE(uether, uether_mod, SI_SUB_DRIVERS, SI_ORDER_ANY); MODULE_VERSION(uether, 1); Index: projects/vnet/sys/kern/kern_shutdown.c =================================================================== --- projects/vnet/sys/kern/kern_shutdown.c (revision 294565) +++ projects/vnet/sys/kern/kern_shutdown.c (revision 294566) @@ -1,901 +1,912 @@ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. 
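 *
 * As a brief illustration (a sketch only, not part of this change), the
 * stdarg idiom used by panic()/vpanic() below looks like this; the wrapper
 * name is hypothetical:
 */

#include <machine/stdarg.h>

static void
example_vlog(const char *fmt, ...)
{
	va_list ap;

	/* Forward the variable arguments to the va_list-taking printer. */
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
}

/*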
*/ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; #else static int trace_on_panic = 0; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, ""); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) { error = kern_reroot(); } else { mtx_lock(&Giant); kern_reboot(uap->opt); mtx_unlock(&Giant); } } return (error); } /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL) { /* Send a signal to init(8) and have it shutdown the world. 
*/ PROC_LOCK(initproc); if (howto & RB_POWEROFF) kern_psignal(initproc, SIGUSR2); else if (howto & RB_HALT) kern_psignal(initproc, SIGUSR1); else kern_psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } else { /* No init(8) running, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (dumper.dumper == NULL) return (ENXIO); savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(&dumper); } #endif if (coredump) error = dumpsys(&dumper); dumping--; return (error); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void kern_reboot(int howto) { static int once = 0; #if defined(SMP) /* * Bind us to CPU 0 so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); } #endif /* We're in the process of rebooting. */ rebooting = 1; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; bufshutdown(show_busybufs); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp, 0); return (ENOENT); } if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. 
*/ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: howto &= ~RB_HALT; break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. 
*/ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANTS) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (1) or just a warning (0)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RWTUN, &kassert_do_log, 0, "KASSERT triggers a panic (1) or just a warning (0)"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RWTUN, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; #endif bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif #ifdef KDB if (newpanic && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if (!(howto & RB_POWEROFF) || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. 
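 *
 * As a sketch (not part of this change; the process pointer and SYSINIT
 * names are hypothetical, but the pattern matches existing callers such
 * as the buf daemon):
 */

#include <sys/kthread.h>

static struct proc *example_kproc;	/* hypothetical kernel process */

static void
example_shutdown_reg(void *arg __unused)
{

	/* Have kproc_shutdown() suspend example_kproc before the sync. */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown,
	    example_kproc, SHUTDOWN_PRI_LAST);
}
SYSINIT(example_shutdown_reg, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY,
    example_shutdown_reg, NULL);

/*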
*/ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop...", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop...", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)]; SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD, dumpdevname, 0, "Device for kernel dumps"); /* Registration of dumpers */ int set_dumper(struct dumperinfo *di, const char *devname, struct thread *td) { size_t wantcopy; int error; error = priv_check(td, PRIV_SETDUMPER); if (error != 0) return (error); if (di == NULL) { bzero(&dumper, sizeof dumper); dumpdevname[0] = '\0'; return (0); } if (dumper.dumper != NULL) return (EBUSY); dumper = *di; wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname)); if (wantcopy >= sizeof(dumpdevname)) { printf("set_dumper: device name truncated from '%s' -> '%s'\n", devname, dumpdevname); } return (0); } /* Call dumper with bounds checking. */ int dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } return (di->dumper(di->priv, virtual, physical, offset, length)); } void mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver, uint64_t dumplen, uint32_t blksz) { bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); strlcpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); kdh->parity = kerneldump_parity(kdh); } + +#ifdef DDB +DB_SHOW_COMMAND(panic, db_show_panic) +{ + + if (panicstr == NULL) + db_printf("Not paniced\n"); + else + db_printf("panic: %s\n", panicstr); +} +#endif Index: projects/vnet/sys/kern/kern_timeout.c =================================================================== --- projects/vnet/sys/kern/kern_timeout.c (revision 294565) +++ projects/vnet/sys/kern/kern_timeout.c (revision 294566) @@ -1,1617 +1,1655 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" +#include "opt_ddb.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef DDB +#include +#include +#endif + #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. 
Units = 1/1000");
static int avg_mpcalls_dir;
SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
    0, "Average number of MP direct callouts made per callout_process call. "
    "Units = 1/1000");
#endif

static int ncallout;
SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
    "Number of entries in callwheel and size of timeout() preallocation");

#ifdef RSS
static int pin_default_swi = 1;
static int pin_pcpu_swi = 1;
#else
static int pin_default_swi = 0;
static int pin_pcpu_swi = 0;
#endif

SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
    0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
    0, "Pin the per-CPU swis (except PCPU 0, which is also the default)");

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */
u_int callwheelsize, callwheelmask;

/*
 * The callout cpu exec entities represent information necessary for
 * describing the state of callouts currently running on the CPU and
 * for migrating callouts to a new callout cpu. In particular, the first
 * entry of the array cc_exec_entity holds information for a callout
 * running in SWI thread context, while the second one holds information
 * for a callout running directly from hardware interrupt context.
 * The cached information is very important for deferring migration when
 * the migrating callout is already running.
 */
struct cc_exec {
	struct callout	*cc_curr;
	void		(*cc_drain)(void *);
#ifdef SMP
	void		(*ce_migration_func)(void *);
	void		*ce_migration_arg;
	int		ce_migration_cpu;
	sbintime_t	ce_migration_time;
	sbintime_t	ce_migration_prec;
#endif
	bool		cc_cancel;
	bool		cc_waiting;
};

/*
 * There is one struct callout_cpu per cpu, holding all relevant
 * state for the callout processing thread on the individual CPU.
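 *
 * (An aside, not part of this change: this per-CPU state backs the public
 * callout(9) API. A minimal consumer sketch with hypothetical names,
 * relying on the headers this file already includes:)
 */

static struct mtx example_mtx;
static struct callout example_co;

static void
example_timer(void *arg)
{

	/* Runs with example_mtx held, per callout_init_mtx() below. */
	callout_reset(&example_co, hz, example_timer, arg);	/* rearm */
}

static void
example_timer_start(void)
{

	mtx_init(&example_mtx, "example", NULL, MTX_DEF);
	callout_init_mtx(&example_co, &example_mtx, 0);
	mtx_lock(&example_mtx);
	/* Fire example_timer() in roughly one second. */
	callout_reset(&example_co, hz, example_timer, NULL);
	mtx_unlock(&example_mtx);
}

/*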
*/ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; u_int cc_inited; char cc_ktr_event_name[20]; }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on cpu0 during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. 
*/ memset(CC_CPU(0), 0, sizeof(cc_cpu)); ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate the callout wheel size; it should be the next power of two * higher than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swis or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only cpu0 handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_iflags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with the * outgoing callout cpu locked. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may try to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. */ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only cpu0 handles timeout(9).
*/ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In that case, we need to scan the entire wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if the present time is within its allowed window. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (tmp->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_iflags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from the distant future. */ if (tmp->c_time >= max) goto next; /* * The event's minimal time is greater than the present * maximal time, so it cannot be aggregated. */ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we have looked past the present time and found * some event we cannot execute yet. * Stop if we have looked far enough into the future.
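* (The signed (int)(firstb - lastb) comparison below intentionally relies on modular arithmetic of the unsigned bucket counters, so the loop also terminates correctly when the bucket numbers wrap around.)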
*/ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? 
LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; if (c->c_iflags & CALLOUT_LOCAL_ALLOC) c->c_iflags = CALLOUT_LOCAL_ALLOC; else c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { void (*drain)(void *); drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration, just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be asserted here that the callout is not * destroyed, but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration, just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be asserted here that the callout is not destroyed, * but that is not easy. * * As the first thing, handle deferred callout stops.
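* A deferred stop shows up here as CALLOUT_DFRMIGRATION having been cleared by _callout_stop_safe() while the handler was running; in that case the pending migration is simply dropped below.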
*/ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_iflags because * if it was not local, then it's not safe to deref the * callout pointer. */ KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_iflags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_iflags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. 
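* A minimal legacy-usage sketch (hypothetical caller, not from this file): 'th = timeout(my_func, my_arg, hz);' arranges for my_func(my_arg) to run after about one second, and 'untimeout(my_func, my_arg, th);' cancels it; as the code below shows, both the function and the argument must match the handle for the stop to be attempted.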
*/ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, pr; struct callout_cpu *cc; int cancelled, direct; int ignore_cpu = 0; cancelled = 0; if (cpu == -1) { ignore_cpu = 1; } else if ((cpu >= MAXCPU) || ((CC_CPU(cpu))->cc_inited == 0)) { /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } if (flags & C_ABSOLUTE) { to_sbt = sbt; } else { if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) sbt = tick_sbt; if ((flags & C_HARDCLOCK) || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); if (pr > precision) precision = pr; } /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't set it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); /* * Don't allow migration of pre-allocated callouts, lest they * become unbalanced; also handle the case where the user does * not care (cpu == -1). */ if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) || ignore_cpu) { cpu = c->c_cpu; } if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock, then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct)) { /* * Someone has called callout_drain() to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below).
Note we do *not* change * the prev_cpu even though the previous target may * be different. */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate, try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active and pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. * At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call to callout_cc_add() which will add it there. * (see above in softclock_call_cc()). */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int safe, void (*drain)(void *)) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int not_on_a_list; if (safe) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment.
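* (In the block below, use_lock means the callout has an associated lock which the caller is assumed to hold, so a plain callout_stop() may rely on it to synchronize with the handler; for Giant we only trust it if mtx_owned() confirms we actually hold it.)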
*/ if (!safe && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (it's not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout isn't pending, it's not on the queue, so * don't attempt to remove it from the queue. We can try to * stop it by other means however. */ if (!(c->c_iflags & CALLOUT_PENDING)) { /* * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. * It probably has already been run (if locking * is properly done). You could get here if the caller * calls stop twice in a row, for example. The second * call would fall here without CALLOUT_ACTIVE set. */ c->c_flags &= ~CALLOUT_ACTIVE; if (cc_exec_curr(cc, direct) != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); return (-1); } if (safe) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * actually emulates a msleep_spin() call. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is not yet certain when it * will be completed, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold.
Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop(), *not* callout_drain() or * callout_async_drain(). */ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one that's running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will clear cc_curr while * it's still running. This will prevent a * reschedule of the callout when the * execution completes. */ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return (0); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain still locked")); return (0); } if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } callout_cc_del(c, cc); CC_UNLOCK(cc); return (1); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break.
I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintenance the timer was for as close * as possible to the originally intended time. Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independent of whether the suspend was 2 hours or * 2 days. Your mileage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { register struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = (time_change->tv_sec * 1000000 + time_change->tv_usec + (tick - 1)) / tick + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + (time_change->tv_usec + (tick - 1)) / tick + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ?
(1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); + +#ifdef DDB + +static void +_show_callout(struct callout *c) +{ + + db_printf("callout %p\n", c); +#define C_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, c->e); + db_printf(" &c_links = %p\n", &(c->c_links)); + C_DB_PRINTF("%" PRId64, c_time); + C_DB_PRINTF("%" PRId64, c_precision); + C_DB_PRINTF("%p", c_arg); + C_DB_PRINTF("%p", c_func); + C_DB_PRINTF("%p", c_lock); + C_DB_PRINTF("%#x", c_flags); + C_DB_PRINTF("%#x", c_iflags); + C_DB_PRINTF("%d", c_cpu); +#undef C_DB_PRINTF +} + +DB_SHOW_COMMAND(callout, db_show_callout) +{ + + if (!have_addr) { + db_printf("usage: show callout \n"); + return; + } + + _show_callout((struct callout *)addr); +} +#endif /* DDB */ Index: projects/vnet/sys/kern/uipc_domain.c =================================================================== --- projects/vnet/sys/kern/uipc_domain.c (revision 294565) +++ projects/vnet/sys/kern/uipc_domain.c (revision 294566) @@ -1,524 +1,528 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization * * Note: domain initialization takes place on a per domain basis * as a result of traversing a SYSINIT linker set. Most likely, * each domain would want to call DOMAIN_SET(9) itself, which * would cause the domain to be added just after domaininit() * is called during startup. * * See DOMAIN_SET(9) for details on its use. */ static void domaininit(void *); SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL); static void domainfinalize(void *); SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL); static struct callout pffast_callout; static struct callout pfslow_callout; static void pffasttimo(void *); static void pfslowtimo(void *); struct domain *domains; /* registered protocol domains */ int domain_init_status = 0; static struct mtx dom_mtx; /* domain list lock */ MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF); /* * Dummy protocol specific user requests function pointer array. * All functions return EOPNOTSUPP. */ struct pr_usrreqs nousrreqs = { .pru_accept = pru_accept_notsupp, .pru_attach = pru_attach_notsupp, .pru_bind = pru_bind_notsupp, .pru_connect = pru_connect_notsupp, .pru_connect2 = pru_connect2_notsupp, .pru_control = pru_control_notsupp, .pru_disconnect = pru_disconnect_notsupp, .pru_listen = pru_listen_notsupp, .pru_peeraddr = pru_peeraddr_notsupp, .pru_rcvd = pru_rcvd_notsupp, .pru_rcvoob = pru_rcvoob_notsupp, .pru_send = pru_send_notsupp, .pru_sense = pru_sense_null, .pru_shutdown = pru_shutdown_notsupp, .pru_sockaddr = pru_sockaddr_notsupp, .pru_sosend = pru_sosend_notsupp, .pru_soreceive = pru_soreceive_notsupp, .pru_sopoll = pru_sopoll_notsupp, }; static void protosw_init(struct protosw *pr) { struct pr_usrreqs *pu; pu = pr->pr_usrreqs; KASSERT(pu != NULL, ("protosw_init: %ssw[%d] has no usrreqs!", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); /* * Protocol switch methods fall into three categories: mandatory, * mandatory but protosw_init() provides a default, and optional. * * For true protocols (i.e., pru_attach != NULL), KASSERT truly * mandatory methods with no defaults, and initialize defaults for * other mandatory methods if the protocol hasn't defined an * implementation (NULL function pointer). 
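* For example, a protocol that leaves pru_accept NULL gets pru_accept_notsupp installed by the DEFAULT() lines below, so every entry point of the switch is always safe to call.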
*/ #if 0 if (pu->pru_attach != NULL) { KASSERT(pu->pru_abort != NULL, ("protosw_init: %ssw[%d] pru_abort NULL", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); KASSERT(pu->pru_send != NULL, ("protosw_init: %ssw[%d] pru_send NULL", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); } #endif #define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar) DEFAULT(pu->pru_accept, pru_accept_notsupp); DEFAULT(pu->pru_bind, pru_bind_notsupp); DEFAULT(pu->pru_bindat, pru_bindat_notsupp); DEFAULT(pu->pru_connect, pru_connect_notsupp); DEFAULT(pu->pru_connect2, pru_connect2_notsupp); DEFAULT(pu->pru_connectat, pru_connectat_notsupp); DEFAULT(pu->pru_control, pru_control_notsupp); DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp); DEFAULT(pu->pru_listen, pru_listen_notsupp); DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp); DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp); DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp); DEFAULT(pu->pru_sense, pru_sense_null); DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp); DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp); DEFAULT(pu->pru_sosend, sosend_generic); DEFAULT(pu->pru_soreceive, soreceive_generic); DEFAULT(pu->pru_sopoll, sopoll_generic); DEFAULT(pu->pru_ready, pru_ready_notsupp); #undef DEFAULT if (pr->pr_init) (*pr->pr_init)(); } /* * Initialize a protocol domain and all of its protocol switch entries. * Note: you can't unload it again because a socket may be using it. * XXX can't fail at this time. */ void domain_init(void *arg) { struct domain *dp = arg; struct protosw *pr; if (dp->dom_init) (*dp->dom_init)(); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) protosw_init(pr); /* * Update global information about maximums. */ max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); } #ifdef VIMAGE void vnet_domain_init(void *arg) { /* Virtualized case is no different -- call init functions. */ domain_init(arg); } void vnet_domain_uninit(void *arg) { struct domain *dp = arg; struct protosw *pr; for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) - if (pr->pr_destroy) + if (pr->pr_destroy) { +#ifdef INVARIANTS + printf("%s: pr %p called pr_destroy\n", __func__, pr); +#endif (*pr->pr_destroy)(); + } if (dp->dom_destroy) (*dp->dom_destroy)(); } #endif /* * Add a new protocol domain to the list of supported domains * Note: you can't unload it again because a socket may be using it. * XXX can't fail at this time.
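* (New domains are normally registered with the DOMAIN_SET(9) macro, which arranges for domain_add() to run from a SYSINIT during startup, as noted at the top of this file.)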
*/ void domain_add(void *data) { struct domain *dp; dp = (struct domain *)data; mtx_lock(&dom_mtx); dp->dom_next = domains; domains = dp; KASSERT(domain_init_status >= 1, ("attempt to domain_add(%s) before domaininit()", dp->dom_name)); #ifndef INVARIANTS if (domain_init_status < 1) printf("WARNING: attempt to domain_add(%s) before " "domaininit()\n", dp->dom_name); #endif #ifdef notyet KASSERT(domain_init_status < 2, ("attempt to domain_add(%s) after domainfinalize()", dp->dom_name)); #else if (domain_init_status >= 2) printf("WARNING: attempt to domain_add(%s) after " "domainfinalize()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); } /* ARGSUSED*/ static void domaininit(void *dummy) { if (max_linkhdr < 16) /* XXX */ max_linkhdr = 16; callout_init(&pffast_callout, 1); callout_init(&pfslow_callout, 1); mtx_lock(&dom_mtx); KASSERT(domain_init_status == 0, ("domaininit called too late!")); domain_init_status = 1; mtx_unlock(&dom_mtx); } /* ARGSUSED*/ static void domainfinalize(void *dummy) { mtx_lock(&dom_mtx); KASSERT(domain_init_status == 1, ("domainfinalize called too late!")); domain_init_status = 2; mtx_unlock(&dom_mtx); callout_reset(&pffast_callout, 1, pffasttimo, NULL); callout_reset(&pfslow_callout, 1, pfslowtimo, NULL); } struct domain * pffinddomain(int family) { struct domain *dp; for (dp = domains; dp != NULL; dp = dp->dom_next) if (dp->dom_family == family) return (dp); return (NULL); } struct protosw * pffindtype(int family, int type) { struct domain *dp; struct protosw *pr; dp = pffinddomain(family); if (dp == NULL) return (NULL); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_type && pr->pr_type == type) return (pr); return (NULL); } struct protosw * pffindproto(int family, int protocol, int type) { struct domain *dp; struct protosw *pr; struct protosw *maybe; maybe = NULL; if (family == 0) return (NULL); dp = pffinddomain(family); if (dp == NULL) return (NULL); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) return (pr); if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && pr->pr_protocol == 0 && maybe == NULL) maybe = pr; } return (maybe); } /* * The caller must make sure that the new protocol is fully set up and ready to * accept requests before it is registered. */ int pf_proto_register(int family, struct protosw *npr) { VNET_ITERATOR_DECL(vnet_iter); struct domain *dp; struct protosw *pr, *fpr; /* Sanity checks. */ if (family == 0) return (EPFNOSUPPORT); if (npr->pr_type == 0) return (EPROTOTYPE); if (npr->pr_protocol == 0) return (EPROTONOSUPPORT); if (npr->pr_usrreqs == NULL) return (ENXIO); /* Try to find the specified domain based on the family. */ dp = pffinddomain(family); if (dp == NULL) return (EPFNOSUPPORT); /* Initialize backpointer to struct domain. */ npr->pr_domain = dp; fpr = NULL; /* * Protect us against races when two protocol registrations for * the same protocol happen at the same time. */ mtx_lock(&dom_mtx); /* The new protocol must not yet exist. */ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_type == npr->pr_type) && (pr->pr_protocol == npr->pr_protocol)) { mtx_unlock(&dom_mtx); return (EEXIST); /* XXX: Check only protocol? */ } /* While here, remember the first free spacer. */ if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER)) fpr = pr; } /* If no free spacer is found we can't add the new protocol. */ if (fpr == NULL) { mtx_unlock(&dom_mtx); return (ENOMEM); } /* Copy the new struct protosw over the spacer. 
*/ bcopy(npr, fpr, sizeof(*fpr)); /* Job is done, no more protection required. */ mtx_unlock(&dom_mtx); /* Initialize and activate the protocol. */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); protosw_init(fpr); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * The caller must make sure the protocol and its functions correctly shut down * all sockets and release all locks and memory references. */ int pf_proto_unregister(int family, int protocol, int type) { struct domain *dp; struct protosw *pr, *dpr; /* Sanity checks. */ if (family == 0) return (EPFNOSUPPORT); if (protocol == 0) return (EPROTONOSUPPORT); if (type == 0) return (EPROTOTYPE); /* Try to find the specified domain based on the family. */ dp = pffinddomain(family); if (dp == NULL) return (EPFNOSUPPORT); dpr = NULL; /* Lock out everyone else while we are manipulating the protosw. */ mtx_lock(&dom_mtx); /* The protocol must exist, and only once. */ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) { if (dpr != NULL) { mtx_unlock(&dom_mtx); return (EMLINK); /* Should not happen! */ } else dpr = pr; } } /* Protocol does not exist. */ if (dpr == NULL) { mtx_unlock(&dom_mtx); return (EPROTONOSUPPORT); } /* De-orbit the protocol and make the slot available again. */ dpr->pr_type = 0; dpr->pr_domain = dp; dpr->pr_protocol = PROTO_SPACER; dpr->pr_flags = 0; dpr->pr_input = NULL; dpr->pr_output = NULL; dpr->pr_ctlinput = NULL; dpr->pr_ctloutput = NULL; dpr->pr_init = NULL; dpr->pr_fasttimo = NULL; dpr->pr_slowtimo = NULL; dpr->pr_drain = NULL; dpr->pr_usrreqs = &nousrreqs; /* Job is done, no more protection required. */ mtx_unlock(&dom_mtx); return (0); } void pfctlinput(int cmd, struct sockaddr *sa) { struct domain *dp; struct protosw *pr; for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_ctlinput) (*pr->pr_ctlinput)(cmd, sa, (void *)0); } void pfctlinput2(int cmd, struct sockaddr *sa, void *ctlparam) { struct domain *dp; struct protosw *pr; if (!sa) return; for (dp = domains; dp; dp = dp->dom_next) { /* * The check must be made by xx_ctlinput() anyway, to * make sure the data item pointed to by ctlparam is used * in the correct way. The following check is made just for safety. */ if (dp->dom_family != sa->sa_family) continue; for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_ctlinput) (*pr->pr_ctlinput)(cmd, sa, ctlparam); } } static void pfslowtimo(void *arg) { struct domain *dp; struct protosw *pr; for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_slowtimo) (*pr->pr_slowtimo)(); callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL); } static void pffasttimo(void *arg) { struct domain *dp; struct protosw *pr; for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_fasttimo) (*pr->pr_fasttimo)(); callout_reset(&pffast_callout, hz/5, pffasttimo, NULL); } Index: projects/vnet/sys/net/if.c =================================================================== --- projects/vnet/sys/net/if.c (revision 294565) +++ projects/vnet/sys/net/if.c (revision 294566) @@ -1,3955 +1,3980 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include #ifdef COMPAT_FREEBSD32 #include #include #endif SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. 
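* These pointers are populated when the carp(4) module loads and are expected to be NULL otherwise, so call sites must check them for NULL before invoking.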
*/ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef INET6 /* * XXX: declare here to avoid including many inet6-related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stabilize the list, but both * must be acquired writable to modify the list. This model allows us both * to stabilize the interface list during interrupt thread processing and to * stabilize it over long-running ioctls, without introducing priority * inversions and deadlocks.
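* A simplified sketch of how the macros map onto these two locks (assuming the if_var.h definitions of this era; shown for illustration only): IFNET_RLOCK() takes sx_slock(&ifnet_sxlock) for long, sleepable read sections; IFNET_RLOCK_NOSLEEP() takes rw_rlock(&ifnet_rwlock) for interrupt-thread readers; and IFNET_WLOCK() takes sx_xlock() followed by rw_wlock(), so a writer excludes both classes of readers.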
*/ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so we will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return the newly allocated index. */ static u_short ifindex_alloc(void) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); + +/* + * XXX-BZ VNET; probably along with dom stuff.
+ * This is very wrong but MC currently implies that interfaces are + * gone before we can free it. This needs to be fixed differently + * and this needs to be moved back to SI_SUB_INIT_IF. + */ +static void +vnet_if_return(const void *unused __unused) +{ + struct ifnet *ifp, *nifp; + + /* Return all inherited interfaces to their parent vnets. */ + TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + if (ifp->if_home_vnet != ifp->if_vnet) + if_vmove(ifp, ifp->if_home_vnet); + } +} +VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, + vnet_if_return, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); idx = ifindex_alloc(); ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */
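/*
 * [Editorial sketch, not part of this change] Typical use of the
 * reference interfaces below: take a reference while the ifnet is known
 * to be alive, then keep using it even if the driver calls if_free()
 * in the meantime. A hypothetical consumer:
 */
static void
example_hold_and_use(u_short idx)
{
	struct ifnet *ifp;

	ifp = ifnet_byindex_ref(idx);	/* NULL if absent or dying */
	if (ifp == NULL)
		return;
	/* Safe to dereference ifp here; it cannot be freed under us. */
	if_rele(ifp);			/* last release frees the ifnet */
}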
void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to ensure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */
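/*
 * [Editorial sketch, not part of this change] How a driver stacked on
 * several ports (lagg-style) might combine the two helpers above: fold
 * every member's limits into one ifnet_hw_tsomax, then apply the result
 * to the virtual interface. All names here are hypothetical.
 */
static void
example_tso_recalc(struct ifnet *sc_ifp, struct ifnet **ports, int nports)
{
	struct ifnet_hw_tsomax hw;
	int i;

	memset(&hw, 0, sizeof(hw));		/* "no limit yet" */
	for (i = 0; i < nports; i++)
		if_hw_tsomax_common(ports[i], &hw);
	if (if_hw_tsomax_update(sc_ifp, &hw) != 0) {
		/* Limits changed; re-announce capabilities if needed. */
	}
}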
int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possible name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults need to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface.
*/ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ if (IF_AFDATA_TRYLOCK(ifp) == 0) return; if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; + /* XXX IF_ADDR_R/WLOCK */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ + IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. 
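 */

/*
 * [Editorial sketch, not part of this change] One common answer to the
 * ordering question above: subsystems subscribe to the departure event
 * that if_detach_internal() fires and drop cached ifnet pointers there.
 * A hypothetical subscriber:
 */
static void
example_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	/* Invalidate any cached references to ifp before it goes away. */
}

static eventhandler_tag example_departure_tag;

static void
example_register_departure(void)
{
	example_departure_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    example_ifnet_departure, NULL, EVENTHANDLER_PRI_ANY);
}

/*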
*/ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } #ifdef VIMAGE if (found) curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* Check if this is a cloned interface or not. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); + IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } rt_flushifroutes(ifp); if_delgroups(ifp); /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; int rc; /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. 
*/ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); ifp->if_index = ifindex_alloc(); ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exist in the destination prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); CURVNET_RESTORE(); if (difp != NULL) { prison_free(pr); return (EEXIST); } /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to userland.
*/ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname);
IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. 
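 */

/*
 * [Editorial sketch, not part of this change] Using the counter KPIs
 * above: a driver bumps shared counters with if_inc_counter(), and
 * readers fetch them through the if_get_counter method, exactly as
 * if_data_copy() below does. A hypothetical helper:
 */
static uint64_t
example_note_input_error(struct ifnet *ifp)
{
	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
	return (ifp->if_get_counter(ifp, IFCOUNTER_IERRORS));
}

/*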
*/ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. 
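 */

/*
 * [Editorial sketch, not part of this change] The ifaddr lifecycle
 * implemented below: allocate with trailing storage for the address
 * data, share with ifa_ref(), release with ifa_free(); the last release
 * frees the counters and the structure. A hypothetical helper:
 */
static struct ifaddr *
example_ifa_create(size_t addrspace)
{
	struct ifaddr *ifa;

	/* With M_WAITOK the allocation cannot fail; refcount starts at 1. */
	ifa = ifa_alloc(sizeof(*ifa) + addrspace, M_WAITOK);
	return (ifa);
}

/*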
*/ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", __func__, otype, if_name(ifp), error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. 
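 */

/*
 * [Editorial sketch, not part of this change] ifa_ifwithaddr() below
 * returns a referenced ifaddr that the caller must ifa_free(); the
 * _check variant is for callers that only need an existence test.
 */
static int
example_is_local_addr(const struct sockaddr *sa)
{
	struct ifaddr *ifa;

	ifa = ifa_ifwithaddr(sa);
	if (ifa == NULL)
		return (0);
	ifa_free(ifa);		/* drop the reference we were handed */
	return (1);
}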
/*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan through each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. */
IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit disagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of the new prefix is better than that of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Look up an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually.
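 */

/*
 * [Editorial sketch, not part of this change] The byte-wise masked
 * comparison used by ifa_ifwithnet() above, restated as a stand-alone
 * predicate: two addresses fall in the same prefix iff they agree on
 * every bit the netmask keeps.
 */
static int
example_prefix_match(const struct sockaddr *a, const struct sockaddr *b,
    const struct sockaddr *mask)
{
	const char *cp = a->sa_data, *cp2 = b->sa_data;
	const char *cp3 = mask->sa_data;
	const char *cplim = (const char *)mask + mask->sa_len;

	while (cp3 < cplim)
		if ((*cp++ ^ *cp2++) & *cp3++)
			return (0);
	return (1);
}

/*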
*/ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. 
*/ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? 
*/ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr->ifr_buffer.length < descrlen) ifr->ifr_buffer.buffer = NULL; else error = copyout(ifp->if_description, ifr->ifr_buffer.buffer, descrlen); ifr->ifr_buffer.length = descrlen; } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr->ifr_buffer.length > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr->ifr_buffer.length == 0) descrbuf = NULL; else { descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr->ifr_buffer.buffer, descrbuf, ifr->ifr_buffer.length - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { if_up(ifp); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. 
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. 
*/ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); CURVNET_RESTORE(); return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL); CURVNET_RESTORE(); return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); CURVNET_RESTORE(); return (error); case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); CURVNET_RESTORE(); return (error); case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); CURVNET_RESTORE(); return (error); #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { CURVNET_RESTORE(); return (ENXIO); } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); CURVNET_RESTORE(); return (EOPNOTSUPP); } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } if_rele(ifp); CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. 
*/ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. */ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. 
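 */

/*
 * [Editorial sketch, not part of this change] The reference counting
 * that if_setflag() provides to ifpromisc() and if_allmulti(): matched
 * on/off calls nest, and the driver only sees the first "on" and the
 * last "off".
 */
static int
example_nested_promisc(struct ifnet *ifp)
{
	int error;

	if ((error = ifpromisc(ifp, 1)) != 0)	/* first on: flag flips */
		return (error);
	(void)ifpromisc(ifp, 1);	/* nested on: refcount only */
	(void)ifpromisc(ifp, 0);	/* inner off: still promiscuous */
	return (ifpromisc(ifp, 0));	/* last off: flag clears */
}

/*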
*/ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. 
 * Most link layer resolving functions return address data which
 * fits inside the default sockaddr_dl structure.  However, the
 * callback can allocate another sockaddr structure, in which case
 * we need to free it later.
	 */
	llsa = NULL;
	ll_ifma = NULL;
	if (ifp->if_resolvemulti != NULL) {
		/* Provide called function with buffer size information */
		sdl.sdl_len = sizeof(sdl);
		llsa = (struct sockaddr *)&sdl;
		error = ifp->if_resolvemulti(ifp, &llsa, sa);
		if (error)
			goto unlock_out;
	}

	/*
	 * Allocate the new address.  Don't hook it up yet, as we may also
	 * need to allocate a link layer multicast address.
	 */
	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
	if (ifma == NULL) {
		error = ENOMEM;
		goto free_llsa_out;
	}

	/*
	 * If a link layer address is found, we'll need to see if it's
	 * already present in the address list, or allocate it as well.
	 * When this block finishes, the link layer address will be on the
	 * list.
	 */
	if (llsa != NULL) {
		ll_ifma = if_findmulti(ifp, llsa);
		if (ll_ifma == NULL) {
			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
			if (ll_ifma == NULL) {
				--ifma->ifma_refcount;
				if_freemulti(ifma);
				error = ENOMEM;
				goto free_llsa_out;
			}
			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
			    ifma_link);
		} else
			ll_ifma->ifma_refcount++;
		ifma->ifma_llifma = ll_ifma;
	}

	/*
	 * We now have a new multicast address, ifma, and possibly a new or
	 * referenced link layer address.  Add the primary address to the
	 * ifnet address list.
	 */
	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);

	if (retifma != NULL)
		*retifma = ifma;

	/*
	 * Must generate the message while holding the lock so that 'ifma'
	 * pointer is still valid.
	 */
	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
	IF_ADDR_WUNLOCK(ifp);

	/*
	 * We are certain we have added something, so call down to the
	 * interface to let it know about it.
	 */
	if (ifp->if_ioctl != NULL) {
		(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
	}

	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
		link_free_sdl(llsa);

	return (0);

free_llsa_out:
	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
		link_free_sdl(llsa);

unlock_out:
	IF_ADDR_WUNLOCK(ifp);
	return (error);
}

/*
 * Delete a multicast group membership by network-layer group address.
 *
 * Returns ENOENT if the entry could not be found.  If ifp no longer
 * exists, results are undefined.  This entry point should only be used
 * from subsystems which do appropriate locking to hold ifp for the
 * duration of the call.
 * Network-layer protocol domains must use if_delmulti_ifma().
 */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
	struct ifmultiaddr *ifma;
	int lastref;
#ifdef INVARIANTS
	struct ifnet *oifp;

	IFNET_RLOCK_NOSLEEP();
	TAILQ_FOREACH(oifp, &V_ifnet, if_link)
		if (ifp == oifp)
			break;
	if (ifp != oifp)
		ifp = NULL;
	IFNET_RUNLOCK_NOSLEEP();

	KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
#endif
	if (ifp == NULL)
		return (ENOENT);

	IF_ADDR_WLOCK(ifp);
	lastref = 0;
	ifma = if_findmulti(ifp, sa);
	if (ifma != NULL)
		lastref = if_delmulti_locked(ifp, ifma, 0);
	IF_ADDR_WUNLOCK(ifp);

	if (ifma == NULL)
		return (ENOENT);

	if (lastref && ifp->if_ioctl != NULL) {
		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
	}

	return (0);
}

/*
 * Delete all multicast group membership for an interface.
 * Should be used to quickly flush all multicast filters.
 */
void
if_delallmulti(struct ifnet *ifp)
{
	struct ifmultiaddr *ifma;
	struct ifmultiaddr *next;

	IF_ADDR_WLOCK(ifp);
	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
		if_delmulti_locked(ifp, ifma, 0);
	IF_ADDR_WUNLOCK(ifp);
}

/*
 * Delete a multicast group membership by group membership pointer.
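 *
 * This is the ifma-pointer variant of if_delmulti(): a caller that
 * kept the pointer returned through if_addmulti()'s 'retifma' can
 * release its membership with just (sketch)
 *
 *	if_delmulti_ifma(ifma);
 *
 * without holding an ifnet reference of its own.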
* Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. 
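 *
 * A sketch of typical use, assuming an Ethernet interface (the new
 * address must be exactly sdl_alen bytes long, i.e. ETHER_ADDR_LEN):
 *
 *	static const u_char lladdr[ETHER_ADDR_LEN] =
 *	    { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *	error = if_setlladdr(ifp, lladdr, sizeof(lladdr));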
* * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: case IFT_IEEE80211: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) 
{ va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. 
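 *
 * The if_set*()/if_get*() accessors in this block are the intended
 * way for a driver to manipulate a stack-owned ifnet, e.g. during
 * attach (hypothetical driver, sketch only):
 *
 *	if_setsoftc(ifp, sc);
 *	if_setmtu(ifp, ETHERMTU);
 *	if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU, 0);
 *	if_setcapenable(ifp, if_getcapabilities(ifp));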
*/ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet 
*)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: projects/vnet/sys/net/if_disc.c =================================================================== --- projects/vnet/sys/net/if_disc.c (revision 294565) +++ projects/vnet/sys/net/if_disc.c (revision 294566) @@ -1,244 +1,244 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ /* * Discard interface driver for protocol testing and timing. * (Based on the loopback.) */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TINY_DSMTU #define DSMTU (1024+512) #else #define DSMTU 65532 #endif struct disc_softc { struct ifnet *sc_ifp; }; static int discoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int discioctl(struct ifnet *, u_long, caddr_t); static int disc_clone_create(struct if_clone *, int, caddr_t); static void disc_clone_destroy(struct ifnet *); static const char discname[] = "disc"; static MALLOC_DEFINE(M_DISC, discname, "Discard interface"); static VNET_DEFINE(struct if_clone *, disc_cloner); #define V_disc_cloner VNET(disc_cloner) static int disc_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; struct disc_softc *sc; sc = malloc(sizeof(struct disc_softc), M_DISC, M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_LOOP); if (ifp == NULL) { free(sc, M_DISC); return (ENOSPC); } ifp->if_softc = sc; if_initname(ifp, discname, unit); ifp->if_mtu = DSMTU; /* * IFF_LOOPBACK should not be removed from disc's flags because * it controls what PF-specific routes are magically added when * a network address is assigned to the interface. Things just * won't work as intended w/o such routes because the output * interface selection for a packet is totally route-driven. * A valid alternative to IFF_LOOPBACK can be IFF_BROADCAST or * IFF_POINTOPOINT, but it would result in different properties * of the interface. 
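 *
 * (From userland an instance is created through the cloner below,
 * e.g. "ifconfig disc0 create"; anything routed to an address
 * configured on it is counted and dropped in discoutput().)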
*/ ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_drv_flags = IFF_DRV_RUNNING; ifp->if_ioctl = discioctl; ifp->if_output = discoutput; ifp->if_hdrlen = 0; ifp->if_addrlen = 0; ifp->if_snd.ifq_maxlen = 20; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static void disc_clone_destroy(struct ifnet *ifp) { struct disc_softc *sc; sc = ifp->if_softc; bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_DISC); } static void vnet_disc_init(const void *unused __unused) { V_disc_cloner = if_clone_simple(discname, disc_clone_create, disc_clone_destroy, 0); } -VNET_SYSINIT(vnet_disc_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSINIT(vnet_disc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_disc_init, NULL); static void vnet_disc_uninit(const void *unused __unused) { if_clone_detach(V_disc_cloner); } -VNET_SYSUNINIT(vnet_disc_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSUNINIT(vnet_disc_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_disc_uninit, NULL); static int disc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t disc_mod = { "if_disc", disc_modevent, NULL }; DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int discoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; M_ASSERTPKTHDR(m); /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); m->m_pkthdr.rcvif = ifp; if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); m_freem(m); return (0); } /* * Process an ioctl request. */ static int discioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; default: error = EINVAL; } return (error); } Index: projects/vnet/sys/net/if_enc.c =================================================================== --- projects/vnet/sys/net/if_enc.c (revision 294565) +++ projects/vnet/sys/net/if_enc.c (revision 294566) @@ -1,404 +1,404 @@ /*- * Copyright (c) 2006 The FreeBSD Project. * Copyright (c) 2015 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #define ENCMTU (1024+512) /* XXX this define must have the same value as in OpenBSD */ #define M_CONF 0x0400 /* payload was encrypted (ESP-transport) */ #define M_AUTH 0x0800 /* payload was authenticated (AH or ESP auth) */ #define M_AUTH_AH 0x2000 /* header was authenticated (AH) */ struct enchdr { u_int32_t af; u_int32_t spi; u_int32_t flags; }; struct enc_softc { struct ifnet *sc_ifp; }; static VNET_DEFINE(struct enc_softc *, enc_sc); #define V_enc_sc VNET(enc_sc) static VNET_DEFINE(struct if_clone *, enc_cloner); #define V_enc_cloner VNET(enc_cloner) static int enc_ioctl(struct ifnet *, u_long, caddr_t); static int enc_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int enc_clone_create(struct if_clone *, int, caddr_t); static void enc_clone_destroy(struct ifnet *); static int enc_add_hhooks(struct enc_softc *); static void enc_remove_hhooks(struct enc_softc *); static const char encname[] = "enc"; /* * Before and after are relative to when we are stripping the * outer IP header. 
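 *
 * The masks are tunable at run time through the sysctls declared
 * below, e.g. (assuming the usual IPSEC_ENC_BEFORE/IPSEC_ENC_AFTER
 * bit values of 0x0001/0x0002) to let bpf(4) on enc0 see inbound
 * packets both before and after the outer header is stripped:
 *
 *	sysctl net.enc.in.ipsec_bpf_mask=3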
*/ static VNET_DEFINE(int, filter_mask_in) = IPSEC_ENC_BEFORE; static VNET_DEFINE(int, bpf_mask_in) = IPSEC_ENC_BEFORE; static VNET_DEFINE(int, filter_mask_out) = IPSEC_ENC_BEFORE; static VNET_DEFINE(int, bpf_mask_out) = IPSEC_ENC_BEFORE | IPSEC_ENC_AFTER; #define V_filter_mask_in VNET(filter_mask_in) #define V_bpf_mask_in VNET(bpf_mask_in) #define V_filter_mask_out VNET(filter_mask_out) #define V_bpf_mask_out VNET(bpf_mask_out) static SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW, 0, "enc sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, in, CTLFLAG_RW, 0, "enc input sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, out, CTLFLAG_RW, 0, "enc output sysctl"); SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_in), 0, "IPsec input firewall filter mask"); SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_in), 0, "IPsec input bpf mask"); SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_out), 0, "IPsec output firewall filter mask"); SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_out), 0, "IPsec output bpf mask"); static void enc_clone_destroy(struct ifnet *ifp) { struct enc_softc *sc; sc = ifp->if_softc; KASSERT(sc == V_enc_sc, ("sc != ifp->if_softc")); enc_remove_hhooks(sc); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_DEVBUF); V_enc_sc = NULL; } static int enc_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; struct enc_softc *sc; sc = malloc(sizeof(struct enc_softc), M_DEVBUF, M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ENC); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } if (V_enc_sc != NULL) { if_free(ifp); free(sc, M_DEVBUF); return (EEXIST); } V_enc_sc = sc; if_initname(ifp, encname, unit); ifp->if_mtu = ENCMTU; ifp->if_ioctl = enc_ioctl; ifp->if_output = enc_output; ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_ENC, sizeof(struct enchdr)); if (enc_add_hhooks(sc) != 0) { enc_clone_destroy(ifp); return (ENXIO); } return (0); } static int enc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { m_freem(m); return (0); } static int enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { if (cmd != SIOCSIFFLAGS) return (EINVAL); if (ifp->if_flags & IFF_UP) ifp->if_drv_flags |= IFF_DRV_RUNNING; else ifp->if_drv_flags &= ~IFF_DRV_RUNNING; return (0); } /* * One helper hook function is used by any hook points. * + from hhook_type we can determine the packet direction: * HHOOK_TYPE_IPSEC_IN or HHOOK_TYPE_IPSEC_OUT; * + from hhook_id we can determine address family: AF_INET or AF_INET6; * + udata contains pointer to enc_softc; * + ctx_data contains pointer to struct ipsec_ctx_data. */ static int enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data, void *hdata, struct osd *hosd) { struct enchdr hdr; struct ipsec_ctx_data *ctx; struct enc_softc *sc; struct ifnet *ifp, *rcvif; struct pfil_head *ph; int pdir; sc = (struct enc_softc *)udata; ifp = sc->sc_ifp; if ((ifp->if_flags & IFF_UP) == 0) return (0); ctx = (struct ipsec_ctx_data *)ctx_data; /* XXX: wrong hook point was used by caller? 
*/ if (ctx->af != hhook_id) return (EPFNOSUPPORT); if (((hhook_type == HHOOK_TYPE_IPSEC_IN && (ctx->enc & V_bpf_mask_in) != 0) || (hhook_type == HHOOK_TYPE_IPSEC_OUT && (ctx->enc & V_bpf_mask_out) != 0)) && bpf_peers_present(ifp->if_bpf) != 0) { hdr.af = ctx->af; hdr.spi = ctx->sav->spi; hdr.flags = 0; if (ctx->sav->alg_enc != SADB_EALG_NONE) hdr.flags |= M_CONF; if (ctx->sav->alg_auth != SADB_AALG_NONE) hdr.flags |= M_AUTH; bpf_mtap2(ifp->if_bpf, &hdr, sizeof(hdr), *ctx->mp); } switch (hhook_type) { case HHOOK_TYPE_IPSEC_IN: if (ctx->enc == IPSEC_ENC_BEFORE) { /* Do accounting only once */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, (*ctx->mp)->m_pkthdr.len); } if ((ctx->enc & V_filter_mask_in) == 0) return (0); /* skip pfil processing */ pdir = PFIL_IN; break; case HHOOK_TYPE_IPSEC_OUT: if (ctx->enc == IPSEC_ENC_BEFORE) { /* Do accounting only once */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, (*ctx->mp)->m_pkthdr.len); } if ((ctx->enc & V_filter_mask_out) == 0) return (0); /* skip pfil processing */ pdir = PFIL_OUT; break; default: return (EINVAL); } switch (hhook_id) { #ifdef INET case AF_INET: ph = &V_inet_pfil_hook; break; #endif #ifdef INET6 case AF_INET6: ph = &V_inet6_pfil_hook; break; #endif default: ph = NULL; } if (ph == NULL || !PFIL_HOOKED(ph)) return (0); /* Make a packet looks like it was received on enc(4) */ rcvif = (*ctx->mp)->m_pkthdr.rcvif; (*ctx->mp)->m_pkthdr.rcvif = ifp; if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, NULL) != 0 || *ctx->mp == NULL) { *ctx->mp = NULL; /* consumed by filter */ return (EACCES); } (*ctx->mp)->m_pkthdr.rcvif = rcvif; return (0); } static int enc_add_hhooks(struct enc_softc *sc) { struct hookinfo hki; int error; error = EPFNOSUPPORT; hki.hook_func = enc_hhook; hki.hook_helper = NULL; hki.hook_udata = sc; #ifdef INET hki.hook_id = AF_INET; hki.hook_type = HHOOK_TYPE_IPSEC_IN; error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki, HHOOK_WAITOK); if (error != 0) return (error); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki, HHOOK_WAITOK); if (error != 0) return (error); #endif #ifdef INET6 hki.hook_id = AF_INET6; hki.hook_type = HHOOK_TYPE_IPSEC_IN; error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki, HHOOK_WAITOK); if (error != 0) return (error); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki, HHOOK_WAITOK); if (error != 0) return (error); #endif return (error); } static void enc_remove_hhooks(struct enc_softc *sc) { struct hookinfo hki; hki.hook_func = enc_hhook; hki.hook_helper = NULL; hki.hook_udata = sc; #ifdef INET hki.hook_id = AF_INET; hki.hook_type = HHOOK_TYPE_IPSEC_IN; hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki); #endif #ifdef INET6 hki.hook_id = AF_INET6; hki.hook_type = HHOOK_TYPE_IPSEC_IN; hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki); #endif } static void vnet_enc_init(const void *unused __unused) { V_enc_sc = NULL; V_enc_cloner = if_clone_simple(encname, enc_clone_create, enc_clone_destroy, 1); } -VNET_SYSINIT(vnet_enc_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSINIT(vnet_enc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_enc_init, NULL); static void vnet_enc_uninit(const void *unused __unused) { 
if_clone_detach(V_enc_cloner); } -VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_enc_uninit, NULL); static int enc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t enc_mod = { "if_enc", enc_modevent, 0 }; -DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: projects/vnet/sys/net/pfil.c =================================================================== --- projects/vnet/sys/net/pfil.c (revision 294565) +++ projects/vnet/sys/net/pfil.c (revision 294566) @@ -1,407 +1,402 @@ /* $FreeBSD$ */ /* $NetBSD: pfil.c,v 1.20 2001/11/12 23:49:46 lukem Exp $ */ /*- * Copyright (c) 1996 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx pfil_global_lock; MTX_SYSINIT(pfil_heads_lock, &pfil_global_lock, "pfil_head_list lock", MTX_DEF); static struct packet_filter_hook *pfil_chain_get(int, struct pfil_head *); static int pfil_chain_add(pfil_chain_t *, struct packet_filter_hook *, int); static int pfil_chain_remove(pfil_chain_t *, pfil_func_t, void *); LIST_HEAD(pfilheadhead, pfil_head); VNET_DEFINE(struct pfilheadhead, pfil_head_list); #define V_pfil_head_list VNET(pfil_head_list) VNET_DEFINE(struct rmlock, pfil_lock); #define V_pfil_lock VNET(pfil_lock) /* * pfil_run_hooks() runs the specified packet filter hook chain. 
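 *
 * A typical call from a protocol input path looks like (sketch; the
 * PFIL_HOOKED() test avoids the cost of the call when no filters are
 * registered):
 *
 *	if (PFIL_HOOKED(&V_inet_pfil_hook)) {
 *		if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp,
 *		    PFIL_IN, NULL) != 0 || m == NULL)
 *			return;		packet consumed by a filter
 *	}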
*/ int pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, int dir, struct inpcb *inp) { struct rm_priotracker rmpt; struct packet_filter_hook *pfh; struct mbuf *m = *mp; int rv = 0; PFIL_RLOCK(ph, &rmpt); KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); for (pfh = pfil_chain_get(dir, ph); pfh != NULL; pfh = TAILQ_NEXT(pfh, pfil_chain)) { if (pfh->pfil_func != NULL) { rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, inp); if (rv != 0 || m == NULL) break; } } PFIL_RUNLOCK(ph, &rmpt); *mp = m; return (rv); } static struct packet_filter_hook * pfil_chain_get(int dir, struct pfil_head *ph) { if (dir == PFIL_IN) return (TAILQ_FIRST(&ph->ph_in)); else if (dir == PFIL_OUT) return (TAILQ_FIRST(&ph->ph_out)); else return (NULL); } /* * pfil_try_rlock() acquires rm reader lock for specified head * if this is immediately possible. */ int pfil_try_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) { return (PFIL_TRY_RLOCK(ph, tracker)); } /* * pfil_rlock() acquires rm reader lock for specified head. */ void pfil_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) { PFIL_RLOCK(ph, tracker); } /* * pfil_runlock() releases reader lock for specified head. */ void pfil_runlock(struct pfil_head *ph, struct rm_priotracker *tracker) { PFIL_RUNLOCK(ph, tracker); } /* * pfil_wlock() acquires writer lock for specified head. */ void pfil_wlock(struct pfil_head *ph) { PFIL_WLOCK(ph); } /* * pfil_wunlock() releases writer lock for specified head. */ void pfil_wunlock(struct pfil_head *ph) { PFIL_WUNLOCK(ph); } /* * pfil_wowned() returns a non-zero value if the current thread owns * an exclusive lock. */ int pfil_wowned(struct pfil_head *ph) { return (PFIL_WOWNED(ph)); } /* * pfil_head_register() registers a pfil_head with the packet filter hook * mechanism. */ int pfil_head_register(struct pfil_head *ph) { struct pfil_head *lph; PFIL_HEADLIST_LOCK(); LIST_FOREACH(lph, &V_pfil_head_list, ph_list) { if (ph->ph_type == lph->ph_type && ph->ph_un.phu_val == lph->ph_un.phu_val) { PFIL_HEADLIST_UNLOCK(); return (EEXIST); } } PFIL_LOCK_INIT(ph); ph->ph_nhooks = 0; TAILQ_INIT(&ph->ph_in); TAILQ_INIT(&ph->ph_out); LIST_INSERT_HEAD(&V_pfil_head_list, ph, ph_list); PFIL_HEADLIST_UNLOCK(); return (0); } /* * pfil_head_unregister() removes a pfil_head from the packet filter hook * mechanism. The producer of the hook promises that all outstanding * invocations of the hook have completed before it unregisters the hook. */ int pfil_head_unregister(struct pfil_head *ph) { struct packet_filter_hook *pfh, *pfnext; PFIL_HEADLIST_LOCK(); LIST_REMOVE(ph, ph_list); PFIL_HEADLIST_UNLOCK(); TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_chain, pfnext) free(pfh, M_IFADDR); TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_chain, pfnext) free(pfh, M_IFADDR); PFIL_LOCK_DESTROY(ph); return (0); } /* * pfil_head_get() returns the pfil_head for a given key/dlt. */ struct pfil_head * pfil_head_get(int type, u_long val) { struct pfil_head *ph; PFIL_HEADLIST_LOCK(); LIST_FOREACH(ph, &V_pfil_head_list, ph_list) if (ph->ph_type == type && ph->ph_un.phu_val == val) break; PFIL_HEADLIST_UNLOCK(); return (ph); } /* * pfil_add_hook() adds a function to the packet filter hook. the * flags are: * PFIL_IN call me on incoming packets * PFIL_OUT call me on outgoing packets * PFIL_ALL call me on all of the above * PFIL_WAITOK OK to call malloc with M_WAITOK. 
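 *
 * Registration then looks like (sketch, hypothetical filter function
 * matching pfil_func_t):
 *
 *	static int
 *	my_filter(void *arg, struct mbuf **mp, struct ifnet *ifp,
 *	    int dir, struct inpcb *inp)
 *	{
 *		return (0);	0 accepts the packet, leaving *mp alone
 *	}
 *
 *	error = pfil_add_hook(my_filter, NULL,
 *	    PFIL_IN | PFIL_OUT | PFIL_WAITOK, ph);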
*/ int pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) { struct packet_filter_hook *pfh1 = NULL; struct packet_filter_hook *pfh2 = NULL; int err; if (flags & PFIL_IN) { pfh1 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); if (pfh1 == NULL) { err = ENOMEM; goto error; } } if (flags & PFIL_OUT) { pfh2 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); if (pfh2 == NULL) { err = ENOMEM; goto error; } } PFIL_WLOCK(ph); if (flags & PFIL_IN) { pfh1->pfil_func = func; pfh1->pfil_arg = arg; err = pfil_chain_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); if (err) goto locked_error; ph->ph_nhooks++; } if (flags & PFIL_OUT) { pfh2->pfil_func = func; pfh2->pfil_arg = arg; err = pfil_chain_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); if (err) { if (flags & PFIL_IN) pfil_chain_remove(&ph->ph_in, func, arg); goto locked_error; } ph->ph_nhooks++; } PFIL_WUNLOCK(ph); return (0); locked_error: PFIL_WUNLOCK(ph); error: if (pfh1 != NULL) free(pfh1, M_IFADDR); if (pfh2 != NULL) free(pfh2, M_IFADDR); return (err); } /* * pfil_remove_hook removes a specific function from the packet filter hook * chain. */ int pfil_remove_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) { int err = 0; PFIL_WLOCK(ph); if (flags & PFIL_IN) { err = pfil_chain_remove(&ph->ph_in, func, arg); if (err == 0) ph->ph_nhooks--; } if ((err == 0) && (flags & PFIL_OUT)) { err = pfil_chain_remove(&ph->ph_out, func, arg); if (err == 0) ph->ph_nhooks--; } PFIL_WUNLOCK(ph); return (err); } /* * Internal: Add a new pfil hook into a hook chain. */ static int pfil_chain_add(pfil_chain_t *chain, struct packet_filter_hook *pfh1, int flags) { struct packet_filter_hook *pfh; /* * First make sure the hook is not already there. */ TAILQ_FOREACH(pfh, chain, pfil_chain) if (pfh->pfil_func == pfh1->pfil_func && pfh->pfil_arg == pfh1->pfil_arg) return (EEXIST); /* * Insert the input list in reverse order of the output list so that * the same path is followed in or out of the kernel. */ if (flags & PFIL_IN) TAILQ_INSERT_HEAD(chain, pfh1, pfil_chain); else TAILQ_INSERT_TAIL(chain, pfh1, pfil_chain); return (0); } /* * Internal: Remove a pfil hook from a hook chain. */ static int pfil_chain_remove(pfil_chain_t *chain, pfil_func_t func, void *arg) { struct packet_filter_hook *pfh; TAILQ_FOREACH(pfh, chain, pfil_chain) if (pfh->pfil_func == func && pfh->pfil_arg == arg) { TAILQ_REMOVE(chain, pfh, pfil_chain); free(pfh, M_IFADDR); return (0); } return (ENOENT); } /* * Stuff that must be initialized for every instance (including the first of * course). */ -static int +static void vnet_pfil_init(const void *unused) { LIST_INIT(&V_pfil_head_list); PFIL_LOCK_INIT_REAL(&V_pfil_lock, "shared"); - return (0); } /* * Called for the removal of each instance. */ -static int +static void vnet_pfil_uninit(const void *unused) { - KASSERT(LIST_EMPTY(&V_pfil_head_list), + VNET_ASSERT(LIST_EMPTY(&V_pfil_head_list), ("%s: pfil_head_list %p not empty", __func__, &V_pfil_head_list)); PFIL_LOCK_DESTROY_REAL(&V_pfil_lock); - return (0); } -/* Define startup order. */ -#define PFIL_SYSINIT_ORDER SI_SUB_PROTO_BEGIN -#define PFIL_MODEVENT_ORDER (SI_ORDER_FIRST) /* On boot slot in here. */ -#define PFIL_VNET_ORDER (PFIL_MODEVENT_ORDER + 2) /* Later still. */ - /* * Starting up. * * VNET_SYSINIT is called for each existing vnet and each new vnet. 
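 *
 * A subsystem registers its per-vnet constructor/destructor pair for
 * this as, e.g. (hypothetical names, following the pattern used by
 * the pseudo interfaces above):
 *
 *	VNET_SYSINIT(vnet_foo_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
 *	    vnet_foo_init, NULL);
 *	VNET_SYSUNINIT(vnet_foo_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
 *	    vnet_foo_uninit, NULL);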
+ * Make sure the pfil bits are first before any possible subsystem which + * might piggyback on the SI_SUB_PROTO_PFIL. */ -VNET_SYSINIT(vnet_pfil_init, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER, +VNET_SYSINIT(vnet_pfil_init, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, vnet_pfil_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER. Not called on reboot. * * VNET_SYSUNINIT is called for each exiting vnet as it exits. */ -VNET_SYSUNINIT(vnet_pfil_uninit, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER, +VNET_SYSUNINIT(vnet_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, vnet_pfil_uninit, NULL); Index: projects/vnet/sys/net/vnet.c =================================================================== --- projects/vnet/sys/net/vnet.c (revision 294565) +++ projects/vnet/sys/net/vnet.c (revision 294566) @@ -1,782 +1,774 @@ /*- * Copyright (c) 2004-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include /*- * This file implements core functions for virtual network stacks: * * - Virtual network stack management functions. * * - Virtual network stack memory allocator, which virtualizes global * variables in the network stack * * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems * to register startup/shutdown events to be run for each virtual network * stack instance. */ FEATURE(vimage, "VIMAGE kernel virtualization"); static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* * The virtual network stack list has two read-write locks, one sleepable and * the other not, so that the list can be stablized and walked in a variety * of network stack contexts. Both must be acquired exclusively to modify * the list, but a read lock of either lock is sufficient to walk the list. 
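 *
 * Readers consequently walk the list like this (sketch; the standard
 * iterator pattern used elsewhere in this file):
 *
 *	VNET_ITERATOR_DECL(vnet_iter);
 *
 *	VNET_LIST_RLOCK();
 *	VNET_FOREACH(vnet_iter) {
 *		CURVNET_SET(vnet_iter);
 *		...
 *		CURVNET_RESTORE();
 *	}
 *	VNET_LIST_RUNLOCK();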
*/ struct rwlock vnet_rwlock; struct sx vnet_sxlock; #define VNET_LIST_WLOCK() do { \ sx_xlock(&vnet_sxlock); \ rw_wlock(&vnet_rwlock); \ } while (0) #define VNET_LIST_WUNLOCK() do { \ rw_wunlock(&vnet_rwlock); \ sx_xunlock(&vnet_sxlock); \ } while (0) struct vnet_list_head vnet_head; struct vnet *vnet0; /* * The virtual network stack allocator provides storage for virtualized * global variables. These variables are defined/declared using the * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet' * linker set. The details of the implementation are somewhat subtle, but * allow the majority of most network subsystems to maintain * virtualization-agnostic. * * The virtual network stack allocator handles variables in the base kernel * vs. modules in similar but different ways. In both cases, virtualized * global variables are marked as such by being declared to be part of the * vnet linker set. These "master" copies of global variables serve two * functions: * * (1) They contain static initialization or "default" values for global * variables which will be propagated to each virtual network stack * instance when created. As with normal global variables, they default * to zero-filled. * * (2) They act as unique global names by which the variable can be referred * to, regardless of network stack instance. The single global symbol * will be used to calculate the location of a per-virtual instance * variable at run-time. * * Each virtual network stack instance has a complete copy of each * virtualized global variable, stored in a malloc'd block of memory * referred to by vnet->vnet_data_mem. Critical to the design is that each * per-instance memory block is laid out identically to the master block so * that the offset of each global variable is the same across all blocks. To * optimize run-time access, a precalculated 'base' address, * vnet->vnet_data_base, is stored in each vnet, and is the amount that can * be added to the address of a 'master' instance of a variable to get to the * per-vnet instance. * * Virtualized global variables are handled in a similar manner, but as each * module has its own 'set_vnet' linker set, and we want to keep all * virtualized globals togther, we reserve space in the kernel's linker set * for potential module variables using a per-vnet character array, * 'modspace'. The virtual network stack allocator maintains a free list to * track what space in the array is free (all, initially) and as modules are * linked, allocates portions of the space to specific globals. The kernel * module linker queries the virtual network stack allocator and will * bind references of the global to the location during linking. It also * calls into the virtual network stack allocator, once the memory is * initialized, in order to propagate the new static initializations to all * existing virtual network stack instances so that the soon-to-be executing * module will find every network stack instance with proper default values. */ /* * Number of bytes of data in the 'set_vnet' linker set, and hence the total * size of all kernel virtualized global variables, and the malloc(9) type * that will be used to allocate it. */ #define VNET_BYTES (VNET_STOP - VNET_START) static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data"); /* * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of * global variables across all loaded modules. 
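 *
 * (Usage sketch: a subsystem declares and accesses such a variable as
 *
 *	static VNET_DEFINE(int, foo_enable);
 *	#define	V_foo_enable	VNET(foo_enable)
 *
 * where "foo_enable" is a hypothetical name; V_foo_enable then
 * resolves to the current vnet's private copy.)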
As this actually sizes an * array declared as a virtualized global variable in the kernel itself, and * we want the virtualized global variable space to be page-sized, we may * have more space than that in practice. */ #define VNET_MODMIN 8192 #define VNET_SIZE roundup2(VNET_BYTES, PAGE_SIZE) #define VNET_MODSIZE (VNET_SIZE - (VNET_BYTES - VNET_MODMIN)) /* * Space to store virtualized global variables from loadable kernel modules, * and the free list to manage it. */ static VNET_DEFINE(char, modspace[VNET_MODMIN]); /* * Global lists of subsystem constructor and destructors for vnets. They are * registered via VNET_SYSINIT() and VNET_SYSUNINIT(). Both lists are * protected by the vnet_sysinit_sxlock global lock. */ static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors = TAILQ_HEAD_INITIALIZER(vnet_constructors); static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors = TAILQ_HEAD_INITIALIZER(vnet_destructors); struct sx vnet_sysinit_sxlock; #define VNET_SYSINIT_WLOCK() sx_xlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_WUNLOCK() sx_xunlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RLOCK() sx_slock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RUNLOCK() sx_sunlock(&vnet_sysinit_sxlock); struct vnet_data_free { uintptr_t vnd_start; int vnd_len; TAILQ_ENTRY(vnet_data_free) vnd_link; }; static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free", "VNET resource accounting"); static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head = TAILQ_HEAD_INITIALIZER(vnet_data_free_head); static struct sx vnet_data_free_lock; SDT_PROVIDER_DEFINE(vnet); SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, "int"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry, "int", "struct vnet *"); SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return, "int"); #ifdef DDB static void db_show_vnet_print_vs(struct vnet_sysinit *, int); #endif /* * Allocate a virtual network stack. */ struct vnet * vnet_alloc(void) { struct vnet *vnet; SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__); vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO); vnet->vnet_magic_n = VNET_MAGIC_N; SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet); /* * Allocate storage for virtualized global variables and copy in * initial values form our 'master' copy. */ vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK); memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES); /* * All use of vnet-specific data will immediately subtract VNET_START * from the base memory pointer, so pre-calculate that now to avoid * it on each use. */ vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START; /* Initialize / attach vnet module instances. */ CURVNET_SET_QUIET(vnet); vnet_sysinit(); CURVNET_RESTORE(); VNET_LIST_WLOCK(); LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); VNET_LIST_WUNLOCK(); SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet); return (vnet); } /* * Destroy a virtual network stack. */ void vnet_destroy(struct vnet *vnet) { - struct ifnet *ifp, *nifp; SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet); KASSERT(vnet->vnet_sockcnt == 0, ("%s: vnet still has sockets", __func__)); VNET_LIST_WLOCK(); LIST_REMOVE(vnet, vnet_le); VNET_LIST_WUNLOCK(); CURVNET_SET_QUIET(vnet); - - /* Return all inherited interfaces to their parent vnets. 
*/ - TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { - if (ifp->if_home_vnet != ifp->if_vnet) - if_vmove(ifp, ifp->if_home_vnet); - } - vnet_sysuninit(); CURVNET_RESTORE(); /* * Release storage for the virtual network stack instance. */ free(vnet->vnet_data_mem, M_VNET_DATA); vnet->vnet_data_mem = NULL; vnet->vnet_data_base = 0; vnet->vnet_magic_n = 0xdeadbeef; free(vnet, M_VNET); SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__); } /* * Boot time initialization and allocation of virtual network stacks. */ static void vnet_init_prelink(void *arg) { rw_init(&vnet_rwlock, "vnet_rwlock"); sx_init(&vnet_sxlock, "vnet_sxlock"); sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void vnet0_init(void *arg) { /* Warn people before take off - in case we crash early. */ printf("WARNING: VIMAGE (virtualized network stack) is a highly " "experimental feature.\n"); /* * We MUST clear curvnet in vi_init_done() before going SMP, * otherwise CURVNET_SET() macros would scream about unnecessary * curvnet recursions. */ curvnet = prison0.pr_vnet = vnet0 = vnet_alloc(); } SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL); static void vnet_init_done(void *unused) { curvnet = NULL; } SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_FIRST, vnet_init_done, NULL); /* * Once on boot, initialize the modspace freelist to entirely cover modspace. */ static void vnet_data_startup(void *dummy __unused) { struct vnet_data_free *df; df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); df->vnd_start = (uintptr_t)&VNET_NAME(modspace); df->vnd_len = VNET_MODMIN; TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link); sx_init(&vnet_data_free_lock, "vnet_data alloc lock"); } SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, 0); /* * When a module is loaded and requires storage for a virtualized global * variable, allocate space from the modspace free list. This interface * should be used only by the kernel linker. */ void * vnet_data_alloc(int size) { struct vnet_data_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_len < size) continue; if (df->vnd_len == size) { s = (void *)df->vnd_start; TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link); free(df, M_VNET_DATA_FREE); break; } s = (void *)df->vnd_start; df->vnd_len -= size; df->vnd_start = df->vnd_start + size; break; } sx_xunlock(&vnet_data_free_lock); return (s); } /* * Free space for a virtualized global variable on module unload. */ void vnet_data_free(void *start_arg, int size) { struct vnet_data_free *df; struct vnet_data_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)start_arg; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_start > end) break; /* * If we expand at the end of an entry we may have to merge * it with the one following it as well. 
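 *
 * For example, with free entries [0,8) and [16,24), freeing [8,16)
 * first extends [0,8) to [0,16), and the check below then absorbs
 * [16,24), leaving a single [0,24) entry.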
*/ if (df->vnd_start + df->vnd_len == start) { df->vnd_len += size; dn = TAILQ_NEXT(df, vnd_link); if (df->vnd_start + df->vnd_len == dn->vnd_start) { df->vnd_len += dn->vnd_len; TAILQ_REMOVE(&vnet_data_free_head, dn, vnd_link); free(dn, M_VNET_DATA_FREE); } sx_xunlock(&vnet_data_free_lock); return; } if (df->vnd_start == end) { df->vnd_start = start; df->vnd_len += size; sx_xunlock(&vnet_data_free_lock); return; } } dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); dn->vnd_start = start; dn->vnd_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, vnd_link); else TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link); sx_xunlock(&vnet_data_free_lock); } /* * When a new virtualized global variable has been allocated, propagate its * initial value to each already-allocated virtual network stack instance. */ void vnet_data_copy(void *start, int size) { struct vnet *vnet; VNET_LIST_RLOCK(); LIST_FOREACH(vnet, &vnet_head, vnet_le) memcpy((void *)((uintptr_t)vnet->vnet_data_base + (uintptr_t)start), start, size); VNET_LIST_RUNLOCK(); } /* * Support for special SYSINIT handlers registered via VNET_SYSINIT() * and VNET_SYSUNINIT(). */ void vnet_register_sysinit(void *arg) { struct vnet_sysinit *vs, *vs2; struct vnet *vnet; vs = arg; KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early")); /* Add the constructor to the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_constructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_constructors, vs, link); /* * Invoke the constructor on all the existing vnets when it is * registered. */ VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysinit(void *arg) { struct vnet_sysinit *vs; vs = arg; /* Remove the constructor from the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_REMOVE(&vnet_constructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_register_sysuninit(void *arg) { struct vnet_sysinit *vs, *vs2; vs = arg; /* Add the destructor to the global list of vnet destructors. */ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_destructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysuninit(void *arg) { struct vnet_sysinit *vs; struct vnet *vnet; vs = arg; /* * Invoke the destructor on all the existing vnets when it is * deregistered. */ VNET_SYSINIT_WLOCK(); VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } /* Remove the destructor from the global list of vnet destructors. */ TAILQ_REMOVE(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } /* * Invoke all registered vnet constructors on the current vnet. Used during * vnet construction. The caller is responsible for ensuring the new vnet is * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysinit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH(vs, &vnet_constructors, link) { vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * Invoke all registered vnet destructors on the current vnet. Used during * vnet destruction. 
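 * Destructors run in descending (subsystem, order), the reverse of the
 * constructor pass above. A subsystem normally registers both halves
 * together; as a sketch (subsystem and order values illustrative only):
 *
 *	VNET_SYSINIT(foo_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
 *	    foo_vnet_init, NULL);
 *	VNET_SYSUNINIT(foo_vnet_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
 *	    foo_vnet_uninit, NULL);
 *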
The caller is responsible for ensuring the dying vnet is * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysuninit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * EVENTHANDLER(9) extensions. */ /* * Invoke the eventhandler function originally registered with the possibly * registered argument for all virtual network stack instances. * * This iterator can only be used for eventhandlers that do not take any * additional arguments, as we ignore the variadic arguments from the * EVENTHANDLER_INVOKE() call. */ void vnet_global_eventhandler_iterator_func(void *arg, ...) { VNET_ITERATOR_DECL(vnet_iter); struct eventhandler_entry_vimage *v_ee; /* * There is a bug here in that we should actually cast things to * (struct eventhandler_entry_ ## name *) but that's not easily * possible here, so we just re-use the variadic version we * defined for the generic vimage case. */ v_ee = arg; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } #ifdef VNET_DEBUG struct vnet_recursion { SLIST_ENTRY(vnet_recursion) vnr_le; const char *prev_fn; const char *where_fn; int where_line; struct vnet *old_vnet; struct vnet *new_vnet; }; static SLIST_HEAD(, vnet_recursion) vnet_recursions = SLIST_HEAD_INITIALIZER(vnet_recursions); static void vnet_print_recursion(struct vnet_recursion *vnr, int brief) { if (!brief) printf("CURVNET_SET() recursion in "); printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, vnr->prev_fn); if (brief) printf(", "); else printf("\n "); printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet); } void vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line) { struct vnet_recursion *vnr; /* Skip already logged recursion events. */ SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) if (vnr->prev_fn == old_fn && vnr->where_fn == curthread->td_vnet_lpush && vnr->where_line == line && (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet)) return; vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO); if (vnr == NULL) panic("%s: malloc failed", __func__); vnr->prev_fn = old_fn; vnr->where_fn = curthread->td_vnet_lpush; vnr->where_line = line; vnr->old_vnet = old_vnet; vnr->new_vnet = curvnet; SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le); vnet_print_recursion(vnr, 0); #ifdef KDB kdb_backtrace(); #endif } #endif /* VNET_DEBUG */ /* * DDB(4). */ #ifdef DDB DB_SHOW_COMMAND(vnets, db_show_vnets) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { db_printf("vnet = %p\n", vnet_iter); db_printf(" vnet_magic_n = 0x%x (%s, orig 0x%x)\n", vnet_iter->vnet_magic_n, (vnet_iter->vnet_magic_n == VNET_MAGIC_N) ? "ok" : "mismatch", VNET_MAGIC_N); db_printf(" vnet_ifcnt = %u\n", vnet_iter->vnet_ifcnt); db_printf(" vnet_sockcnt = %u\n", vnet_iter->vnet_sockcnt); db_printf(" vnet_data_mem = %p\n", vnet_iter->vnet_data_mem); db_printf(" vnet_data_base = 0x%jx\n", (uintmax_t)vnet_iter->vnet_data_base); db_printf("\n"); if (db_pager_quit) break; } } static void db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb) { const char *vsname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...)
\ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (vs == NULL) { xprint("%s: no vnet_sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); db_symbol_values(sym, &vsname, NULL); sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); xprint(" 0x%08x 0x%08x\n", vs->subsystem, vs->order); xprint(" %p(%s)(%p)\n", vs->func, (funcname != NULL) ? funcname : "", vs->arg); #undef xprint } DB_SHOW_COMMAND(vnet_sysinit, db_show_vnet_sysinit) { struct vnet_sysinit *vs; db_printf("VNET_SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH(vs, &vnet_constructors, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } DB_SHOW_COMMAND(vnet_sysuninit, db_show_vnet_sysuninit) { struct vnet_sysinit *vs; db_printf("VNET_SYSUNINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } #ifdef VNET_DEBUG DB_SHOW_COMMAND(vnetrcrs, db_show_vnetrcrs) { struct vnet_recursion *vnr; SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) vnet_print_recursion(vnr, 1); } #endif #endif /* DDB */ Index: projects/vnet/sys/netinet/igmp.c =================================================================== --- projects/vnet/sys/netinet/igmp.c (revision 294565) +++ projects/vnet/sys/netinet/igmp.c (revision 294566) @@ -1,3667 +1,3674 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. 
* Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); +#include "opt_ddb.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef DDB +#include +#endif + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. 
* * Locking: * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. 
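 *
 * Each VNET_DEFINE() below is paired with a V_ accessor macro so that
 * a per-vnet variable reads like an ordinary global at its use sites;
 * as a sketch, for a hypothetical counter:
 *
 *	static VNET_DEFINE(int, foo_count);
 *	#define	V_foo_count	VNET(foo_count)
 *
 * where VNET() resolves the address through curvnet's vnet_data_base.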
*/ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) -static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head); +static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = + LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. 
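 *
 * (By contrast, the CTLFLAG_VNET entries above resolve their data
 * pointers through the caller's vnet, so per-jail tuning behaves as
 * expected; e.g.
 *
 *	jexec <jid> sysctl net.inet.igmp.default_version=2
 *
 * changes only that jail's IGMP instance.)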
*/ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } #ifdef KTR static __inline char * inet_ntoa_haddr(in_addr_t haddr) { struct in_addr ia; ia.s_addr = htonl(haddr); return (inet_ntoa(ia)); } #endif /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
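 *
 * A userland consumer resolves the node and appends the ifindex as the
 * final name component; a minimal sketch (error handling omitted):
 *
 *	struct igmp_ifinfo info;
 *	int mib[CTL_MAXNAME];
 *	size_t miblen = nitems(mib), len = sizeof(info);
 *
 *	sysctlnametomib("net.inet.igmp.ifinfo", mib, &miblen);
 *	mib[miblen] = ifindex;
 *	sysctl(mib, miblen + 1, &info, &len, NULL, 0);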
*/ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. 
*/ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); /* * Free the in_multi reference(s) for this IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { - struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); - - igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; igi_delete_locked(ifp); - IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(igi, M_IGMP); return; } } #ifdef INVARIANTS panic("%s: igmp_ifsoftc not found for ifp %p\n", __func__, ifp); #endif } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Membership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic.
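 *
 * The per-group report delay programmed below is uniform over
 * [1, IGMP_V1V2_MAX_RI * PR_FASTHZ] fast ticks; assuming the
 * conventional definition
 *
 *	#define	IGMP_RANDOM_DELAY(X)	(arc4random_uniform(X) + 1)
 *
 * the jitter lets the first member to report suppress the equivalent
 * reports of the other members of the group.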
*/ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. 
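 *
 * (For reference, the timer derived above is in fast ticks: igmp_code
 * counts tenths of a second, so with PR_FASTHZ == 5 and
 * IGMP_TIMER_SCALE == 10 a Max Response Time code of 100, i.e. 10.0
 * seconds, becomes 100 * 5 / 10 == 50 ticks.)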
*/ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. 
*/ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query.
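 *
 * (Worked example of the timer handling below: if this group already
 * owes a response in 3 ticks and the new query selects a 10-tick
 * bound, min(3, 10) keeps the earlier deadline and IGMP_RANDOM_DELAY()
 * re-jitters within that bound.)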
*/ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need to * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so.
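 * "Explicitly configured" means the net.inet.igmp.legacysupp sysctl,
 * tested as V_igmp_legacysupp just below, is non-zero.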
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. 
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. 
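 * The bound on nsrc checked above also keeps this pullup length within
 * a 16-bit IP datagram: with a 20-byte header and IGMP_V3_QUERY_MINLEN
 * of 12 bytes, at most (65535 - 32) / 4 == 16375 sources are accepted.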
*/ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; IN_MULTI_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(igi, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { struct in_multi *tinm; igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. 
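 * Fast timeouts fire PR_FASTHZ == 5 times per second, so every count
 * handled here is in 200 ms units; e.g. the default Unsolicited Report
 * Interval of IGMP_URI_INIT == 3 seconds bounds uri_fasthz at 15 ticks.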
*/ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER && inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12.
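 * With the RFC 3376 defaults applied at attach time (robustness
 * variable 2, query interval 125s, query response interval 10s) this
 * works out to 2 * 125 + 10 == 260 seconds, i.e. 520 ticks at
 * PR_SLOWHZ == 2.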
*/ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups); they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3.
*/ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. 
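* (Size check: sizeof(struct ip) + sizeof(struct igmp) is 20 + 8 = 28 bytes, comfortably below MHLEN, so the single m_gethdr() below always suffices and no cluster is needed.)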
*/ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to the group * state; it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; IN_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group.
*/ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) inm_release_locked(inm); inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. 
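* (Concretely, with the stock robustness variable of 2, the join report goes out twice: once on the next fasttimo tick, and once more after the randomized uri_fasthz delay applied in igmp_v3_process_group_timers().)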
*/ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
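* (In record terms this "leave" is a CHANGE_TO_INCLUDE_MODE ("TO_IN") group record with an empty source list: T1 is MCAST_UNDEFINED here, so igmp_v3_enqueue_group_record() selects TO_IN and suppresses source generation.)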
*/ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. 
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. 
* Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
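* (Capacity example, assuming a standard 1500-byte Ethernet MTU: IGMP_LEADINGSPACE covers IP (20) + Router Alert (4) + report header (8) = 32 bytes, so a fresh packet holds (1500 - 32 - 8) / 4 = 365 source addresses after its 8-byte group record header.)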
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
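* Worked example of the delta logic: a group in EXCLUDE mode whose * blocked set changes from {A,B} at t0 to {B,C} at t1 produces * ALLOW_NEW {A} (no longer blocked) and BLOCK_OLD {C} (newly blocked); * B is unchanged and is not reported.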
*/ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # sources which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group; * however, the converse is not true.
*/ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatenation to do this, * as large state changes for single groups may have * allocated clusters.
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For a large number of groups the reply to a general query can take * many packets; we should finish sending them before starting to * queue the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else.
Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. 
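* (Arithmetic example: a report carrying one group record with no sources comes to 20 (IP) + 8 (struct igmp_report) + 8 (struct igmp_grouprec) = 36 bytes; the Router Alert option, when V_igmp_sendra is set, adds 4 more in ip_output().)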
*/ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif +#ifdef VIMAGE static void -igmp_init(void *unused __unused) +vnet_igmp_uninit(const void *unused __unused) { - CTR1(KTR_IGMPV3, "%s: initializing", __func__); - - IGMP_LOCK_INIT(); - - m_raopt = igmp_ra_alloc(); - - netisr_register(&igmp_nh); -} -SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL); - -static void -igmp_uninit(void *unused __unused) -{ - CTR1(KTR_IGMPV3, "%s: tearing down", __func__); - netisr_unregister(&igmp_nh); - - m_free(m_raopt); - m_raopt = NULL; - - IGMP_LOCK_DESTROY(); + VNET_ASSERT(LIST_EMPTY(&V_igi_head), + ("%s: igi list %p not empty; ifnets not detached?", __func__, + &V_igi_head)); } -SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL); +VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, + vnet_igmp_uninit, NULL); +#endif -static void -vnet_igmp_init(const void *unused __unused) +#ifdef DDB +DB_SHOW_COMMAND(igi_list, db_show_igi_list) { + struct igmp_ifsoftc *igi, *tigi; + LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; - CTR1(KTR_IGMPV3, "%s: initializing", __func__); + if (!have_addr) { + db_printf("usage: show igi_list <addr>\n"); + return; + } + igi_head = (struct _igi_list *)addr; - LIST_INIT(&V_igi_head); + LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { + db_printf("igmp_ifsoftc %p:\n", igi); + db_printf(" ifp %p\n", igi->igi_ifp); + db_printf(" version %u\n", igi->igi_version); + db_printf(" v1_timer %u\n", igi->igi_v1_timer); + db_printf(" v2_timer %u\n", igi->igi_v2_timer); + db_printf(" v3_timer %u\n", igi->igi_v3_timer); + db_printf(" flags %#x\n", igi->igi_flags); + db_printf(" rv %u\n", igi->igi_rv); + db_printf(" qi %u\n", 
igi->igi_qi); + db_printf(" qri %u\n", igi->igi_qri); + db_printf(" uri %u\n", igi->igi_uri); + /* SLIST_HEAD(,in_multi) igi_relinmhead */ + /* struct mbufq igi_gq; */ + db_printf("\n"); + } } -VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init, - NULL); +#endif -static void -vnet_igmp_uninit(const void *unused __unused) -{ - - CTR1(KTR_IGMPV3, "%s: tearing down", __func__); - - KASSERT(LIST_EMPTY(&V_igi_head), - ("%s: igi list not empty; ifnets not detached?", __func__)); -} -VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, - vnet_igmp_uninit, NULL); - static int igmp_modevent(module_t mod, int type, void *unused __unused) { - switch (type) { - case MOD_LOAD: - case MOD_UNLOAD: - break; - default: - return (EOPNOTSUPP); - } - return (0); + switch (type) { + case MOD_LOAD: + CTR1(KTR_IGMPV3, "%s: initializing", __func__); + IGMP_LOCK_INIT(); + m_raopt = igmp_ra_alloc(); + netisr_register(&igmp_nh); + break; + case MOD_UNLOAD: + CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + netisr_unregister(&igmp_nh); + m_free(m_raopt); + m_raopt = NULL; + IGMP_LOCK_DESTROY(); + break; + default: + return (EOPNOTSUPP); + } + return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; -DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: projects/vnet/sys/netinet/in_proto.c =================================================================== --- projects/vnet/sys/netinet/in_proto.c (revision 294565) +++ projects/vnet/sys/netinet/in_proto.c (revision 294566) @@ -1,396 +1,378 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mrouting.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include /* * While this file provides the domain and protocol switch tables for IPv4, it * also provides the sysctl node declarations for net.inet.* often shared with * IPv6 for common features or by upper layer protocols. In case of no IPv4 * support compile out everything but these sysctl nodes. */ #ifdef INET #include #include #include #ifdef RADIX_MPATH #include #endif #include #endif /* INET */ #if defined(INET) || defined(INET6) #include #endif #ifdef INET #include #include #include #include #include #include #include #include #include #include #include #include /* * TCP/IP protocol family: IP, ICMP, UDP, TCP. */ static struct pr_usrreqs nousrreqs; #ifdef IPSEC #include #endif /* IPSEC */ #ifdef SCTP #include #include #include #include #endif /* SCTP */ FEATURE(inet, "Internet Protocol version 4"); extern struct domain inetdomain; /* Spacer for loadable protocols. */ #define IPPROTOSPACER \ { \ .pr_domain = &inetdomain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inetsw[] = { { .pr_type = 0, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IP, .pr_init = ip_init, -#ifdef VIMAGE - .pr_destroy = ip_destroy, -#endif .pr_slowtimo = ip_slowtimo, .pr_drain = ip_drain, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_init = udp_init, -#ifdef VIMAGE - .pr_destroy = udp_destroy, -#endif .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, .pr_input = tcp_input, .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, -#ifdef VIMAGE - .pr_destroy = tcp_destroy, -#endif .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_init = sctp_init, -#ifdef VIMAGE - .pr_destroy = sctp_finish, -#endif .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp_input, .pr_ctlinput = udplite_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_init = udplite_init, -#ifdef VIMAGE - .pr_destroy = udplite_destroy, -#endif .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ICMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = 
SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IGMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = igmp_input, .pr_ctloutput = rip_ctloutput, .pr_fasttimo = igmp_fasttimo, .pr_slowtimo = igmp_slowtimo, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RSVP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = rsvp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ah4_input, .pr_ctlinput = ah4_ctlinput, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = esp4_input, .pr_ctlinput = esp4_ctlinput, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipcomp4_input, .pr_usrreqs = &nousrreqs }, #endif /* IPSEC */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_MOBILE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, # ifdef INET6 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, #endif { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, /* Spacer n-times for loadable protocols. 
*/ IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, -#ifdef VIMAGE - .pr_destroy = rip_destroy, -#endif .pr_usrreqs = &rip_usrreqs }, }; extern int in_inithead(void **, int); extern int in_detachhead(void **, int); struct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn4_mpath_inithead, #else .dom_rtattach = in_inithead, #endif #ifdef VIMAGE .dom_rtdetach = in_detachhead, #endif .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach }; VNET_DOMAIN_SET(inet); #endif /* INET */ SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, "Internet Family"); SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); #ifdef SCTP SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW, 0, "SCTP"); #endif SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); #ifdef IPSEC /* XXX no protocol # to use, pick something "reserved" */ SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW, 0, "IPSEC"); SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW, 0, "AH"); SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW, 0, "ESP"); SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); Index: projects/vnet/sys/netinet/ip_divert.c =================================================================== --- projects/vnet/sys/netinet/ip_divert.c (revision 294565) +++ projects/vnet/sys/netinet/ip_divert.c (revision 294566) @@ -1,840 +1,832 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #ifndef INET #error "IPDIVERT requires INET" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef SCTP #include #endif #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw or other packet filters; * see the divert(4) manpage for features. * Packets are selected by the packet filter and tagged with an * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by * the packet filter) and information on the matching filter rule for * subsequent reinjection. The divert_port is used to put the packet * on the corresponding divert socket, while the rule number is passed * up (at least partially) as the sin_port in the struct sockaddr. * * Packets written to the divert socket carry in sin_addr a * destination address, and in sin_port the number of the filter rule * after which to continue processing. * If the destination address is INADDR_ANY, the packet is treated * as outgoing and sent to ip_output(); otherwise it is treated as * incoming and sent to ip_input(). * Further, sin_zero carries some information on the interface, * which can be used in the reinject -- see comments in the code. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * packet filter processing will start at the rule number after the one * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 * will apply the entire ruleset to the packet). */ /* Internal variables. */ static VNET_DEFINE(struct inpcbhead, divcb); static VNET_DEFINE(struct inpcbinfo, divcbinfo); #define V_divcb VNET(divcb) #define V_divcbinfo VNET(divcbinfo) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ static eventhandler_tag ip_divert_event_tag; /* * Initialize divert connection block queue. */ static void div_zone_change(void *tag) { uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); } static int div_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "divinp"); return (0); } static void div_inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } static void div_init(void) { /* * XXX We don't use the hash list for divert IP, but it's easier to * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL.
*/ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, IPI_HASHFIELDS_NONE); } static void div_destroy(void) { in_pcbinfo_destroy(&V_divcbinfo); } +VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, + div_destroy, NULL); /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ static int div_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; KMOD_IPSTAT_INC(ips_noproto); m_freem(m); return (IPPROTO_DONE); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. */ static void divert_packet(struct mbuf *m, int incoming) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; /* record matching rule, in host format */ divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; /* * Record receive interface address, if any. * But only for incoming packets. */ if (incoming) { struct ifaddr *ifa; struct ifnet *ifp; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } if_addr_runlock(ifp); } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); INP_INFO_RLOCK(&V_divcbinfo); LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? 
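 * A plausible answer, noted here as an assumption rather than a
 * guarantee: div_bind() funnels through in_pcbbind(), which should
 * refuse a second divert socket on the same local port, so the first
 * match is expected to be the only one and the loop can break early.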
*/ if (inp->inp_lport == nport) { INP_RLOCK(inp); sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) { SOCKBUF_UNLOCK(&sa->so_rcv); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); INP_RUNLOCK(inp); break; } } INP_INFO_RUNLOCK(&V_divcbinfo); if (sa == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_noproto); KMOD_IPSTAT_DEC(ips_delivered); } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { struct ip *const ip = mtod(m, struct ip *); struct m_tag *mtag; struct ipfw_rule_ref *dt; int error = 0; /* * An mbuf may not have come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; M_SETFIB(m, so->so_fibnum); if (control) m_freem(control); /* XXX */ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { /* this should be normal */ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } m_tag_prepend(m, mtag); } dt = (struct ipfw_rule_ref *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; /* set the starting point. We provide a non-zero slot, * but a non-matching chain_id to skip that info and use * the rulenum/rule_id. */ dt->slot = 1; /* dummy, chain_id is invalid */ dt->chain_id = 0; dt->rulenum = sin->sin_port+1; /* host format? */ dt->rule_id = 0; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct mbuf *options = NULL; struct inpcb *inp; dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; inp = sotoinpcb(so); INP_RLOCK(inp); switch (ip->ip_v) { case IPVERSION: /* * Don't allow both user specified and setsockopt * options, and don't allow packet length sizes that * will crash. */ if ((((ip->ip_hl << 2) != sizeof(struct ip)) && inp->inp_options != NULL) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } break; #ifdef INET6 case IPV6_VERSION >> 4: { struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); /* Don't allow packet length sizes that will crash */ if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } break; } #endif default: error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } /* Send packet to output processing */ KMOD_IPSTAT_INC(ips_rawout); /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present.
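 * (The KASSERT just below enforces that invariant at runtime.)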
*/ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requiring a reference to the pcb. */ if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_NOWAIT); if (options == NULL) { INP_RUNLOCK(inp); error = ENOBUFS; goto cantsend; } } INP_RUNLOCK(inp); switch (ip->ip_v) { case IPVERSION: error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); break; #ifdef INET6 case IPV6_VERSION >> 4: error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); break; #endif } if (options != NULL) m_freem(options); } else { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; ifa_free(ifa); } #ifdef MAC mac_socket_create_mbuf(so, m); #endif /* Send packet to input processing via netisr */ switch (ip->ip_v) { case IPVERSION: netisr_queue_src(NETISR_IP, (uintptr_t)so, m); break; #ifdef INET6 case IPV6_VERSION >> 4: netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); break; #endif default: error = EINVAL; goto cantsend; } } return (error); cantsend: m_freem(m); return (error); } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); if (error) { INP_INFO_WUNLOCK(&V_divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't, we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbbind * and should probably have its own family.
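 *
 * For illustration only -- a hypothetical userland sketch, not code
 * from this file (the port number 8668 is an arbitrary example and
 * must match the ipfw divert rule):
 *
 *	char buf[65535]; ssize_t n;
 *	int fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
 *	struct sockaddr_in sin;
 *	socklen_t slen = sizeof(sin);
 *	memset(&sin, 0, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(8668);
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	n = recvfrom(fd, buf, sizeof(buf), 0,
 *	    (struct sockaddr *)&sin, &slen);		-- diverted packet
 *	sendto(fd, buf, n, 0,
 *	    (struct sockaddr *)&sin, sizeof(sin));	-- reinject
 *
 * Re-using the received sockaddr_in for the sendto(), as the man page
 * suggests, preserves the interface hints described in divert_packet().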
*/ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { KMOD_IPSTAT_INC(ips_toosmall); m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } static void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_divcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_divcbinfo); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&V_divcbinfo); for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_divcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_divcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_divcbinfo); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
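 * (Consumer sketch, assuming the usual xinpgen convention: read
 * net.inet.divert.pcblist via sysctlbyname(), compare the xig_gen of
 * the trailing struct xinpgen with that of the leading one, and
 * re-issue the request when they differ.)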
*/ INP_INFO_RLOCK(&V_divcbinfo); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } #ifdef SYSCTL_NODE static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif struct pr_usrreqs div_usrreqs = { .pru_attach = div_attach, .pru_bind = div_bind, .pru_control = in_control, .pru_detach = div_detach, .pru_peeraddr = in_getpeeraddr, .pru_send = div_send, .pru_shutdown = div_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel }; struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = div_input, .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, -#ifdef VIMAGE - .pr_destroy = div_destroy, -#endif .pr_usrreqs = &div_usrreqs }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; -#ifndef VIMAGE int n; -#endif switch (type) { case MOD_LOAD: /* * Protocol will be initialized by pf_proto_register(). * We don't have to register ip_protox because we are not * a true IP protocol that goes over the wire. */ err = pf_proto_register(PF_INET, &div_protosw); if (err != 0) return (err); ip_divert_ptr = divert_packet; ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: -#ifdef VIMAGE - err = EPERM; - break; -#else /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. */ INP_INFO_WLOCK(&V_divcbinfo); n = V_divcbinfo.ipi_count; if (n != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; } ip_divert_ptr = NULL; err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); div_destroy(); EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); break; -#endif /* !VIMAGE */ default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; -DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_FW, SI_ORDER_ANY); MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3); MODULE_VERSION(ipdivert, 1); Index: projects/vnet/sys/netinet/ip_mroute.c =================================================================== --- projects/vnet/sys/netinet/ip_mroute.c (revision 294565) +++ projects/vnet/sys/netinet/ip_mroute.c (revision 294566) @@ -1,2948 +1,2948 @@ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. 
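 * (For example, X_ip_mforward() below takes VIF_LOCK() and then
 * MFC_LOCK(), and releases them in the reverse order.)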
Note that each lock is used * to cover not only the specific data structure but also related data * structures. */ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; static VNET_DEFINE(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. 
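 * (Concretely, per BW_METER_TIMEHASH below: the bucket index is the
 * entry's next expiration time in whole seconds, rounded up when
 * tv_usec is nonzero, taken modulo BW_METER_BUCKETS -- e.g. a meter
 * started at t = 100.5s with a 3s threshold hashes to 104 % 1024 = 104.)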
*/ #define BW_METER_BUCKETS 1024 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) static VNET_DEFINE(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) static VNET_DEFINE(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); extern struct domain inetdomain; static const struct protosw in_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) static VNET_DEFINE(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. 
*/ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static VNET_DEFINE(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) static VNET_DEFINE(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. 
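 * (Illustrative userland sketch only, not code from this file: the
 * daemon owns the raw IGMP socket demanded by ip_mrouter_init() and
 * drives this API at the IPPROTO_IP level, e.g.
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int v = 1;	-- version 1 is the only one accepted
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
 *
 * which lands in the MRT_INIT case below.)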
*/ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl(). 
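 * (Illustrative only: userland typically issues something like
 *	struct sioc_vif_req vreq = { .vifi = 0 };
 *	ioctl(s, SIOCGETVIFCNT, &vreq);
 * and reads back the icount/ocount/ibytes/obytes that get_vif_cnt()
 * below fills in.)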
* Typically, only root can create the raw socket in order to execute * this ioctl method; however, the request might be coming from a prison. */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, and rpf-failure counts for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= V_numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; u_long i; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return; } VIF_LOCK(); MFC_LOCK(); /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Expire any matching multicast forwarding cache entries. * 4. Free vif state. This should disable ALLMULTI on the interface. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (V_viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast forwarding. */ static int ip_mrouter_init(struct socket *so, int version) { CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter_unloading) { MROUTER_UNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Disable multicast forwarding. */ static int X_ip_mrouter_done(void) { struct ifnet *ifp; u_long i; vifi_t vifi; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the rest of the system. */ V_ip_mrouter = NULL; ip_mrouter_cnt--; V_mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts.
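 * (This undoes the if_allmulti(ifp, 1) performed when the vif was
 * added in add_vif().)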
*/ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. 
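 * (The register vif is instead backed by the fake
 * V_multicast_register_if set up below; it exists only to feed PIM
 * Register encapsulation, not to transmit on real hardware.)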
*/ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; ifa_free(ifa); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__, (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. 
*/ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x", __func__, inet_ntoa(mfccp->mfcc_origin), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig %s group %lx parent %x qh %p", __func__, inet_ntoa(mfccp->mfcc_origin), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. 
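 * (The forwarding attempt calls ip_mdq() with xmt_vif == -1, i.e. the
 * normal all-eligible-vifs path described at ip_mdq() below.)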
*/ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__, inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p", inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. 
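 * (IN_LOCAL_GROUP() matches 224.0.0.0/24, the link-local multicast
 * scope that must never be forwarded.)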
*/ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for the packet's origin, * make a copy of the packet and send a message to the routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)", inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow? */ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copy(mb0, 0, hlen); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return (0); }
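			/*
			 * Queue still has room: append this packet to the
			 * pending-upcall queue of the existing entry.
			 */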
TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copy(m, 0, hlen); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. 
*/ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. 
*/ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Check whether the same bw_meter entry is already installed */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags
& BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. 
*/ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (V_bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &V_bw_upcalls[V_bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (V_bw_upcalls_n == 0) return; /* No pending upcalls */ V_bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); MRTSTAT_INC(mrts_upq_sockfull); } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. 
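 */

/*
 * A stand-alone model of the BW_METER_TIMEHASH() computation above: the
 * expiry instant is rounded up to a whole second (so an entry can never
 * fire early) and folded into one of the one-second buckets.  The bucket
 * count below is a stand-in; the real BW_METER_BUCKETS value is defined
 * elsewhere.
 */
#include <sys/time.h>
#include <stdio.h>

#define DEMO_BW_METER_BUCKETS	1024	/* stand-in for BW_METER_BUCKETS */

static int
demo_timehash(const struct timeval *start, const struct timeval *threshold)
{
	struct timeval next = *start;
	int hash;

	timeradd(&next, threshold, &next);	/* BW_TIMEVALADD() */
	hash = next.tv_sec;
	if (next.tv_usec)
		hash++;			/* round up: never time out early */
	return (hash % DEMO_BW_METER_BUCKETS);
}

int
main(void)
{
	struct timeval start = { 1000, 500000 };
	struct timeval thresh = { 3, 0 };

	/* 1003.5s rounds up to 1004, i.e. bucket 1004 % 1024. */
	printf("bucket %d\n", demo_timehash(&start, &thresh));
	return (0);
}

/*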
*/ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. */ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. 
*/ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
*/ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. 
*/ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM message processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ int pim_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; int iphlen = *offp; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from %s", __func__, datalen, inet_ntoa(ip->ip_src)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == 0) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reasons perform the checksum over part of the * message, and if error, then over the whole message.
*/ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d", __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, inet_ntoa(encap_ip->ip_dst)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copy() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. 
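 */

/*
 * For reference, the Register message dissected above lays out as
 * follows (matching PIM_MINLEN == 8 and PIM_REG_MINLEN == 28):
 *
 *   offset  0: outer IP header, 20 bytes (pim_encap_iphdr)
 *   offset 20: PIM header, 4 bytes (version/type, reserved, checksum)
 *   offset 24: Register flag word, 4 bytes (Null/Border-Register bits)
 *   offset 28: inner, encapsulated IP packet
 *
 * Hence m_adj(m, iphlen + PIM_MINLEN) just below leaves m_data at the
 * inner IP header, while the daemon copy mcp keeps the first
 * iphlen + PIM_REG_MINLEN bytes.
 */

/*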
*/ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjusted to * reflect the fact that the inner multicast data is truncated. */ *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); } -VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mroute_init, +VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { FREE(V_nexpire, M_MRTABLE); V_nexpire = NULL; } -VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, +VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, pim_encapcheck, &in_pim_protosw, NULL); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* *
Typically module unload happens after the user-level * process has shut down the kernel services (the check * below ensures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to clean up. */ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; -DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); +DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: projects/vnet/sys/netinet/ip_var.h =================================================================== --- projects/vnet/sys/netinet/ip_var.h (revision 294565) +++ projects/vnet/sys/netinet/ip_var.h (revision 294566) @@ -1,304 +1,301 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IP_VAR_H_ #define _NETINET_IP_VAR_H_ #include /* * Overlay for ip header used by other protocols (tcp, udp). */ struct ipovly { u_char ih_x1[9]; /* (unused) */ u_char ih_pr; /* protocol */ u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ }; #ifdef _KERNEL /* * Ip reassembly queue structure. Each fragment * being reassembled is attached to one of these structures.
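 */

/*
 * A userland model of how the struct ipovly overlay above is used: the
 * transport protocols scribble on (a copy of) the IP header, zeroing the
 * first nine bytes and writing the TCP/UDP length where ip_sum sat, so
 * one RFC 1071 pass over the 20-byte overlay plus the payload yields the
 * pseudo-header checksum.  Illustrative only; the demo_* names are
 * invented, and the sum is assembled from big-endian 16-bit words.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

struct demo_ipovly {			/* mirrors struct ipovly */
	uint8_t		ih_x1[9];	/* zeroed before summing */
	uint8_t		ih_pr;		/* protocol, overlays ip_p */
	uint16_t	ih_len;		/* TCP/UDP length, overlays ip_sum */
	struct in_addr	ih_src;
	struct in_addr	ih_dst;
};

static uint32_t
demo_sum(const void *p, size_t len, uint32_t sum)
{
	const uint8_t *b = p;
	size_t i;

	for (i = 0; i < len; i++)	/* big-endian 16-bit accumulation */
		sum += (i & 1) ? b[i] : (uint32_t)b[i] << 8;
	return (sum);
}

static uint16_t
demo_pseudo_cksum(struct demo_ipovly *ih, const void *payload, size_t plen)
{
	uint32_t sum;

	memset(ih->ih_x1, 0, sizeof(ih->ih_x1));
	sum = demo_sum(ih, sizeof(*ih), 0);
	sum = demo_sum(payload, plen, sum);
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum >> 16) + (sum & 0xffff);
	return ((uint16_t)~sum);
}

/*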
* They are timed out after ipq_ttl drops to 0, and may also * be reclaimed if memory becomes tight. */ struct ipq { TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ u_char ipq_ttl; /* time for reass q to live */ u_char ipq_p; /* protocol of this fragment */ u_short ipq_id; /* sequence id for reassembly */ struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; u_char ipq_nfrags; /* # frags in this packet */ struct label *ipq_label; /* MAC label */ }; #endif /* _KERNEL */ /* * Structure stored in mbuf in inpcb.ip_options * and passed to ip_output when ip options are in use. * The actual length of the options (including ipopt_dst) * is in m_len. */ #define MAX_IPOPTLEN 40 struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ char ipopt_list[MAX_IPOPTLEN]; /* options proper */ }; /* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. * This structure is lazy-allocated. */ struct ip_moptions { struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ u_long imo_multicast_vif; /* vif num outgoing multicasts */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ u_short imo_num_memberships; /* no. memberships this socket */ u_short imo_max_memberships; /* max memberships this socket */ struct in_multi **imo_membership; /* group memberships */ struct in_mfilter *imo_mfilters; /* source filters */ STAILQ_ENTRY(ip_moptions) imo_link; }; struct ipstat { uint64_t ips_total; /* total packets received */ uint64_t ips_badsum; /* checksum bad */ uint64_t ips_tooshort; /* packet too short */ uint64_t ips_toosmall; /* not enough data */ uint64_t ips_badhlen; /* ip header length < data size */ uint64_t ips_badlen; /* ip length < ip header length */ uint64_t ips_fragments; /* fragments received */ uint64_t ips_fragdropped; /* frags dropped (dups, out of space) */ uint64_t ips_fragtimeout; /* fragments timed out */ uint64_t ips_forward; /* packets forwarded */ uint64_t ips_fastforward; /* packets fast forwarded */ uint64_t ips_cantforward; /* packets rcvd for unreachable dest */ uint64_t ips_redirectsent; /* packets forwarded on same net */ uint64_t ips_noproto; /* unknown or unsupported protocol */ uint64_t ips_delivered; /* datagrams delivered to upper level*/ uint64_t ips_localout; /* total ip packets generated here */ uint64_t ips_odropped; /* lost packets due to nobufs, etc. */ uint64_t ips_reassembled; /* total packets reassembled ok */ uint64_t ips_fragmented; /* datagrams successfully fragmented */ uint64_t ips_ofragments; /* output fragments created */ uint64_t ips_cantfrag; /* don't fragment flag was set, etc. */ uint64_t ips_badoptions; /* error in option processing */ uint64_t ips_noroute; /* packets discarded due to no route */ uint64_t ips_badvers; /* ip version != 4 */ uint64_t ips_rawout; /* total raw ip packets generated */ uint64_t ips_toolong; /* ip length > max ip packet size */ uint64_t ips_notmember; /* multicasts for unregistered grps */ uint64_t ips_nogif; /* no match gif found */ uint64_t ips_badaddr; /* invalid address on header */ }; #ifdef _KERNEL #include #include VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. 
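 */

/*
 * A stand-alone model of the module-side accessors declared just below:
 * KMOD_IPSTAT_INC() turns a field name into an array index by dividing
 * the field's byte offset by sizeof(uint64_t), so a module can bump any
 * counter through one fixed kernel entry point (kmod_ipstat_inc).  The
 * demo_* names are invented for the example.
 */
#include <stddef.h>
#include <stdint.h>

struct demo_stat {
	uint64_t total;
	uint64_t badsum;
	uint64_t tooshort;
};

static uint64_t demo_counters[sizeof(struct demo_stat) / sizeof(uint64_t)];

static void
demo_stat_inc(int statnum)	/* stands in for kmod_ipstat_inc() */
{
	demo_counters[statnum]++;
}

#define DEMO_STAT_INC(name) \
	demo_stat_inc(offsetof(struct demo_stat, name) / sizeof(uint64_t))

/* Usage: DEMO_STAT_INC(badsum) increments demo_counters[1]. */

/*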
*/ #define IPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val)) #define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val)) #define IPSTAT_INC(name) IPSTAT_ADD(name, 1) #define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_ipstat_inc(int statnum); #define KMOD_IPSTAT_INC(name) \ kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t)) void kmod_ipstat_dec(int statnum); #define KMOD_IPSTAT_DEC(name) \ kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t)) /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ #define IP_SENDONES 0x4 /* send all-ones broadcast */ #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ #define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 #else #define IP_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif struct ip; struct inpcb; struct route; struct sockopt; VNET_DECLARE(int, ip_defttl); /* default IP ttl */ VNET_DECLARE(int, ipforwarding); /* ip forwarding */ #ifdef IPSTEALTH VNET_DECLARE(int, ipstealth); /* stealth forwarding */ #endif extern u_char ip_protox[]; VNET_DECLARE(struct socket *, ip_rsvpd); /* reservation protocol daemon*/ VNET_DECLARE(struct socket *, ip_mrouter); /* multicast routing daemon */ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); VNET_DECLARE(int, rsvp_on); VNET_DECLARE(int, drop_redirect); extern struct pr_usrreqs rip_usrreqs; #define V_ip_id VNET(ip_id) #define V_ip_defttl VNET(ip_defttl) #define V_ipforwarding VNET(ipforwarding) #ifdef IPSTEALTH #define V_ipstealth VNET(ipstealth) #endif #define V_ip_rsvpd VNET(ip_rsvpd) #define V_ip_mrouter VNET(ip_mrouter) #define V_rsvp_on VNET(rsvp_on) #define V_drop_redirect VNET(drop_redirect) void inp_freemoptions(struct ip_moptions *); int inp_getmoptions(struct inpcb *, struct sockopt *); int inp_setmoptions(struct inpcb *, struct sockopt *); int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags); void ip_forward(struct mbuf *m, int srcrt); void ip_init(void); #ifdef VIMAGE void ip_destroy(void); #endif extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int ip_output(struct mbuf *, struct mbuf *, struct route *, int, struct ip_moptions *, struct inpcb *); int ipproto_register(short); int ipproto_unregister(short); struct mbuf * ip_reass(struct mbuf *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_slowtimo(void); void ip_fillid(struct ip *); int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); -#ifdef VIMAGE -void rip_destroy(void); -#endif int rip_input(struct mbuf **, int *, int); int rip_output(struct mbuf *, struct socket *, ...); int ipip_input(struct mbuf **, int *, int); int rsvp_input(struct mbuf **, int *, int); int ip_rsvp_init(struct socket *); int ip_rsvp_done(void); extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *); extern void (*ip_rsvp_force_done)(struct socket *); extern int (*rsvp_input_p)(struct mbuf **, int *, int); VNET_DECLARE(struct pfil_head, 
inet_pfil_hook); /* packet filter hooks */ #define V_inet_pfil_hook VNET(inet_pfil_hook) void in_delayed_cksum(struct mbuf *m); /* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */ /* * Reference to an ipfw or packet filter rule that can be carried * outside critical sections. * A rule is identified by rulenum:rule_id which is ordered. * In version chain_id the rule can be found in slot 'slot', so * we don't need a lookup if chain_id == chain->id. * * On exit from the firewall this structure refers to the rule after * the matching one (slot points to the new rule; rulenum:rule_id-1 * is the matching rule), and additional info (e.g. info often contains * the insn argument or tablearg in the low 16 bits, in host format). * On entry, the structure is valid if slot>0, and refers to the starting * rules. 'info' contains the reason for reinject, e.g. divert port, * divert direction, and so on. */ struct ipfw_rule_ref { uint32_t slot; /* slot for matching rule */ uint32_t rulenum; /* matching rule number */ uint32_t rule_id; /* matching rule id */ uint32_t chain_id; /* ruleset id */ uint32_t info; /* see below */ }; enum { IPFW_INFO_MASK = 0x0000ffff, IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ IPFW_IS_MASK = 0x30000000, /* which source ? */ IPFW_IS_DIVERT = 0x20000000, IPFW_IS_DUMMYNET =0x10000000, IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ }; #define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ #define MTAG_IPFW_RULE 1262273568 /* rule reference */ #define MTAG_IPFW_CALL 1308397630 /* call stack */ struct ip_fw_args; typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args); typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *); VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) /* Divert hooks. */ extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */ extern int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); extern int (*ip_dn_ctl_ptr)(struct sockopt *); extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ Index: projects/vnet/sys/netinet/raw_ip.c =================================================================== --- projects/vnet/sys/netinet/raw_ip.c (revision 294565) +++ projects/vnet/sys/netinet/raw_ip.c (revision 294566) @@ -1,1131 +1,1132 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. */ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip_mrouter); /* * The various mrouter and rsvp functions. 
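 */

/*
 * The declarations below are the load-time hooks themselves: nullable
 * function pointers that ip_mroute.ko fills in at MOD_LOAD and clears at
 * MOD_UNLOAD, with every call site guarded, e.g. "ip_mrouter_get ?
 * ip_mrouter_get(so, sopt) : EOPNOTSUPP" in rip_ctloutput().  A minimal
 * stand-alone model of the pattern (demo_* names are invented):
 */
#include <errno.h>
#include <stddef.h>

static int (*demo_hook)(int) = NULL;	/* NULL until the module loads */

static int demo_impl(int arg) { return (arg * 2); }

static void demo_mod_load(void)   { demo_hook = demo_impl; }
static void demo_mod_unload(void) { demo_hook = NULL; }

static int
demo_call(int arg)
{
	/* Positive errno return, kernel-style, when the module is absent. */
	return (demo_hook != NULL ? demo_hook(arg) : EOPNOTSUPP);
}

/*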
*/ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) #ifdef INET static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); LIST_REMOVE(inp, inp_hash); } #endif /* INET */ /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ static void rip_zone_change(void *tag) { uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE -void +static void rip_destroy(void) { in_pcbinfo_destroy(&V_ripcbinfo); } +VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); #endif #ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. 
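 */

/*
 * INP_PCBHASH_RAW() above spreads fully specified raw sockets (protocol
 * plus both addresses) over buckets 1..mask and reserves bucket 0 for
 * wildcard sockets, which is why rip_input() below walks two lists.  A
 * stand-alone form of the same computation; DEMO_HASHMASK is assumed to
 * match INP_PCBHASH_RAW_SIZE - 1:
 */
#include <stdint.h>

#define DEMO_HASHMASK	(256 - 1)	/* INP_PCBHASH_RAW_SIZE - 1 */

static int
demo_pcbhash_raw(uint8_t proto, uint32_t laddr, uint32_t faddr)
{
	/* ((proto) + (laddr) + (faddr)) % (mask) + 1 -> buckets 1..mask */
	return ((int)((proto + laddr + faddr) % DEMO_HASHMASK) + 1);
}

/*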
*/ int rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; ifp = m->m_pkthdr.rcvif; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. */ if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (!in_nullhost(inp->inp_laddr) && !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. * * In the case of IGMPv2, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. imo_multi_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue to * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if (proto != IPPROTO_IGMP) { struct sockaddr_in group; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, (struct sockaddr *)&group, (struct sockaddr *)&ripsrc); } if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { m_freem(m); IPSTAT_INC(ips_noproto); IPSTAT_DEC(ips_delivered); } return (IPPROTO_DONE); } /* * Generate IP header and pass packet to ip_output. Tack on options user may * have set up with control call.
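 */

/*
 * What rip_input() above delivers can be observed from an ordinary
 * userland raw socket: the kernel hands over a copy of every matching
 * datagram, IP header included.  A minimal receiver sketch (requires
 * root; 253 is one of the RFC 3692 experimental protocol numbers):
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[2048];
	ssize_t n;
	int s;

	s = socket(AF_INET, SOCK_RAW, 253);
	if (s < 0) {
		perror("socket");	/* needs PRIV_NETINET_RAW (root) */
		return (1);
	}
	n = recv(s, buf, sizeof(buf), 0); /* whole IP packet, header first */
	if (n > 0)
		printf("got %zd bytes\n", n);
	close(s);
	return (0);
}

/*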
*/ int rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); va_list ap; u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; va_start(ap, so); dst = va_arg(ap, u_long); va_end(ap); /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = htons(IP_DF); else ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not * want to see from jails. */ if (ip->ip_src.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, inp->inp_cred); } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); error = prison_check_ip4(inp->inp_cred, &ip->ip_src); if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ntohs(ip->ip_len) > m->m_pkthdr.len) || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } /* * This doesn't allow application to specify ID of zero, * but we got this limitation from the beginning of history. */ if (ip->ip_id == 0) ip_fillid(ip); /* * XXX prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; IPSTAT_INC(ips_rawout); } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. * * XXX-BZ inp locking? 
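 */

/*
 * The INP_HDRINCL branch of rip_output() above is selected per socket
 * with the IP_HDRINCL option; the kernel then trusts a user-built header,
 * subject to the sanity checks shown (ip_hl vs. options, ip_len vs. mbuf
 * length, ID fill when zero).  A hedged userland sketch of such a sender;
 * note ip_len is in network byte order in this tree, and the caller is
 * assumed to have opened a raw socket already:
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <string.h>

static ssize_t
demo_send_own_header(int s, const struct sockaddr_in *dst)
{
	char pkt[sizeof(struct ip)];		/* header-only demo datagram */
	struct ip *ip = (struct ip *)pkt;
	int on = 1;

	if (setsockopt(s, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on)) < 0)
		return (-1);
	memset(pkt, 0, sizeof(pkt));
	ip->ip_v = IPVERSION;
	ip->ip_hl = sizeof(struct ip) >> 2;	/* no options, see check above */
	ip->ip_ttl = 64;
	ip->ip_p = 253;				/* experimental protocol */
	ip->ip_len = htons(sizeof(pkt));	/* must not exceed what we send */
	ip->ip_id = 0;				/* zero lets rip_output() fill it */
	ip->ip_dst = dst->sin_addr;		/* ip_src left zero for brevity */
	return (sendto(s, pkt, sizeof(pkt), 0,
	    (const struct sockaddr *)dst, sizeof(*dst)));
}

/*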
*/ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. 
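 */

/*
 * The MRT_* cases above are how a multicast routing daemon bootstraps: a
 * privileged process opens a raw IGMP socket and issues MRT_INIT, making
 * that socket the V_ip_mrouter socket; the same guarded path then accepts
 * MRT_ADD_VIF, MRT_ADD_MFC, MRT_ADD_BW_UPCALL and friends.  A minimal
 * sketch (the demo_ name is invented):
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <unistd.h>

static int
demo_become_mrouter(void)
{
	int s, v = 1;		/* MRT_INIT takes the API version, 1 */

	s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	if (s < 0)
		return (-1);
	/* Fails with EPERM unless the caller holds PRIV_NETINET_MROUTE. */
	if (setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v)) < 0) {
		close(s);
		return (-1);
	}
	return (s);	/* keep open: closing it runs ip_mrouter_done() */
}

/*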
*/ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); ifa_free(&ia->ia_ifa); break; } } if (ia == NULL) /* If ia matched, already unlocked. */ IN_IFADDR_RUNLOCK(&in_ifa_tracker); break; case PRC_IFUP: IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); ifa_free(&ia->ia_ifa); break; } } static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); rip_dodisconnect(so, inp); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); rip_dodisconnect(so, inp); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 
rip_dodisconnect(so, inp); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; int error; if (nam->sa_len != sizeof(*addr)) return (EINVAL); error = prison_check_ip4(td->td_ucred, &addr->sin_addr); if (error != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } #endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. 
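 *
 * (Illustrative, not part of this change: userland consumes this list
 * the way netstat(1) does, sizing first, via the req->oldptr == 0 path
 * above, and then fetching:
 *
 *	size_t len = 0;
 *	sysctlbyname("net.inet.raw.pcblist", NULL, &len, NULL, 0);
 *	void *buf = malloc(len);
 *	sysctlbyname("net.inet.raw.pcblist", buf, &len, NULL, 0);
 *
 * The buffer then holds a leading struct xinpgen, the xinpcb records,
 * and a trailing xinpgen whose xig_gen reveals concurrent changes.)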
*/ INP_INFO_RLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_ripcbinfo); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; #endif /* INET */ Index: projects/vnet/sys/netinet/sctp_bsd_addr.c =================================================================== --- projects/vnet/sys/netinet/sctp_bsd_addr.c (revision 294565) +++ projects/vnet/sys/netinet/sctp_bsd_addr.c (revision 294566) @@ -1,547 +1,550 @@ /*- * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Declare all of our malloc named types */ MALLOC_DEFINE(SCTP_M_MAP, "sctp_map", "sctp asoc map descriptor"); MALLOC_DEFINE(SCTP_M_STRMI, "sctp_stri", "sctp stream in array"); MALLOC_DEFINE(SCTP_M_STRMO, "sctp_stro", "sctp stream out array"); MALLOC_DEFINE(SCTP_M_ASC_ADDR, "sctp_aadr", "sctp asconf address"); MALLOC_DEFINE(SCTP_M_ASC_IT, "sctp_a_it", "sctp asconf iterator"); MALLOC_DEFINE(SCTP_M_AUTH_CL, "sctp_atcl", "sctp auth chunklist"); MALLOC_DEFINE(SCTP_M_AUTH_KY, "sctp_atky", "sctp auth key"); MALLOC_DEFINE(SCTP_M_AUTH_HL, "sctp_athm", "sctp auth hmac list"); MALLOC_DEFINE(SCTP_M_AUTH_IF, "sctp_athi", "sctp auth info"); MALLOC_DEFINE(SCTP_M_STRESET, "sctp_stre", "sctp stream reset"); MALLOC_DEFINE(SCTP_M_CMSG, "sctp_cmsg", "sctp CMSG buffer"); MALLOC_DEFINE(SCTP_M_COPYAL, "sctp_cpal", "sctp copy all"); MALLOC_DEFINE(SCTP_M_VRF, "sctp_vrf", "sctp vrf struct"); MALLOC_DEFINE(SCTP_M_IFA, "sctp_ifa", "sctp ifa struct"); MALLOC_DEFINE(SCTP_M_IFN, "sctp_ifn", "sctp ifn struct"); MALLOC_DEFINE(SCTP_M_TIMW, "sctp_timw", "sctp time block"); MALLOC_DEFINE(SCTP_M_MVRF, "sctp_mvrf", "sctp mvrf pcb list"); MALLOC_DEFINE(SCTP_M_ITER, "sctp_iter", "sctp iterator control"); MALLOC_DEFINE(SCTP_M_SOCKOPT, "sctp_socko", "sctp socket option"); MALLOC_DEFINE(SCTP_M_MCORE, "sctp_mcore", "sctp mcore queue"); /* Global NON-VNET structure that controls the iterator */ struct iterator_control sctp_it_ctl; void sctp_wakeup_iterator(void) { wakeup(&sctp_it_ctl.iterator_running); } static void sctp_iterator_thread(void *v SCTP_UNUSED) { SCTP_IPI_ITERATOR_WQ_LOCK(); /* In FreeBSD this thread never terminates. */ for (;;) { msleep(&sctp_it_ctl.iterator_running, &sctp_it_ctl.ipi_iterator_wq_mtx, 0, "waiting_for_work", 0); sctp_iterator_worker(); } } void sctp_startup_iterator(void) { if (sctp_it_ctl.thread_proc) { /* You only get one */ return; } /* Initialize global locks here, thus only once. 
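 *
 * (Recap, illustrative only: the iterator above is the classic
 * msleep()/wakeup() rendezvous. sctp_wakeup_iterator() calls
 * wakeup(&sctp_it_ctl.iterator_running) and the thread sleeps on that
 * same channel under ipi_iterator_wq_mtx, so the idiom reduces to:
 *
 *	for (;;) {
 *		msleep(&chan, &mtx, 0, "waiting_for_work", 0);
 *		sctp_iterator_worker();
 *	}
 *
 * The thread_proc check above makes the kproc_create() below a
 * one-shot.)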
*/ SCTP_ITERATOR_LOCK_INIT(); SCTP_IPI_ITERATOR_WQ_INIT(); TAILQ_INIT(&sctp_it_ctl.iteratorhead); kproc_create(sctp_iterator_thread, (void *)NULL, &sctp_it_ctl.thread_proc, RFPROC, SCTP_KTHREAD_PAGES, SCTP_KTRHEAD_NAME); } #ifdef INET6 void sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa) { struct in6_ifaddr *ifa6; ifa6 = (struct in6_ifaddr *)ifa->ifa; ifa->flags = ifa6->ia6_flags; if (!MODULE_GLOBAL(ip6_use_deprecated)) { if (ifa->flags & IN6_IFF_DEPRECATED) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } if (ifa->flags & (IN6_IFF_DETACHED | IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } #endif /* INET6 */ static uint32_t sctp_is_desired_interface_type(struct ifnet *ifn) { int result; /* check the interface type to see if it's one we care about */ switch (ifn->if_type) { case IFT_ETHER: case IFT_ISO88023: case IFT_ISO88024: case IFT_ISO88025: case IFT_ISO88026: case IFT_STARLAN: case IFT_P10: case IFT_P80: case IFT_HY: case IFT_FDDI: case IFT_XETHER: case IFT_ISDNBASIC: case IFT_ISDNPRIMARY: case IFT_PTPSERIAL: case IFT_OTHER: case IFT_PPP: case IFT_LOOP: case IFT_SLIP: case IFT_GIF: case IFT_L2VLAN: case IFT_STF: case IFT_IP: case IFT_IPOVERCDLC: case IFT_IPOVERCLAW: case IFT_PROPVIRTUAL: /* NetGraph Virtual too */ case IFT_VIRTUALIPADDRESS: result = 1; break; default: result = 0; } return (result); } static void sctp_init_ifns_for_vrf(int vrfid) { /* * Here we must apply ANY locks needed by the IFN we access and also * make sure we lock any IFA that exists as we float through the * list of IFA's */ struct ifnet *ifn; struct ifaddr *ifa; struct sctp_ifa *sctp_ifa; uint32_t ifa_flags; #ifdef INET6 struct in6_ifaddr *ifa6; #endif IFNET_RLOCK(); TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_list) { if (sctp_is_desired_interface_type(ifn) == 0) { /* non desired type */ continue; } IF_ADDR_RLOCK(ifn); TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list) { if (ifa->ifa_addr == NULL) { continue; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { continue; } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { /* skip unspecifed addresses */ continue; } break; #endif default: continue; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_flags = 0; break; #endif #ifdef INET6 case AF_INET6: ifa6 = (struct in6_ifaddr *)ifa; ifa_flags = ifa6->ia6_flags; break; #endif default: ifa_flags = 0; break; } sctp_ifa = sctp_add_addr_to_vrf(vrfid, (void *)ifn, ifn->if_index, ifn->if_type, ifn->if_xname, (void *)ifa, ifa->ifa_addr, ifa_flags, 0); if (sctp_ifa) { sctp_ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } } IF_ADDR_RUNLOCK(ifn); } IFNET_RUNLOCK(); } void sctp_init_vrf_list(int vrfid) { if (vrfid > SCTP_MAX_VRF_ID) /* can't do that */ return; /* Don't care about return here */ (void)sctp_allocate_vrf(vrfid); /* * Now we need to build all the ifn's for this vrf and there * addresses */ sctp_init_ifns_for_vrf(vrfid); } void sctp_addr_change(struct ifaddr *ifa, int cmd) { uint32_t ifa_flags = 0; + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) + return; + /* * BSD only has one VRF, if this changes we will need to hook in the * right things here to get the id to pass to the address managment * routine. 
*/ if (SCTP_BASE_VAR(first_time) == 0) { /* Special test to see if my ::1 will showup with this */ SCTP_BASE_VAR(first_time) = 1; sctp_init_ifns_for_vrf(SCTP_DEFAULT_VRFID); } if ((cmd != RTM_ADD) && (cmd != RTM_DELETE)) { /* don't know what to do with this */ return; } if (ifa->ifa_addr == NULL) { return; } if (sctp_is_desired_interface_type(ifa->ifa_ifp) == 0) { /* non desired type */ return; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { return; } break; #endif #ifdef INET6 case AF_INET6: ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { /* skip unspecifed addresses */ return; } break; #endif default: /* non inet/inet6 skip */ return; } if (cmd == RTM_ADD) { (void)sctp_add_addr_to_vrf(SCTP_DEFAULT_VRFID, (void *)ifa->ifa_ifp, ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_type, ifa->ifa_ifp->if_xname, (void *)ifa, ifa->ifa_addr, ifa_flags, 1); } else { sctp_del_addr_from_vrf(SCTP_DEFAULT_VRFID, ifa->ifa_addr, ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_xname); /* * We don't bump refcount here so when it completes the * final delete will happen. */ } } void sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add){ struct ifnet *ifn; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_list) { if (!(*pred) (ifn)) { continue; } TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list) { sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE); } } IFNET_RUNLOCK(); } struct mbuf * sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, int how, int allonebuf, int type) { struct mbuf *m = NULL; m = m_getm2(NULL, space_needed, how, type, want_header ? M_PKTHDR : 0); if (m == NULL) { /* bad, no memory */ return (m); } if (allonebuf) { if (SCTP_BUF_SIZE(m) < space_needed) { m_freem(m); return (NULL); } } if (SCTP_BUF_NEXT(m)) { sctp_m_freem(SCTP_BUF_NEXT(m)); SCTP_BUF_NEXT(m) = NULL; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mb(m, SCTP_MBUF_IALLOC); } #endif return (m); } #ifdef SCTP_PACKET_LOGGING void sctp_packet_log(struct mbuf *m) { int *lenat, thisone; void *copyto; uint32_t *tick_tock; int length; int total_len; int grabbed_lock = 0; int value, newval, thisend, thisbegin; /* * Buffer layout. -sizeof this entry (total_len) -previous end * (value) -ticks of log (ticks) o -ip packet o -as logged - * where this started (thisbegin) x <--end points here */ length = SCTP_HEADER_LEN(m); total_len = SCTP_SIZE32((length + (4 * sizeof(int)))); /* Log a packet to the buffer. 
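 *
 * Spelled out, one record as written below (a restatement of the
 * layout comment above, not a change in behavior):
 *
 *	offset 0             : int      total_len (record size, 32-bit padded)
 *	offset 4             : int      value     (previous end offset)
 *	offset 8             : uint32_t ticks     (sctp_get_tick_count())
 *	offset 12            : the packet bytes, as logged
 *	offset total_len - 4 : int      thisbegin (offset where this record begins)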
*/ if (total_len > SCTP_PACKET_LOG_SIZE) { /* Can't log this packet I have not a buffer big enough */ return; } if (length < (int)(SCTP_MIN_V4_OVERHEAD + sizeof(struct sctp_cookie_ack_chunk))) { return; } atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), 1); try_again: if (SCTP_BASE_VAR(packet_log_writers) > SCTP_PKTLOG_WRITERS_NEED_LOCK) { SCTP_IP_PKTLOG_LOCK(); grabbed_lock = 1; again_locked: value = SCTP_BASE_VAR(packet_log_end); newval = SCTP_BASE_VAR(packet_log_end) + total_len; if (newval >= SCTP_PACKET_LOG_SIZE) { /* we wrapped */ thisbegin = 0; thisend = total_len; } else { thisbegin = SCTP_BASE_VAR(packet_log_end); thisend = newval; } if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { goto again_locked; } } else { value = SCTP_BASE_VAR(packet_log_end); newval = SCTP_BASE_VAR(packet_log_end) + total_len; if (newval >= SCTP_PACKET_LOG_SIZE) { /* we wrapped */ thisbegin = 0; thisend = total_len; } else { thisbegin = SCTP_BASE_VAR(packet_log_end); thisend = newval; } if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { goto try_again; } } /* Sanity check */ if (thisend >= SCTP_PACKET_LOG_SIZE) { SCTP_PRINTF("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n", thisbegin, thisend, SCTP_BASE_VAR(packet_log_writers), grabbed_lock, SCTP_BASE_VAR(packet_log_end)); SCTP_BASE_VAR(packet_log_end) = 0; goto no_log; } lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisbegin]; *lenat = total_len; lenat++; *lenat = value; lenat++; tick_tock = (uint32_t *) lenat; lenat++; *tick_tock = sctp_get_tick_count(); copyto = (void *)lenat; thisone = thisend - sizeof(int); lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisone]; *lenat = thisbegin; if (grabbed_lock) { SCTP_IP_PKTLOG_UNLOCK(); grabbed_lock = 0; } m_copydata(m, 0, length, (caddr_t)copyto); no_log: if (grabbed_lock) { SCTP_IP_PKTLOG_UNLOCK(); } atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), 1); } int sctp_copy_out_packet_log(uint8_t * target, int length) { /* * We wind through the packet log starting at start copying up to * length bytes out. We return the number of bytes copied. */ int tocopy, this_copy; int *lenat; int did_delay = 0; tocopy = length; if (length < (int)(2 * sizeof(int))) { /* not enough room */ return (0); } if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); again: if ((did_delay == 0) && (SCTP_BASE_VAR(packet_log_writers) != SCTP_PKTLOG_WRITERS_NEED_LOCK)) { /* * we delay here for just a moment hoping the * writer(s) that were present when we entered will * have left and we only have locking ones that will * contend with us for the lock. This does not * assure 100% access, but its good enough for a * logging facility like this. 
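 *
 * (For reference: the writer in sctp_packet_log() reserves its slice
 * with the usual compare-and-swap retry idiom, in outline:
 *
 *	do {
 *		value = packet_log_end;
 *		newval = value + total_len;	(wrapping to the start on overflow)
 *	} while (atomic_cmpset_int(&packet_log_end, value, newval) == 0);
 *
 * so the reader here only needs SCTP_IP_PKTLOG_LOCK() plus the brief
 * delay above.)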
*/ did_delay = 1; DELAY(10); goto again; } } SCTP_IP_PKTLOG_LOCK(); lenat = (int *)target; *lenat = SCTP_BASE_VAR(packet_log_end); lenat++; this_copy = min((length - sizeof(int)), SCTP_PACKET_LOG_SIZE); memcpy((void *)lenat, (void *)SCTP_BASE_VAR(packet_log_buffer), this_copy); if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); } SCTP_IP_PKTLOG_UNLOCK(); return (this_copy + sizeof(int)); } #endif Index: projects/vnet/sys/netinet/sctp_pcb.c =================================================================== --- projects/vnet/sys/netinet/sctp_pcb.c (revision 294565) +++ projects/vnet/sys/netinet/sctp_pcb.c (revision 294566) @@ -1,7045 +1,7067 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #ifdef INET6 #include #endif #include #include #include VNET_DEFINE(struct sctp_base_info, system_base_info); /* FIX: we don't handle multiple link local scopes */ /* "scopeless" replacement IN6_ARE_ADDR_EQUAL */ #ifdef INET6 int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b) { struct sockaddr_in6 tmp_a, tmp_b; memcpy(&tmp_a, a, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_a, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } memcpy(&tmp_b, b, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_b, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } return (IN6_ARE_ADDR_EQUAL(&tmp_a.sin6_addr, &tmp_b.sin6_addr)); } #endif void sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb) { /* * We really don't need to lock this, but I will just because it * does not hurt. 
*/ SCTP_INP_INFO_RLOCK(); spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep); spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc); spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr); spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr); spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk); spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq); spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq); spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks); SCTP_INP_INFO_RUNLOCK(); } /*- * Addresses are added to VRF's (Virtual Router's). For BSD we * have only the default VRF 0. We maintain a hash list of * VRF's. Each VRF has its own list of sctp_ifn's. Each of * these has a list of addresses. When we add a new address * to a VRF we lookup the ifn/ifn_index, if the ifn does * not exist we create it and add it to the list of IFN's * within the VRF. Once we have the sctp_ifn, we add the * address to the list. So we look something like: * * hash-vrf-table * vrf-> ifn-> ifn -> ifn * vrf | * ... +--ifa-> ifa -> ifa * vrf * * We keep these separate lists since the SCTP subsystem will * point to these from its source address selection nets structure. * When an address is deleted it does not happen right away on * the SCTP side, it gets scheduled. What we do when a * delete happens is immediately remove the address from * the master list and decrement the refcount. As our * addip iterator works through and frees the src address * selection pointing to the sctp_ifa, eventually the refcount * will reach 0 and we will delete it. Note that it is assumed * that any locking on system level ifn/ifa is done at the * caller of these functions and these routines will only * lock the SCTP structures as they add or delete things. * * Other notes on VRF concepts. * - An endpoint can be in multiple VRF's * - An association lives within a VRF and only one VRF. * - Any incoming packet we can deduce the VRF for by * looking at the mbuf/pak inbound (for BSD its VRF=0 :D) * - Any downward send call or connect call must supply the * VRF via ancillary data or via some sort of set default * VRF socket option call (again for BSD no brainer since * the VRF is always 0). * - An endpoint may add multiple VRF's to it. * - Listening sockets can accept associations in any * of the VRF's they are in but the assoc will end up * in only one VRF (gotten from the packet or connect/send). 
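 *
 * Concretely, finding a VRF is a single hash-bucket walk, which is
 * exactly what sctp_find_vrf() below does:
 *
 *	bucket = &SCTP_BASE_INFO(sctp_vrfhash)[vrf_id & SCTP_BASE_INFO(hashvrfmark)];
 *	LIST_FOREACH(vrf, bucket, next_vrf)
 *		if (vrf->vrf_id == vrf_id)
 *			return (vrf);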
* */ struct sctp_vrf * sctp_allocate_vrf(int vrf_id) { struct sctp_vrf *vrf = NULL; struct sctp_vrflist *bucket; /* First allocate the VRF structure */ vrf = sctp_find_vrf(vrf_id); if (vrf) { /* Already allocated */ return (vrf); } SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf), SCTP_M_VRF); if (vrf == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif return (NULL); } /* setup the VRF */ memset(vrf, 0, sizeof(struct sctp_vrf)); vrf->vrf_id = vrf_id; LIST_INIT(&vrf->ifnlist); vrf->total_ifa_count = 0; vrf->refcount = 0; /* now also setup table ids */ SCTP_INIT_VRF_TABLEID(vrf); /* Init the HASH of addresses */ vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE, &vrf->vrf_addr_hashmark); if (vrf->vrf_addr_hash == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif SCTP_FREE(vrf, SCTP_M_VRF); return (NULL); } /* Add it to the hash table */ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_INSERT_HEAD(bucket, vrf, next_vrf); atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); return (vrf); } struct sctp_ifn * sctp_find_ifn(void *ifn, uint32_t ifn_index) { struct sctp_ifn *sctp_ifnp; struct sctp_ifnlist *hash_ifn_head; /* * We assume the lock is held for the addresses if that's wrong * problems could occur :-) */ hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) { if (sctp_ifnp->ifn_index == ifn_index) { return (sctp_ifnp); } if (sctp_ifnp->ifn_p && ifn && (sctp_ifnp->ifn_p == ifn)) { return (sctp_ifnp); } } return (NULL); } struct sctp_vrf * sctp_find_vrf(uint32_t vrf_id) { struct sctp_vrflist *bucket; struct sctp_vrf *liste; bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH(liste, bucket, next_vrf) { if (vrf_id == liste->vrf_id) { return (liste); } } return (NULL); } void sctp_free_vrf(struct sctp_vrf *vrf) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount)) { if (vrf->vrf_addr_hash) { SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); vrf->vrf_addr_hash = NULL; } /* We zero'd the count */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); } } void sctp_free_ifn(struct sctp_ifn *sctp_ifnp) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) { /* We zero'd the count */ if (sctp_ifnp->vrf) { sctp_free_vrf(sctp_ifnp->vrf); } SCTP_FREE(sctp_ifnp, SCTP_M_IFN); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); } } void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu) { struct sctp_ifn *sctp_ifnp; sctp_ifnp = sctp_find_ifn((void *)NULL, ifn_index); if (sctp_ifnp != NULL) { sctp_ifnp->ifn_mtu = mtu; } } void sctp_free_ifa(struct sctp_ifa *sctp_ifap) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) { /* We zero'd the count */ if (sctp_ifap->ifn_p) { sctp_free_ifn(sctp_ifap->ifn_p); } SCTP_FREE(sctp_ifap, SCTP_M_IFA); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); } } static void sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock) { struct sctp_ifn *found; found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index); if (found == NULL) { /* Not in the list.. 
sorry */ return; } if (hold_addr_lock == 0) SCTP_IPI_ADDR_WLOCK(); LIST_REMOVE(sctp_ifnp, next_bucket); LIST_REMOVE(sctp_ifnp, next_ifn); SCTP_DEREGISTER_INTERFACE(sctp_ifnp->ifn_index, sctp_ifnp->registered_af); if (hold_addr_lock == 0) SCTP_IPI_ADDR_WUNLOCK(); /* Take away the reference, and possibly free it */ sctp_free_ifn(sctp_ifnp); } void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); goto out; } if (sctp_ifap->ifn_p == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n"); goto out; } if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", sctp_ifap->ifn_p->ifn_name, if_name); goto out; } } else { if (sctp_ifap->ifn_p->ifn_index != ifn_index) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", sctp_ifap->ifn_p->ifn_index, ifn_index); goto out; } } sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID); sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; out: SCTP_IPI_ADDR_RUNLOCK(); } void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); goto out; } if (sctp_ifap->ifn_p == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n"); goto out; } if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", sctp_ifap->ifn_p->ifn_name, if_name); goto out; } } else { if (sctp_ifap->ifn_p->ifn_index != ifn_index) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", sctp_ifap->ifn_p->ifn_index, ifn_index); goto out; } } sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE); sctp_ifap->localifa_flags |= SCTP_ADDR_VALID; out: SCTP_IPI_ADDR_RUNLOCK(); } /*- * Add an ifa to an ifn. * Register the interface as necessary. * NOTE: ADDR write lock MUST be held. */ static void sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) { int ifa_af; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); /* update address counts */ sctp_ifnp->ifa_count++; ifa_af = sctp_ifap->address.sa.sa_family; switch (ifa_af) { #ifdef INET case AF_INET: sctp_ifnp->num_v4++; break; #endif #ifdef INET6 case AF_INET6: sctp_ifnp->num_v6++; break; #endif default: break; } if (sctp_ifnp->ifa_count == 1) { /* register the new interface */ SCTP_REGISTER_INTERFACE(sctp_ifnp->ifn_index, ifa_af); sctp_ifnp->registered_af = ifa_af; } } /*- * Remove an ifa from its ifn. * If no more addresses exist, remove the ifn too. Otherwise, re-register * the interface based on the remaining address families left. 
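 * For example, an ifn registered for AF_INET6 whose last v6 address is
 * removed while v4 addresses remain is deregistered for AF_INET6 and
 * re-registered for AF_INET, and symmetrically for the v4 case.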
* NOTE: ADDR write lock MUST be held. */ static void sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) { LIST_REMOVE(sctp_ifap, next_ifa); if (sctp_ifap->ifn_p) { /* update address counts */ sctp_ifap->ifn_p->ifa_count--; switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: sctp_ifap->ifn_p->num_v4--; break; #endif #ifdef INET6 case AF_INET6: sctp_ifap->ifn_p->num_v6--; break; #endif default: break; } if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) { /* remove the ifn, possibly freeing it */ sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED); } else { /* re-register address family type, if needed */ if ((sctp_ifap->ifn_p->num_v6 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET6)) { SCTP_DEREGISTER_INTERFACE(sctp_ifap->ifn_p->ifn_index, AF_INET6); SCTP_REGISTER_INTERFACE(sctp_ifap->ifn_p->ifn_index, AF_INET); sctp_ifap->ifn_p->registered_af = AF_INET; } else if ((sctp_ifap->ifn_p->num_v4 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET)) { SCTP_DEREGISTER_INTERFACE(sctp_ifap->ifn_p->ifn_index, AF_INET); SCTP_REGISTER_INTERFACE(sctp_ifap->ifn_p->ifn_index, AF_INET6); sctp_ifap->ifn_p->registered_af = AF_INET6; } /* free the ifn refcount */ sctp_free_ifn(sctp_ifap->ifn_p); } sctp_ifap->ifn_p = NULL; } } struct sctp_ifa * sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, uint32_t ifn_type, const char *if_name, void *ifa, struct sockaddr *addr, uint32_t ifa_flags, int dynamic_add) { struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifnp = NULL; struct sctp_ifa *sctp_ifap = NULL; struct sctp_ifalist *hash_addr_head; struct sctp_ifnlist *hash_ifn_head; uint32_t hash_of_addr; int new_ifn_af = 0; #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif SCTP_IPI_ADDR_WLOCK(); sctp_ifnp = sctp_find_ifn(ifn, ifn_index); if (sctp_ifnp) { vrf = sctp_ifnp->vrf; } else { vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { vrf = sctp_allocate_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_WUNLOCK(); return (NULL); } } } if (sctp_ifnp == NULL) { /* * build one and add it, can't hold lock until after malloc * done though. */ SCTP_IPI_ADDR_WUNLOCK(); SCTP_MALLOC(sctp_ifnp, struct sctp_ifn *, sizeof(struct sctp_ifn), SCTP_M_IFN); if (sctp_ifnp == NULL) { #ifdef INVARIANTS panic("No memory for IFN"); #endif return (NULL); } memset(sctp_ifnp, 0, sizeof(struct sctp_ifn)); sctp_ifnp->ifn_index = ifn_index; sctp_ifnp->ifn_p = ifn; sctp_ifnp->ifn_type = ifn_type; sctp_ifnp->refcount = 0; sctp_ifnp->vrf = vrf; atomic_add_int(&vrf->refcount, 1); sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, addr->sa_family); if (if_name != NULL) { snprintf(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", if_name); } else { snprintf(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", "unknown"); } hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_INIT(&sctp_ifnp->ifalist); SCTP_IPI_ADDR_WLOCK(); LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket); LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn); atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); new_ifn_af = 1; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap) { /* Hmm, it already exists? 
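 * Three sub-cases follow: the address already hangs off this very ifn
 * (at most a stale being-deleted flag is cleared), it hangs off a
 * different ifn (it is moved over to the new one), or its ifn pointer
 * is NULL (it is repaired by attaching it here).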
*/ if ((sctp_ifap->ifn_p) && (sctp_ifap->ifn_p->ifn_index == ifn_index)) { SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n", sctp_ifap->ifn_p->ifn_name, ifn_index, (void *)sctp_ifap); if (new_ifn_af) { /* Remove the created one that we don't want */ sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED); } if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { /* easy to solve, just switch back to active */ SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); sctp_ifap->localifa_flags = SCTP_ADDR_VALID; sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); } exit_stage_left: SCTP_IPI_ADDR_WUNLOCK(); return (sctp_ifap); } else { if (sctp_ifap->ifn_p) { /* * The last IFN gets the address, remove the * old one */ SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", (void *)sctp_ifap, sctp_ifap->ifn_p->ifn_name, sctp_ifap->ifn_p->ifn_index, if_name, ifn_index); /* remove the address from the old ifn */ sctp_remove_ifa_from_ifn(sctp_ifap); /* move the address over to the new ifn */ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); goto exit_stage_left; } else { /* repair ifnp which was NULL ? */ sctp_ifap->localifa_flags = SCTP_ADDR_VALID; SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", (void *)sctp_ifnp, (void *)sctp_ifap); sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); } goto exit_stage_left; } } SCTP_IPI_ADDR_WUNLOCK(); SCTP_MALLOC(sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA); if (sctp_ifap == NULL) { #ifdef INVARIANTS panic("No memory for IFA"); #endif return (NULL); } memset(sctp_ifap, 0, sizeof(struct sctp_ifa)); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifnp->refcount, 1); sctp_ifap->vrf_id = vrf_id; sctp_ifap->ifa = ifa; memcpy(&sctp_ifap->address, addr, addr->sa_len); sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; sctp_ifap->flags = ifa_flags; /* Set scope */ switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = &sctp_ifap->address.sin; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_loop = 1; } if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v4++; if (new_ifn_af) new_ifn_af = AF_INET; break; } #endif #ifdef INET6 case AF_INET6: { /* ok to use deprecated addresses? */ struct sockaddr_in6 *sin6; sin6 = &sctp_ifap->address.sin6; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) { sctp_ifap->src_is_loop = 1; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v6++; if (new_ifn_af) new_ifn_af = AF_INET6; break; } #endif default: new_ifn_af = 0; break; } hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa); if ((sctp_ifap->src_is_priv == 0) && (sctp_ifap->src_is_loop == 0)) { sctp_ifap->src_is_glob = 1; } SCTP_IPI_ADDR_WLOCK(); hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket); sctp_ifap->refcount = 1; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifnp->ifa_count++; vrf->total_ifa_count++; atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); if (new_ifn_af) { SCTP_REGISTER_INTERFACE(ifn_index, new_ifn_af); sctp_ifnp->registered_af = new_ifn_af; } SCTP_IPI_ADDR_WUNLOCK(); if (dynamic_add) { /* * Bump up the refcount so that when the timer completes it * will drop back down. 
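 * (The reference taken here is balanced when the ADDR_WQ timer handles
 * the work item queued below; on the allocation-failure path the code
 * undoes it via sctp_del_addr_from_vrf() instead.)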
*/ struct sctp_laddr *wi; atomic_add_int(&sctp_ifap->refcount, 1); wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Opps, must decrement the count */ sctp_del_addr_from_vrf(vrf_id, addr, ifn_index, if_name); return (NULL); } SCTP_INCR_LADDR_COUNT(); bzero(wi, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_ADD_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); SCTP_WQ_ADDR_UNLOCK(); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); } else { /* it's ready for use */ sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } return (sctp_ifap); } void sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, uint32_t ifn_index, const char *if_name) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap = NULL; SCTP_IPI_ADDR_WLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out_now; } #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap) { /* Validate the delete */ if (sctp_ifap->ifn_p) { int valid = 0; /*- * The name has priority over the ifn_index * if its given. We do this especially for * panda who might recycle indexes fast. */ if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) == 0) { /* They match its a correct delete */ valid = 1; } } if (!valid) { /* last ditch check ifn_index */ if (ifn_index == sctp_ifap->ifn_p->ifn_index) { valid = 1; } } if (!valid) { SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n", ifn_index, ((if_name == NULL) ? "NULL" : if_name)); SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n", sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name); SCTP_IPI_ADDR_WUNLOCK(); return; } } SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap); sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; /* * We don't set the flag. This means that the structure will * hang around in EP's that have bound specific to it until * they close. This gives us TCP like behavior if someone * removes an address (or for that matter adds it right * back). */ /* sctp_ifap->localifa_flags |= SCTP_BEING_DELETED; */ vrf->total_ifa_count--; LIST_REMOVE(sctp_ifap, next_bucket); sctp_remove_ifa_from_ifn(sctp_ifap); } #ifdef SCTP_DEBUG else { SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:", ifn_index); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif out_now: SCTP_IPI_ADDR_WUNLOCK(); if (sctp_ifap) { struct sctp_laddr *wi; wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Oops, must decrement the count */ sctp_free_ifa(sctp_ifap); return; } SCTP_INCR_LADDR_COUNT(); bzero(wi, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_DEL_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); /* * Should this really be a tailq? 
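 * (A TAILQ filled with TAILQ_INSERT_TAIL() would give first-come,
 * first-served processing.)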
As it is we will process * the newest first :-0 */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); SCTP_WQ_ADDR_UNLOCK(); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); } return; } static int sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) { int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { /* no vrf, no addresses */ SCTP_IPI_ADDR_RUNLOCK(); return (0); } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) { /* * We allow pending addresses, where * we have sent an asconf-add to be * considered valid. */ continue; } if (sctp_ifa->address.sa.sa_family != to->sa_family) { continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin, *rsin; sin = &sctp_ifa->address.sin; rsin = (struct sockaddr_in *)to; if ((ipv4_local_scope == 0) && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6, *rsin6; sin6 = &sctp_ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif default: /* TSNH */ break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (sctp_is_addr_restricted(stcb, laddr->ifa) && (!sctp_is_addr_pending(stcb, laddr->ifa))) { /* * We allow pending addresses, where we have * sent an asconf-add to be considered * valid. 
*/ continue; } if (laddr->ifa->address.sa.sa_family != to->sa_family) { continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = &laddr->ifa->address.sin; rsin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = &laddr->ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif default: /* TSNH */ break; } } } SCTP_IPI_ADDR_RUNLOCK(); return (0); } static struct sctp_tcb * sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id) { /**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */ /* * If we support the TCP model, then we must now dig through to see * if we can find our endpoint in the list of tcp ep's. */ uint16_t lport, rport; struct sctppcbhead *ephead; struct sctp_inpcb *inp; struct sctp_laddr *laddr; struct sctp_tcb *stcb; struct sctp_nets *net; if ((to == NULL) || (from == NULL)) { return (NULL); } switch (to->sa_family) { #ifdef INET case AF_INET: if (from->sa_family == AF_INET) { lport = ((struct sockaddr_in *)to)->sin_port; rport = ((struct sockaddr_in *)from)->sin_port; } else { return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (from->sa_family == AF_INET6) { lport = ((struct sockaddr_in6 *)to)->sin6_port; rport = ((struct sockaddr_in6 *)from)->sin6_port; } else { return (NULL); } break; #endif default: return (NULL); } ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; /* * Ok now for each of the guys in this bucket we must look and see: * - Does the remote port match. - Does there single association's * addresses match this address (to). If so we update p_ep to point * to this ep and return the tcb from it. 
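 *
 * Restated: for each endpoint hashed to this bucket we check that its
 * local port matches lport, that its jail and VRF admit the address,
 * that its (single) association owns the local address 'to' and uses
 * remote port rport, and that one of the association's nets matches
 * 'from'; only then is *inp_p updated and the tcb returned.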
*/ LIST_FOREACH(inp, ephead, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if (lport != inp->sctp_lport) { SCTP_INP_RUNLOCK(inp); continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)to; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif default: SCTP_INP_RUNLOCK(inp); continue; } if (inp->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(inp); continue; } /* check to see if the ep has one of the addresses */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* We are NOT bound all, so look further */ int match = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == to->sa_family) { /* see if it matches */ #ifdef INET if (from->sa_family == AF_INET) { struct sockaddr_in *intf_addr, *sin; intf_addr = &laddr->ifa->address.sin; sin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == intf_addr->sin_addr.s_addr) { match = 1; break; } } #endif #ifdef INET6 if (from->sa_family == AF_INET6) { struct sockaddr_in6 *intf_addr6; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *) to; intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { match = 1; break; } } #endif } } if (match == 0) { /* This endpoint does not have this address */ SCTP_INP_RUNLOCK(inp); continue; } } /* * Ok if we hit here the ep has the address, does it hold * the tcb? */ /* XXX: Why don't we TAILQ_FOREACH through sctp_asoc_list? */ stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); continue; } SCTP_TCB_LOCK(stcb); if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->rport != rport) { /* remote port does not match. */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } /* Does this TCB have a matching address? 
*/ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._l_addr.sa.sa_family != from->sa_family) { /* not the same family, can't be a match */ continue; } switch (from->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *)&net->ro._l_addr; rsin = (struct sockaddr_in *)from; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)from; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); } return (NULL); } /* * rules for use * * 1) If I return a NULL you must decrement any INP ref cnt. 2) If I find an * stcb, both will be locked (locked_tcb and stcb) but decrement will be done * (if locked == NULL). 3) Decrement happens on return ONLY if locked == * NULL. */ struct sctp_tcb * sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb) { struct sctpasochead *head; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; struct sctp_nets *net; uint16_t rport; inp = *inp_p; switch (remote->sa_family) { #ifdef INET case AF_INET: rport = (((struct sockaddr_in *)remote)->sin_port); break; #endif #ifdef INET6 case AF_INET6: rport = (((struct sockaddr_in6 *)remote)->sin6_port); break; #endif default: return (NULL); } if (locked_tcb) { /* * UN-lock so we can do proper locking here this occurs when * called from load_addresses_from_init. */ atomic_add_int(&locked_tcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(locked_tcb); } SCTP_INP_INFO_RLOCK(); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /*- * Now either this guy is our listener or it's the * connector. If it is the one that issued the connect, then * it's only chance is to be the first TCB in the list. If * it is the acceptor, then do the special_lookup to hash * and find the real inp. */ if ((inp->sctp_socket) && (inp->sctp_socket->so_qlimit)) { /* to is peer addr, from is my addr */ stcb = sctp_tcb_special_locate(inp_p, remote, local, netp, inp->def_vrf_id); if ((stcb != NULL) && (locked_tcb == NULL)) { /* we have a locked tcb, lower refcount */ SCTP_INP_DECR_REF(inp); } if ((locked_tcb != NULL) && (locked_tcb != stcb)) { SCTP_INP_RLOCK(locked_tcb->sctp_ep); SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); SCTP_INP_RUNLOCK(locked_tcb->sctp_ep); } SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { goto null_return; } SCTP_TCB_LOCK(stcb); if (stcb->rport != rport) { /* remote port does not match. 
*/ SCTP_TCB_UNLOCK(stcb); goto null_return; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); goto null_return; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); goto null_return; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport, inp->sctp_hashmark)]; LIST_FOREACH(stcb, head, sctp_tcbhash) { if (stcb->rport != rport) { /* remote port does not match */ continue; } SCTP_TCB_LOCK(stcb); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); continue; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *) &net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } null_return: /* clean up 
for returning null */ if (locked_tcb) { SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); /* not found */ return (NULL); } /* * Find an association for a specific endpoint using the association id given * out in the COMM_UP notification */ struct sctp_tcb * sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { /* * Use my the assoc_id to find a endpoint */ struct sctpasochead *head; struct sctp_tcb *stcb; uint32_t id; if (inp == NULL) { SCTP_PRINTF("TSNH ep_associd\n"); return (NULL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_PRINTF("TSNH ep_associd0\n"); return (NULL); } id = (uint32_t) asoc_id; head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; if (head == NULL) { /* invalid id TSNH */ SCTP_PRINTF("TSNH ep_associd1\n"); return (NULL); } LIST_FOREACH(stcb, head, sctp_tcbasocidhash) { if (stcb->asoc.assoc_id == id) { if (inp != stcb->sctp_ep) { /* * some other guy has the same id active (id * collision ??). */ SCTP_PRINTF("TSNH ep_associd2\n"); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { continue; } if (want_lock) { SCTP_TCB_LOCK(stcb); } return (stcb); } } return (NULL); } struct sctp_tcb * sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { struct sctp_tcb *stcb; SCTP_INP_RLOCK(inp); stcb = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock); SCTP_INP_RUNLOCK(inp); return (stcb); } /* * Endpoint probe expects that the INP_INFO is locked. */ static struct sctp_inpcb * sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head, uint16_t lport, uint32_t vrf_id) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sockaddr_in6 *intf_addr6; #endif int fnd; #ifdef INET sin = NULL; #endif #ifdef INET6 sin6 = NULL; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; break; #endif default: /* unsupported family */ return (NULL); } if (head == NULL) return (NULL); LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) && (inp->sctp_lport == lport)) { /* got it */ switch (nam->sa_family) { #ifdef INET case AF_INET: if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* * IPv4 on a IPv6 socket with ONLY * IPv6 set */ SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif #ifdef INET6 case AF_INET6: /* * A V6 address and the endpoint is NOT * bound V6 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif default: break; } /* does a VRF id match? 
*/ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; SCTP_INP_RUNLOCK(inp); if (!fnd) continue; return (inp); } SCTP_INP_RUNLOCK(inp); } switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == INADDR_ANY) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif default: break; } /* * ok, not bound to all so see if we can find a EP bound to this * address. */ LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)) { SCTP_INP_RUNLOCK(inp); continue; } /* * Ok this could be a likely candidate, look at all of its * addresses */ if (inp->sctp_lport != lport) { SCTP_INP_RUNLOCK(inp); continue; } /* does a VRF id match? */ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) { SCTP_INP_RUNLOCK(inp); continue; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ", (void *)laddr->ifa); if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == nam->sa_family) { /* possible, see if it matches */ switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == laddr->ifa->address.sin.sin_addr.s_addr) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif #ifdef INET6 case AF_INET6: intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif } } } SCTP_INP_RUNLOCK(inp); } return (NULL); } static struct sctp_inpcb * sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, uint32_t vrf_id) { struct sctppcbhead *head; struct sctp_inpcb *t_inp; int fnd; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; LIST_FOREACH(t_inp, head, sctp_hash) { if (t_inp->sctp_lport != lport) { continue; } /* is it in the VRF in question */ fnd = 0; if (t_inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) continue; /* This one is in use. 
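 *
 * Illustrative summary of the v4/v6 conflict rules applied below (a
 * restatement of the branches, not in the original source):
 *
 *	t_inp bound as       inp bound as         result
 *	v6, V6ONLY           v6 (any flavor)      conflict
 *	v6, V6ONLY           v4 only              no conflict
 *	v6 and v4            anything             conflict
 *	v4 only              v6, V6ONLY           no conflict
 *	v4 only              otherwise            conflict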
*/ /* check the v6/v4 binding issue */ if ((t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(t_inp)) { if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* collision in V6 space */ return (t_inp); } else { /* inp is BOUND_V4 no conflict */ continue; } } else if (t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* t_inp is bound v4 and v6, conflict always */ return (t_inp); } else { /* t_inp is bound only V4 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* no conflict */ continue; } /* else fall through to conflict */ } return (t_inp); } return (NULL); } int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) { /* For 1-2-1 with port reuse */ struct sctppcbhead *head; struct sctp_inpcb *tinp, *ninp; if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* only works with port reuse on */ return (-1); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) { return (0); } SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_WLOCK(); head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; /* Kick out all non-listeners to the TCP hash */ LIST_FOREACH_SAFE(tinp, head, sctp_hash, ninp) { if (tinp->sctp_lport != inp->sctp_lport) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { continue; } if (tinp->sctp_socket->so_qlimit) { continue; } SCTP_INP_WLOCK(tinp); LIST_REMOVE(tinp, sctp_hash); head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(tinp->sctp_lport, SCTP_BASE_INFO(hashtcpmark))]; tinp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; LIST_INSERT_HEAD(head, tinp, sctp_hash); SCTP_INP_WUNLOCK(tinp); } SCTP_INP_WLOCK(inp); /* Pull from where he was */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; LIST_INSERT_HEAD(head, inp, sctp_hash); SCTP_INP_WUNLOCK(inp); SCTP_INP_RLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (0); } struct sctp_inpcb * sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock, uint32_t vrf_id) { /* * First we check the hash table to see if someone has this port * bound with just the port. */ struct sctp_inpcb *inp; struct sctppcbhead *head; int lport; unsigned int i; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; lport = sin->sin_port; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; lport = sin6->sin6_port; break; #endif default: return (NULL); } /* * I could cheat here and just cast to one of the types but we will * do it right. It also provides the check against an Unsupported * type too. */ /* Find the head of the ALLADDR chain */ if (have_lock == 0) { SCTP_INP_INFO_RLOCK(); } head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); /* * If the TCP model exists it could be that the main listening * endpoint is gone but there still exists a connected socket for * this guy. If so we can return the first one that we find. This * may NOT be the correct one so the caller should be wary on the * returned INP. Currently the only caller that sets find_tcp_pool * is in bindx where we are verifying that a user CAN bind the * address. He either has bound it already, or someone else has, or * its open to bind, so this is good enough. 
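 *
 * A hypothetical caller sketch (illustrative only, not taken from this
 * file): the INP comes back with its refcount bumped, so a bindx-style
 * verifier must drop the reference when it is done:
 *
 *	inp = sctp_pcb_findep(addr, 1, 0, vrf_id);
 *	if (inp != NULL) {
 *		... the address is bound, or bindable by us ...
 *		SCTP_INP_DECR_REF(inp);
 *	}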
 */
	if (inp == NULL && find_tcp_pool) {
		for (i = 0; i < SCTP_BASE_INFO(hashtcpmark) + 1; i++) {
			head = &SCTP_BASE_INFO(sctp_tcpephash)[i];
			inp = sctp_endpoint_probe(nam, head, lport, vrf_id);
			if (inp) {
				break;
			}
		}
	}
	if (inp) {
		SCTP_INP_INCR_REF(inp);
	}
	if (have_lock == 0) {
		SCTP_INP_INFO_RUNLOCK();
	}
	return (inp);
}

/*
 * Find an association for an endpoint with the pointer to whom you want to
 * send to and the endpoint pointer. The address can be IPv4 or IPv6. We may
 * need to change the *to to some other struct like an mbuf...
 */
struct sctp_tcb *
sctp_findassociation_addr_sa(struct sockaddr *from, struct sockaddr *to,
    struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool,
    uint32_t vrf_id)
{
	struct sctp_inpcb *inp = NULL;
	struct sctp_tcb *stcb;

	SCTP_INP_INFO_RLOCK();
	if (find_tcp_pool) {
		if (inp_p != NULL) {
			stcb = sctp_tcb_special_locate(inp_p, from, to, netp,
			    vrf_id);
		} else {
			stcb = sctp_tcb_special_locate(&inp, from, to, netp,
			    vrf_id);
		}
		if (stcb != NULL) {
			SCTP_INP_INFO_RUNLOCK();
			return (stcb);
		}
	}
	inp = sctp_pcb_findep(to, 0, 1, vrf_id);
	if (inp_p != NULL) {
		*inp_p = inp;
	}
	SCTP_INP_INFO_RUNLOCK();
	if (inp == NULL) {
		return (NULL);
	}
	/*
	 * OK, we have an endpoint, now let's find the assoc for it (if any).
	 * We now place the source address (or from) in the to of the find
	 * endpoint call, since in reality this chain is used from the
	 * inbound packet side.
	 */
	if (inp_p != NULL) {
		stcb = sctp_findassociation_ep_addr(inp_p, from, netp, to,
		    NULL);
	} else {
		stcb = sctp_findassociation_ep_addr(&inp, from, netp, to,
		    NULL);
	}
	return (stcb);
}

/*
 * This routine will grub through the mbuf that is an INIT or INIT-ACK and
 * find all addresses that the sender has specified in any address list. Each
 * address will be used to look up the TCB and see if one exists.
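 *
 * The search is the standard TLV walk over the INIT parameters, which
 * the body below implements (condensed restatement, not new code):
 *
 *	offset += sizeof(struct sctp_init_chunk);
 *	phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf));
 *	while (phdr != NULL) {
 *		plen = ntohs(phdr->param_length);
 *		if (plen == 0)
 *			break;
 *		... on SCTP_IPV4_ADDRESS or SCTP_IPV6_ADDRESS, copy the
 *		    address out and try sctp_findassociation_ep_addr() ...
 *		offset += SCTP_SIZE32(plen);
 *		phdr = sctp_get_next_param(m, offset, &parm_buf,
 *		    sizeof(parm_buf));
 *	}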
*/ static struct sctp_tcb * sctp_findassociation_special_addr(struct mbuf *m, int offset, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, struct sockaddr *dst) { struct sctp_paramhdr *phdr, parm_buf; #if defined(INET) || defined(INET6) struct sctp_tcb *stcb; uint16_t ptype; #endif uint16_t plen; #ifdef INET struct sockaddr_in sin4; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif #ifdef INET memset(&sin4, 0, sizeof(sin4)); sin4.sin_len = sizeof(sin4); sin4.sin_family = AF_INET; sin4.sin_port = sh->src_port; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_port = sh->src_port; #endif offset += sizeof(struct sctp_init_chunk); phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); while (phdr != NULL) { /* now we must see if we want the parameter */ #if defined(INET) || defined(INET6) ptype = ntohs(phdr->param_type); #endif plen = ntohs(phdr->param_length); if (plen == 0) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS && plen == sizeof(struct sctp_ipv4addr_param)) { /* Get the rest of the address */ struct sctp_ipv4addr_param ip4_parm, *p4; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip4_parm, min(plen, sizeof(ip4_parm))); if (phdr == NULL) { return (NULL); } p4 = (struct sctp_ipv4addr_param *)phdr; memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin4, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS && plen == sizeof(struct sctp_ipv6addr_param)) { /* Get the rest of the address */ struct sctp_ipv6addr_param ip6_parm, *p6; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip6_parm, min(plen, sizeof(ip6_parm))); if (phdr == NULL) { return (NULL); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin6, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif offset += SCTP_SIZE32(plen); phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); } return (NULL); } static struct sctp_tcb * sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport, uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag) { /* * Use my vtag to hash. If we find it we then verify the source addr * is in the assoc. If all goes well we save a bit on rec of a * packet. 
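 *
 * Condensed restatement of the fast path below (not new code): the
 * bucket is chosen by hashing the vtag,
 *
 *	head = &SCTP_BASE_INFO(sctp_asochash)[
 *	    SCTP_PCBHASH_ASOC(vtag, SCTP_BASE_INFO(hashasocmark))];
 *
 * and each candidate must still pass the vtag, rport, lport, state and
 * address-ownership checks before it can be returned.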
*/ struct sctpasochead *head; struct sctp_nets *net; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { SCTP_INP_RLOCK(stcb->sctp_ep); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } if (stcb->sctp_ep->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(stcb->sctp_ep); if (stcb->asoc.my_vtag == vtag) { /* candidate */ if (stcb->rport != rport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->sctp_ep->sctp_lport != lport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } /* RRS:Need toaddr check here */ if (sctp_does_stcb_own_this_addr(stcb, to) == 0) { /* Endpoint does not own this address */ SCTP_TCB_UNLOCK(stcb); continue; } if (remote_tag) { /* * If we have both vtags that's all we match * on */ if (stcb->asoc.peer_vtag == remote_tag) { /* * If both tags match we consider it * conclusive and check NO * source/destination addresses */ goto conclusive; } } if (skip_src_check) { conclusive: if (from) { *netp = sctp_findnet(stcb, from); } else { *netp = NULL; /* unknown */ } if (inp_p) *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } net = sctp_findnet(stcb, from); if (net) { /* yep its him. */ *netp = net; SCTP_STAT_INCR(sctps_vtagexpress); *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { /* * not him, this should only happen in rare * cases so I peg it. */ SCTP_STAT_INCR(sctps_vtagbogus); } } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_INFO_RUNLOCK(); return (NULL); } /* * Find an association with the pointer to the inbound IP packet. This can be * a IPv4 or IPv6 packet. */ struct sctp_tcb * sctp_findassociation_addr(struct mbuf *m, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { int find_tcp_pool; struct sctp_tcb *stcb; struct sctp_inpcb *inp; if (sh->v_tag) { /* we only go down this path if vtag is non-zero */ stcb = sctp_findassoc_by_vtag(src, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0); if (stcb) { return (stcb); } } find_tcp_pool = 0; if ((ch->chunk_type != SCTP_INITIATION) && (ch->chunk_type != SCTP_INITIATION_ACK) && (ch->chunk_type != SCTP_COOKIE_ACK) && (ch->chunk_type != SCTP_COOKIE_ECHO)) { /* Other chunk types go to the tcp pool. */ find_tcp_pool = 1; } if (inp_p) { stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp, find_tcp_pool, vrf_id); inp = *inp_p; } else { stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp, find_tcp_pool, vrf_id); } SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp); if (stcb == NULL && inp) { /* Found a EP but not this address */ if ((ch->chunk_type == SCTP_INITIATION) || (ch->chunk_type == SCTP_INITIATION_ACK)) { /*- * special hook, we do NOT return linp or an * association that is linked to an existing * association that is under the TCP pool (i.e. no * listener exists). The endpoint finding routine * will always find a listener before examining the * TCP pool. 
*/ if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) { if (inp_p) { *inp_p = NULL; } return (NULL); } stcb = sctp_findassociation_special_addr(m, offset, sh, &inp, netp, dst); if (inp_p != NULL) { *inp_p = inp; } } } SCTPDBG(SCTP_DEBUG_PCB1, "stcb is %p\n", (void *)stcb); return (stcb); } /* * lookup an association by an ASCONF lookup address. * if the lookup address is 0.0.0.0 or ::0, use the vtag to do the lookup */ struct sctp_tcb * sctp_findassociation_ep_asconf(struct mbuf *m, int offset, struct sockaddr *dst, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; union sctp_sockstore remote_store; struct sctp_paramhdr parm_buf, *phdr; int ptype; int zero_address = 0; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif memset(&remote_store, 0, sizeof(remote_store)); phdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &parm_buf, sizeof(struct sctp_paramhdr)); if (phdr == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n", __func__); return NULL; } ptype = (int)((uint32_t) ntohs(phdr->param_type)); /* get the correlation address */ switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: { /* ipv6 address param */ struct sctp_ipv6addr_param *p6, p6_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv6addr_param)) { return NULL; } p6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p6_buf.ph, sizeof(*p6)); if (p6 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n", __func__); return (NULL); } sin6 = &remote_store.sin6; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = sh->src_port; memcpy(&sin6->sin6_addr, &p6->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; break; } #endif #ifdef INET case SCTP_IPV4_ADDRESS: { /* ipv4 address param */ struct sctp_ipv4addr_param *p4, p4_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv4addr_param)) { return NULL; } p4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p4_buf.ph, sizeof(*p4)); if (p4 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n", __func__); return (NULL); } sin = &remote_store.sin; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = sh->src_port; memcpy(&sin->sin_addr, &p4->addr, sizeof(struct in_addr)); if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; break; } #endif default: /* invalid address param type */ return NULL; } if (zero_address) { stcb = sctp_findassoc_by_vtag(NULL, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 1, vrf_id, 0); if (stcb != NULL) { SCTP_INP_DECR_REF(*inp_p); } } else { stcb = sctp_findassociation_ep_addr(inp_p, &remote_store.sa, netp, dst, NULL); } return (stcb); } /* * allocate a sctp_inpcb and setup a temporary binding to a port/all * addresses. This way if we don't get a bind we by default pick a ephemeral * port with all addresses bound. */ int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) { /* * we get called when a new endpoint starts up. We need to allocate * the sctp_inpcb structure from the zone and init it. Mark it as * unbound and find a port that we can use as an ephemeral with * INADDR_ANY. If the user binds later no problem we can then add in * the specific addresses. And setup the default parameters for the * EP. 
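 *
 * Illustrative call sequence (an assumption about the protosw attach
 * path, not taken from this file):
 *
 *	error = sctp_inpcb_alloc(so, vrf_id);
 *	if (error != 0)
 *		return (error);
 *	...
 *	error = sctp_inpcb_bind(so, addr, NULL, td);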
*/ int i, error; struct sctp_inpcb *inp; struct sctp_pcb *m; struct timeval time; sctp_sharedkey_t *null_key; error = 0; SCTP_INP_INFO_WLOCK(); inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb); if (inp == NULL) { SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n"); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); return (ENOBUFS); } /* zap it */ bzero(inp, sizeof(*inp)); /* bump generations */ /* setup socket pointers */ inp->sctp_socket = so; inp->ip_inp.inp.inp_socket = so; inp->ip_inp.inp.inp_cred = crhold(so->so_cred); #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { if (MODULE_GLOBAL(ip6_auto_flowlabel)) { inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL; } if (MODULE_GLOBAL(ip6_v6only)) { inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY; } } #endif inp->sctp_associd_counter = 1; inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT; inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; inp->max_cwnd = 0; inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off); inp->ecn_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_ecn_enable); inp->prsctp_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_pr_enable); inp->auth_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_auth_enable); inp->asconf_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_asconf_enable); inp->reconfig_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_reconfig_enable); inp->nrsack_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_nrsack_enable); inp->pktdrop_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_pktdrop_enable); inp->fibnum = so->so_fibnum; /* init the small hash table we use to track asocid <-> tcb */ inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark); if (inp->sctp_asocidhash == NULL) { crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_INP_INFO_WUNLOCK(); return (ENOBUFS); } #ifdef IPSEC error = ipsec_init_policy(so, &inp->ip_inp.inp.inp_sp); if (error != 0) { crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_INP_INFO_WUNLOCK(); return error; } #endif /* IPSEC */ SCTP_INCR_EP_COUNT(); inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); SCTP_INP_INFO_WUNLOCK(); so->so_pcb = (caddr_t)inp; if (SCTP_SO_TYPE(so) == SOCK_SEQPACKET) { /* UDP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure it is NON-BLOCKING IO for UDP */ /* SCTP_SET_SO_NBIO(so); */ } else if (SCTP_SO_TYPE(so) == SOCK_STREAM) { /* TCP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure we have blocking IO by default */ SCTP_CLEAR_SO_NBIO(so); } else { /* * unsupported socket type (RAW, etc)- in case we missed it * in protosw */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); #ifdef IPSEC ipsec_delete_pcbpolicy(&inp->ip_inp.inp); #endif SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (EOPNOTSUPP); } if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); 
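	/*
	 * Illustrative summary of sctp_default_frag_interleave (a
	 * restatement of the branches around this point, not in the
	 * original source):
	 *
	 *	level	FRAG_INTERLEAVE		INTERLEAVE_STRMS
	 *	0	off			off
	 *	1	on			off
	 *	2	on			on
	 */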
sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize), &inp->sctp_hashmark); if (inp->sctp_tcbhash == NULL) { SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); #ifdef IPSEC ipsec_delete_pcbpolicy(&inp->ip_inp.inp); #endif SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (ENOBUFS); } inp->def_vrf_id = vrf_id; SCTP_INP_INFO_WLOCK(); SCTP_INP_LOCK_INIT(inp); INP_LOCK_INIT(&inp->ip_inp.inp, "inp", "sctpinp"); SCTP_INP_READ_INIT(inp); SCTP_ASOC_CREATE_LOCK_INIT(inp); /* lock the new ep */ SCTP_INP_WLOCK(inp); /* add it to the info area */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list); SCTP_INP_INFO_WUNLOCK(); TAILQ_INIT(&inp->read_queue); LIST_INIT(&inp->sctp_addr_list); LIST_INIT(&inp->sctp_asoc_list); #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_INIT(&inp->sctp_asoc_free_list); #endif /* Init the timer structure for signature change */ SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer); inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE; /* now init the actual endpoint default data */ m = &inp->sctp_ep; /* setup the base timeout information */ m->sctp_timeoutticks[SCTP_TIMER_SEND] = SEC_TO_TICKS(SCTP_SEND_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_INIT] = SEC_TO_TICKS(SCTP_INIT_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default)); m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default)); m->sctp_timeoutticks[SCTP_TIMER_PMTU] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default)); m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default)); m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default)); /* all max/min max are in ms */ m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default); m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default); m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); m->def_net_pf_threshold = SCTP_BASE_SYSCTL(sctp_path_pf_threshold); m->sctp_sws_sender = SCTP_SWS_SENDER_DEF; m->sctp_sws_receiver = SCTP_SWS_RECEIVER_DEF; m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default); m->fr_max_burst = SCTP_BASE_SYSCTL(sctp_fr_max_burst_default); m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module); m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default); /* number of streams to pre-open on a association */ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); /* Add adaptation cookie */ m->adaptation_layer_indicator = 0; m->adaptation_layer_indicator_provided = 0; /* seed random number generator */ m->random_counter = 1; m->store_at = SCTP_SIGNATURE_SIZE; SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers)); sctp_fill_random_store(m); /* Minimum cookie size */ m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) + 
sizeof(struct sctp_state_cookie); m->size_of_a_cookie += SCTP_SIGNATURE_SIZE; /* Setup the initial secret */ (void)SCTP_GETTIME_TIMEVAL(&time); m->time_of_secret_change = time.tv_sec; for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { m->secret_key[0][i] = sctp_select_initial_TSN(m); } sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL); /* How long is a cookie good for ? */ m->def_cookie_life = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default)); /* * Initialize authentication parameters */ m->local_hmacs = sctp_default_supported_hmaclist(); m->local_auth_chunks = sctp_alloc_chunklist(); if (inp->asconf_supported) { sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks); sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks); } m->default_dscp = 0; #ifdef INET6 m->default_flowlabel = 0; #endif m->port = 0; /* encapsulation disabled by default */ LIST_INIT(&m->shared_keys); /* add default NULL key as key id 0 */ null_key = sctp_alloc_sharedkey(); sctp_insert_sharedkey(&m->shared_keys, null_key); SCTP_INP_WUNLOCK(inp); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 12); #endif return (error); } void sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, struct sctp_tcb *stcb) { struct sctp_nets *net; uint16_t lport, rport; struct sctppcbhead *head; struct sctp_laddr *laddr, *oladdr; atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(old_inp); SCTP_INP_WLOCK(new_inp); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); new_inp->sctp_ep.time_of_secret_change = old_inp->sctp_ep.time_of_secret_change; memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key, sizeof(old_inp->sctp_ep.secret_key)); new_inp->sctp_ep.current_secret_number = old_inp->sctp_ep.current_secret_number; new_inp->sctp_ep.last_secret_number = old_inp->sctp_ep.last_secret_number; new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie; /* make it so new data pours into the new socket */ stcb->sctp_socket = new_inp->sctp_socket; stcb->sctp_ep = new_inp; /* Copy the port across */ lport = new_inp->sctp_lport = old_inp->sctp_lport; rport = stcb->rport; /* Pull the tcb from the old association */ LIST_REMOVE(stcb, sctp_tcbhash); LIST_REMOVE(stcb, sctp_tcblist); if (stcb->asoc.in_asocid_hash) { LIST_REMOVE(stcb, sctp_tcbasocidhash); } /* Now insert the new_inp into the TCP connected hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; LIST_INSERT_HEAD(head, new_inp, sctp_hash); /* Its safe to access */ new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; /* Now move the tcb into the endpoint list */ LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist); /* * Question, do we even need to worry about the ep-hash since we * only have one connection? Probably not :> so lets get rid of it * and not suck up any kernel memory in that. */ if (stcb->asoc.in_asocid_hash) { struct sctpasochead *lhd; lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id, new_inp->hashasocidmark)]; LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash); } /* Ok. Let's restart timer. 
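 *
 * For reference, entry to this function already performed the
 * lock-ordering dance shown near the top (restated, not new code):
 *
 *	atomic_add_int(&stcb->asoc.refcnt, 1);
 *	SCTP_TCB_UNLOCK(stcb);
 *	SCTP_INP_INFO_WLOCK();
 *	SCTP_INP_WLOCK(old_inp);
 *	SCTP_INP_WLOCK(new_inp);
 *	SCTP_TCB_LOCK(stcb);
 *	atomic_subtract_int(&stcb->asoc.refcnt, 1);
 *
 * so both INPs and the TCB are held while the association is moved.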
*/ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp, stcb, net); } SCTP_INP_INFO_WUNLOCK(); if (new_inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark); new_inp->sctp_tcbhash = NULL; } if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* Subset bound, so copy in the laddr list from the old_inp */ LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) { laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* * Gak, what can we do? This assoc is really * HOSED. We probably should send an abort * here. */ SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n"); continue; } SCTP_INCR_LADDR_COUNT(); bzero(laddr, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = oladdr->ifa; atomic_add_int(&laddr->ifa->refcount, 1); LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr, sctp_nxt_addr); new_inp->laddr_count++; if (oladdr == stcb->asoc.last_used_address) { stcb->asoc.last_used_address = laddr; } } } /* * Now any running timers need to be adjusted since we really don't * care if they are running or not just blast in the new_inp into * all of them. */ stcb->asoc.dack_timer.ep = (void *)new_inp; stcb->asoc.asconf_timer.ep = (void *)new_inp; stcb->asoc.strreset_timer.ep = (void *)new_inp; stcb->asoc.shut_guard_timer.ep = (void *)new_inp; stcb->asoc.autoclose_timer.ep = (void *)new_inp; stcb->asoc.delayed_event_timer.ep = (void *)new_inp; stcb->asoc.delete_prim_timer.ep = (void *)new_inp; /* now what about the nets? */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->pmtu_timer.ep = (void *)new_inp; net->hb_timer.ep = (void *)new_inp; net->rxt_timer.ep = (void *)new_inp; } SCTP_INP_WUNLOCK(new_inp); SCTP_INP_WUNLOCK(old_inp); } /* sctp_ifap is used to bypass normal local address validation checks */ int sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, struct sctp_ifa *sctp_ifap, struct thread *p) { /* bind a ep to a socket address */ struct sctppcbhead *head; struct sctp_inpcb *inp, *inp_tmp; struct inpcb *ip_inp; int port_reuse_active = 0; int bindall; uint16_t lport; int error; uint32_t vrf_id; lport = 0; bindall = 1; inp = (struct sctp_inpcb *)so->so_pcb; ip_inp = (struct inpcb *)so->so_pcb; #ifdef SCTP_DEBUG if (addr) { SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port: %d\n", ntohs(((struct sockaddr_in *)addr)->sin_port)); SCTPDBG(SCTP_DEBUG_PCB1, "Addr: "); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { /* already did a bind, subsequent binds NOT allowed ! */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } #ifdef INVARIANTS if (p == NULL) panic("null proc/thread"); #endif if (addr != NULL) { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; /* IPV6_V6ONLY socket? */ if (SCTP_IPV6_V6ONLY(ip_inp)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } if (addr->sa_len != sizeof(*sin)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } sin = (struct sockaddr_in *)addr; lport = sin->sin_port; /* * For LOOPBACK the prison_local_ip4() call * will transmute the ip address to the * proper value. 
*/ if (p && (error = prison_local_ip4(p->td_ucred, &sin->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); return (error); } if (sin->sin_addr.s_addr != INADDR_ANY) { bindall = 0; } break; } #endif #ifdef INET6 case AF_INET6: { /* * Only for pure IPv6 Address. (No IPv4 * Mapped!) */ struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (addr->sa_len != sizeof(*sin6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } lport = sin6->sin6_port; /* * For LOOPBACK the prison_local_ip6() call * will transmute the ipv6 address to the * proper value. */ if (p && (error = prison_local_ip6(p->td_ucred, &sin6->sin6_addr, (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); return (error); } if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { bindall = 0; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } } /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; break; } #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EAFNOSUPPORT); return (EAFNOSUPPORT); } } SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); /* Setup a vrf_id to be the default for the non-bind-all case. */ vrf_id = inp->def_vrf_id; /* increase our count due to the unlock we do */ SCTP_INP_INCR_REF(inp); if (lport) { /* * Did the caller specify a port? if so we must see if an ep * already has this one bound. */ /* got to be root to get at low ports */ if (ntohs(lport) < IPPORT_RESERVED) { if (p && (error = priv_check(p, PRIV_NETINET_RESERVEDPORT) )) { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (error); } } SCTP_INP_WUNLOCK(inp); if (bindall) { vrf_id = inp->def_vrf_id; inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. */ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_DECR_REF(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } } else { inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. 
*/ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_DECR_REF(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } } continue_anyway: SCTP_INP_WLOCK(inp); if (bindall) { /* verify that no lport is not used by a singleton */ if ((port_reuse_active == 0) && (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id))) { /* Sorry someone already has this one bound */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { port_reuse_active = 1; } else { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } } } } else { uint16_t first, last, candidate; uint16_t count; int done; if (ip_inp->inp_flags & INP_HIGHPORT) { first = MODULE_GLOBAL(ipport_hifirstauto); last = MODULE_GLOBAL(ipport_hilastauto); } else if (ip_inp->inp_flags & INP_LOWPORT) { if (p && (error = priv_check(p, PRIV_NETINET_RESERVEDPORT) )) { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); return (error); } first = MODULE_GLOBAL(ipport_lowfirstauto); last = MODULE_GLOBAL(ipport_lowlastauto); } else { first = MODULE_GLOBAL(ipport_firstauto); last = MODULE_GLOBAL(ipport_lastauto); } if (first > last) { uint16_t temp; temp = first; first = last; last = temp; } count = last - first + 1; /* number of candidates */ candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count); done = 0; while (!done) { if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) { done = 1; } if (!done) { if (--count == 0) { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } if (candidate == last) candidate = first; else candidate = candidate + 1; } } lport = htons(candidate); } SCTP_INP_DECR_REF(inp); if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* * this really should not happen. The guy did a non-blocking * bind and then did a close at the same time. 
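 *
 * (Aside: the ephemeral-port scan just above can be read as this
 * condensed pseudocode, a restatement of the loop and not new logic:
 *
 *	count = last - first + 1;
 *	candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % count;
 *	for (;;) {
 *		if (sctp_isport_inuse(inp, htons(candidate),
 *		    inp->def_vrf_id) == NULL)
 *			break;
 *		if (--count == 0)
 *			return (EADDRINUSE);
 *		candidate = (candidate == last) ? first : candidate + 1;
 *	}
 *	lport = htons(candidate);
 *
 * so at most one full wrap of the configured range is attempted.)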
*/ SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } /* ok we look clear to give out this port, so lets setup the binding */ if (bindall) { /* binding to all addresses, so just set in the proper flags */ inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL; /* set the automatic addr changes from kernel flag */ if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } /* * set the automatic mobility_base from kernel flag (by * micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } /* * set the automatic mobility_fasthandoff from kernel flag * (by micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } } else { /* * bind specific, make sure flags is off and add a new * address structure to the sctp_addr_list inside the ep * structure. * * We will need to allocate one and insert it at the head. The * socketopt call can just insert new addresses in there as * well. It will also have to do the embed scope kame hack * too (before adding). */ struct sctp_ifa *ifa; union sctp_sockstore store; memset(&store, 0, sizeof(store)); switch (addr->sa_family) { #ifdef INET case AF_INET: memcpy(&store.sin, addr, sizeof(struct sockaddr_in)); store.sin.sin_port = 0; break; #endif #ifdef INET6 case AF_INET6: memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6)); store.sin6.sin6_port = 0; break; #endif default: break; } /* * first find the interface with the bound address need to * zero out the port to find the address! yuck! can't do * this earlier since need port for sctp_pcb_findep() */ if (sctp_ifap != NULL) { ifa = sctp_ifap; } else { /* * Note for BSD we hit here always other O/S's will * pass things in via the sctp_ifap argument * (Panda). */ ifa = sctp_find_ifa_by_addr(&store.sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } if (ifa == NULL) { /* Can't find an interface with that address */ SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRNOTAVAIL); return (EADDRNOTAVAIL); } #ifdef INET6 if (addr->sa_family == AF_INET6) { /* GAK, more FIXME IFA lock? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. 
*/ SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } } #endif /* we're not bound all */ inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL; /* allow bindx() to send ASCONF's for binding changes */ sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); /* clear automatic addr changes from kernel flag */ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); /* add this address to the endpoint list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0); if (error != 0) { SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (error); } inp->laddr_count++; } /* find the bucket */ if (port_reuse_active) { /* Put it into tcp 1-2-1 hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))]; inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; } else { head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; } /* put it in the bucket */ LIST_INSERT_HEAD(head, inp, sctp_hash); SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n", (void *)head, ntohs(lport), port_reuse_active); /* set in the port */ inp->sctp_lport = lport; /* turn off just the unbound flag */ inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (0); } static void sctp_iterator_inp_being_freed(struct sctp_inpcb *inp) { struct sctp_iterator *it, *nit; /* * We enter with the only the ITERATOR_LOCK in place and a write * lock on the inp_info stuff. */ it = sctp_it_ctl.cur_it; if (it && (it->vn != curvnet)) { /* Its not looking at our VNET */ return; } if (it && (it->inp == inp)) { /* * This is tricky and we hold the iterator lock, but when it * returns and gets the lock (when we release it) the * iterator will try to operate on inp. We need to stop that * from happening. But of course the iterator has a * reference on the stcb and inp. We can mark it and it will * stop. * * If its a single iterator situation, we set the end iterator * flag. Otherwise we set the iterator to go to the next * inp. * */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } else { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP; } } /* * Now go through and remove any single reference to our inp that * may be still pending on the list */ SCTP_IPI_ITERATOR_WQ_LOCK(); TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } if (it->inp == inp) { /* This one points to me is it inp specific? */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { /* Remove and free this one */ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } else { it->inp = LIST_NEXT(it->inp, sctp_list); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } } /* * When its put in the refcnt is incremented so decr * it */ SCTP_INP_DECR_REF(inp); } } SCTP_IPI_ITERATOR_WQ_UNLOCK(); } /* release sctp_inpcb unbind the port */ void sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) { /* * Here we free a endpoint. We must find it (if it is in the Hash * table) and remove it from there. Then we must also find it in the * overall list and remove it from there. After all removals are * complete then any timer has to be stopped. Then start the actual * freeing. a) Any local lists. b) Any associations. c) The hash of * all associations. 
d) finally the ep itself. */ struct sctp_tcb *asoc, *nasoc; struct sctp_laddr *laddr, *nladdr; struct inpcb *ip_pcb; struct socket *so; int being_refed = 0; struct sctp_queued_to_read *sq, *nsq; int cnt; sctp_sharedkey_t *shared_key, *nshared_key; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 0); #endif SCTP_ITERATOR_LOCK(); /* mark any iterators on the list or being processed */ sctp_iterator_inp_being_freed(inp); SCTP_ITERATOR_UNLOCK(); so = inp->sctp_socket; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* been here before.. eeks.. get out of here */ SCTP_PRINTF("This conflict in free SHOULD not be happening! from %d, imm %d\n", from, immediate); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 1); #endif return; } SCTP_ASOC_CREATE_LOCK(inp); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP; /* socket is gone, so no more wakeups allowed */ inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; } /* First time through we have the socket lock, after that no more. */ sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL, SCTP_FROM_SCTP_PCB + SCTP_LOC_1); if (inp->control) { sctp_m_freem(inp->control); inp->control = NULL; } if (inp->pkt) { sctp_m_freem(inp->pkt); inp->pkt = NULL; } ip_pcb = &inp->ip_inp.inp; /* we could just cast the main pointer * here but I will be nice :> (i.e. * ip_pcb = ep;) */ if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { int cnt_in_sd; cnt_in_sd = 0; LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_list, sctp_tcblist, nasoc) { SCTP_TCB_LOCK(asoc); if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* Skip guys being freed */ cnt_in_sd++; if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { /* * Special case - we did not start a * kill timer on the asoc due to it * was not closed. So go ahead and * start it now. */ asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); } SCTP_TCB_UNLOCK(asoc); continue; } if (((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_ECHOED)) && (asoc->asoc.total_output_queue_size == 0)) { /* * If we have data in queue, we don't want * to just free since the app may have done, * send()/close or connect/send/close. And * it wants the data to get across first. 
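 *
 * The graceful-close decision below can be summarized as (illustrative
 * restatement, not in the original source):
 *
 *	COOKIE_WAIT/COOKIE_ECHOED, nothing queued   -> free the assoc now
 *	unread or partial data still on the socket  -> abort with cause
 *	send and sent queues both empty             -> send SHUTDOWN
 *	otherwise                                   -> mark SHUTDOWN_PENDING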
*/ /* Just abandon things in the front states */ if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) { cnt_in_sd++; } continue; } /* Disconnect the socket please */ asoc->sctp_socket = NULL; asoc->asoc.state |= SCTP_STATE_CLOSED_SOCKET; if ((asoc->asoc.size_on_reasm_queue > 0) || (asoc->asoc.control_pdapi) || (asoc->asoc.size_on_all_streams > 0) || (so && (so->so_rcv.sb_cc > 0))) { /* Left with Data unread */ struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) { cnt_in_sd++; } continue; } else if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && (asoc->asoc.stream_queue_cnt == 0)) { if (asoc->asoc.locked_on_sending) { goto abort_anyway; } if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* * there is nothing queued to send, * so I send shutdown */ if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(asoc); if (asoc->asoc.alternate) { netp = asoc->asoc.alternate; } else { netp = asoc->asoc.primary_destination; } sctp_send_shutdown(asoc, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, asoc->sctp_ep, asoc, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, asoc->asoc.primary_destination); sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED); } } else { /* mark into shutdown pending */ struct sctp_stream_queue_pending *sp; asoc->asoc.state |= SCTP_STATE_SHUTDOWN_PENDING; sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, asoc->asoc.primary_destination); if (asoc->asoc.locked_on_sending) { sp = TAILQ_LAST(&((asoc->asoc.locked_on_sending)->outqueue), sctp_streamhead); if (sp == NULL) { SCTP_PRINTF("Error, sp is NULL, locked on sending is %p strm:%d\n", (void *)asoc->asoc.locked_on_sending, asoc->asoc.locked_on_sending->stream_no); } else { if ((sp->length == 0) && (sp->msg_is_complete == 0)) asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT; } } if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && (asoc->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) { cnt_in_sd++; } continue; } else { sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); } } cnt_in_sd++; SCTP_TCB_UNLOCK(asoc); } /* now is there some left in our SHUTDOWN 
state? */ if (cnt_in_sd) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 2); #endif inp->sctp_socket = NULL; SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } } inp->sctp_socket = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) != SCTP_PCB_FLAGS_UNBOUND) { /* * ok, this guy has been bound. It's port is somewhere in * the SCTP_BASE_INFO(hash table). Remove it! */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND; } /* * If there is a timer running to kill us, forget it, since it may * have a contest on the INP lock.. which would cause us to die ... */ cnt = 0; LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_list, sctp_tcblist, nasoc) { SCTP_TCB_LOCK(asoc); if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); } cnt++; SCTP_TCB_UNLOCK(asoc); continue; } /* Free associations that are NOT killing us */ if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_COOKIE_WAIT) && ((asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) { struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } else if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { cnt++; SCTP_TCB_UNLOCK(asoc); continue; } if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) { cnt++; } } if (cnt) { /* Ok we have someone out there that will kill us */ (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 3); #endif SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } if (SCTP_INP_LOCK_CONTENDED(inp)) being_refed++; if (SCTP_INP_READ_CONTENDED(inp)) being_refed++; if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp)) being_refed++; if ((inp->refcount) || (being_refed) || (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) { (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 4); #endif sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } inp->sctp_ep.signature_change.type = 0; inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE; /* * Remove it from the list .. last thing we need a lock for. */ LIST_REMOVE(inp, sctp_list); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); /* * Now we release all locks. Since this INP cannot be found anymore * except possibly by the kill timer that might be running. We call * the drain function here. It should hit the case were it sees the * ACTIVE flag cleared and exit out freeing us to proceed and * destroy everything. 
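 *
 * Condensed restatement of the gate applied just above (not new code):
 *
 *	if (any of the INP, read or asoc-create locks are contended, or
 *	    inp->refcount != 0, or
 *	    (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP) is set)
 *		start SCTP_TIMER_TYPE_INPKILL and return;
 *
 * so the final teardown only runs once nothing can still reach the INP.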
*/ if (from != SCTP_CALLED_FROM_INPKILL_TIMER) { (void)SCTP_OS_TIMER_STOP_DRAIN(&inp->sctp_ep.signature_change.timer); } else { /* Probably un-needed */ (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer); } #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 5); #endif if ((inp->sctp_asocidhash) != NULL) { SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark); inp->sctp_asocidhash = NULL; } /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(sq, &inp->read_queue, next, nsq) { /* Its only abandoned if it had data left */ if (sq->length) SCTP_STAT_INCR(sctps_left_abandon); TAILQ_REMOVE(&inp->read_queue, sq, next); sctp_free_remote_addr(sq->whoFrom); if (so) so->so_rcv.sb_cc -= sq->length; if (sq->data) { sctp_m_freem(sq->data); sq->data = NULL; } /* * no need to free the net count, since at this point all * assoc's are gone. */ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq); SCTP_DECR_READQ_COUNT(); } /* Now the sctp_pcb things */ /* * free each asoc if it is not already closed/free. we can't use the * macro here since le_next will get freed as part of the * sctp_free_assoc() call. */ #ifdef IPSEC ipsec_delete_pcbpolicy(ip_pcb); #endif if (ip_pcb->inp_options) { (void)sctp_m_free(ip_pcb->inp_options); ip_pcb->inp_options = 0; } #ifdef INET6 if (ip_pcb->inp_vflag & INP_IPV6) { struct in6pcb *in6p; in6p = (struct in6pcb *)inp; ip6_freepcbopts(in6p->in6p_outputopts); } #endif /* INET6 */ ip_pcb->inp_vflag = 0; /* free up authentication fields */ if (inp->sctp_ep.local_auth_chunks != NULL) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); if (inp->sctp_ep.local_hmacs != NULL) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); LIST_FOREACH_SAFE(shared_key, &inp->sctp_ep.shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* * if we have an address list the following will free the list of * ifaddr's that are set into this ep. Again macro limitations here, * since the LIST_FOREACH could be a bad idea. */ LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { sctp_remove_laddr(laddr); } #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_free_list, sctp_tcblist, nasoc) { LIST_REMOVE(asoc, sctp_tcblist); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), asoc); SCTP_DECR_ASOC_COUNT(); } /* *** END TEMP CODE *** */ #endif /* Now lets see about freeing the EP hash table. */ if (inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark); inp->sctp_tcbhash = NULL; } /* Now we must put the ep memory back into the zone pool */ crfree(inp->ip_inp.inp.inp_cred); INP_LOCK_DESTROY(&inp->ip_inp.inp); SCTP_INP_LOCK_DESTROY(inp); SCTP_INP_READ_DESTROY(inp); SCTP_ASOC_CREATE_LOCK_DESTROY(inp); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_DECR_EP_COUNT(); } struct sctp_nets * sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr) { struct sctp_nets *net; /* locate the address */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr)) return (net); } return (NULL); } int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id) { struct sctp_ifa *sctp_ifa; sctp_ifa = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa) { return (1); } else { return (0); } } /* * add's a remote endpoint address, done with the INIT/INIT-ACK as well as * when a ASCONF arrives that adds it. It will also initialize all the cwnd * stats of stuff. 
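 *
 * Hypothetical caller sketch (illustrative; error handling elided and
 * the set_scope/from values are examples only):
 *
 *	struct sctp_nets *net = NULL;
 *
 *	if (sctp_add_remote_addr(stcb, sa, &net, 1,
 *	    SCTP_ADDR_IS_CONFIRMED) != 0)
 *		... bail, the address was rejected (-1) ...
 *
 * On success the new net is returned via *netp with ref_count 1 and a
 * route/MTU already chosen.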
*/ int sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, struct sctp_nets **netp, int set_scope, int from) { /* * The following is redundant to the same lines in the * sctp_aloc_assoc() but is needed since others call the add address * function */ struct sctp_nets *net, *netfirst; int addr_inscope; SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ", from); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr); netfirst = sctp_findnet(stcb, newaddr); if (netfirst) { /* * Lie and return ok, we don't want to make the association * go away for this behavior. It will happen in the TCP * model in a connected socket. It does not reach the hash * table until after the association is built so it can't be * found. Mark as reachable, since the initial creation will * have been cleared and the NOT_IN_ASSOC flag will have * been added... and we don't want to end up removing it * back out. */ if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) { netfirst->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED); } else { netfirst->dest_state = SCTP_ADDR_REACHABLE; } return (0); } addr_inscope = 1; switch (newaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)newaddr; if (sin->sin_addr.s_addr == 0) { /* Invalid address */ return (-1); } /* zero out the bzero area */ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); /* assure len is set */ sin->sin_len = sizeof(struct sockaddr_in); if (set_scope) { if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { stcb->asoc.scope.ipv4_local_scope = 1; } } else { /* Validate the address is in scope */ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) && (stcb->asoc.scope.ipv4_local_scope == 0)) { addr_inscope = 0; } } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)newaddr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Invalid address */ return (-1); } /* assure len is set */ sin6->sin6_len = sizeof(struct sockaddr_in6); if (set_scope) { if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { /* * If the new destination is a * LINK_LOCAL we must have common * site scope. Don't set the local * scope since we may not share all * links, only loopback can do this. * Links on the local network would * also be on our private network * for v4 too. */ stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { /* * If the new destination is * SITE_LOCAL then we must have site * scope in common. 
*/ stcb->asoc.scope.site_scope = 1; } } else { /* Validate the address is in scope */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) && (stcb->asoc.scope.loopback_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.local_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.site_scope == 0)) { addr_inscope = 0; } } break; } #endif default: /* not supported family type */ return (-1); } net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets); if (net == NULL) { return (-1); } SCTP_INCR_RADDR_COUNT(); bzero(net, sizeof(struct sctp_nets)); (void)SCTP_GETTIME_TIMEVAL(&net->start_time); memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len); switch (newaddr->sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport; break; #endif default: break; } net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id); if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.site_scope = 1; addr_inscope = 1; } net->failure_threshold = stcb->asoc.def_net_failure; net->pf_threshold = stcb->asoc.def_net_pf_threshold; if (addr_inscope == 0) { net->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_OUT_OF_SCOPE); } else { if (from == SCTP_ADDR_IS_CONFIRMED) /* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */ net->dest_state = SCTP_ADDR_REACHABLE; else net->dest_state = SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED; } /* * We set this to 0, the timer code knows that this means its an * initial value */ net->rto_needed = 1; net->RTO = 0; net->RTO_measured = 0; stcb->asoc.numnets++; net->ref_count = 1; net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 1; net->port = stcb->asoc.port; net->dscp = stcb->asoc.default_dscp; #ifdef INET6 net->flowlabel = stcb->asoc.default_flowlabel; #endif if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { net->dest_state |= SCTP_ADDR_NOHB; } else { net->dest_state &= ~SCTP_ADDR_NOHB; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { net->dest_state |= SCTP_ADDR_NO_PMTUD; } else { net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } net->heart_beat_delay = stcb->asoc.heart_beat_delay; /* Init the timer structure */ SCTP_OS_TIMER_INIT(&net->rxt_timer.timer); SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer); SCTP_OS_TIMER_INIT(&net->hb_timer.timer); /* Now generate a route for this guy */ #ifdef INET6 /* KAME hack: embed scopeid */ if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); sin6->sin6_scope_id = 0; } #endif SCTP_RTALLOC((sctp_route_t *) & net->ro, stcb->asoc.vrf_id, stcb->sctp_ep->fibnum); if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) { /* Get source address */ net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep, stcb, (sctp_route_t *) & net->ro, net, 0, stcb->asoc.vrf_id); if (net->ro._s_addr != NULL) { net->src_addr_selected = 1; /* Now get the interface MTU */ if (net->ro._s_addr->ifn_p != NULL) { net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p); } } else { net->src_addr_selected = 0; } if (net->mtu > 0) { uint32_t rmtu; rmtu = 
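        /*
         * The MTU cached on the route is preferred over the
         * interface MTU when both are known; if the route carries
         * none yet, it is seeded from the interface via
         * SCTP_SET_MTU_OF_ROUTE below.
         */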
SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt); if (rmtu == 0) { /* * Start things off to match mtu of * interface please. */ SCTP_SET_MTU_OF_ROUTE(&net->ro._l_addr.sa, net->ro.ro_rt, net->mtu); } else { /* * we take the route mtu over the interface, * since the route may be leading out the * loopback, or a different interface. */ net->mtu = rmtu; } } } else { net->src_addr_selected = 0; } if (net->mtu == 0) { switch (newaddr->sa_family) { #ifdef INET case AF_INET: net->mtu = SCTP_DEFAULT_MTU; break; #endif #ifdef INET6 case AF_INET6: net->mtu = 1280; break; #endif default: break; } } #if defined(INET) || defined(INET6) if (net->port) { net->mtu -= (uint32_t) sizeof(struct udphdr); } #endif if (from == SCTP_ALLOC_ASOC) { stcb->asoc.smallest_mtu = net->mtu; } if (stcb->asoc.smallest_mtu > net->mtu) { stcb->asoc.smallest_mtu = net->mtu; } #ifdef INET6 if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_recoverscope(sin6); } #endif /* JRS - Use the congestion control given in the CC module */ if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) (*stcb->asoc.cc_functions.sctp_set_initial_cc_param) (stcb, net); /* * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning * of assoc (2005/06/27, iyengar@cis.udel.edu) */ net->find_pseudo_cumack = 1; net->find_rtx_pseudo_cumack = 1; /* Choose an initial flowid. */ net->flowid = stcb->asoc.my_vtag ^ ntohs(stcb->rport) ^ ntohs(stcb->sctp_ep->sctp_lport); net->flowtype = M_HASHTYPE_OPAQUE; if (netp) { *netp = net; } netfirst = TAILQ_FIRST(&stcb->asoc.nets); if (net->ro.ro_rt == NULL) { /* Since we have no route put it at the back */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); } else if (netfirst == NULL) { /* We are the first one in the pool. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (netfirst->ro.ro_rt == NULL) { /* * First one has NO route. Place this one ahead of the first * one. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (net->ro.ro_rt->rt_ifp != netfirst->ro.ro_rt->rt_ifp) { /* * This one has a different interface than the one at the * top of the list. Place it ahead. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else { /* * Ok we have the same interface as the first one. Move * forward until we find either a) one with a NULL route... * insert ahead of that b) one with a different ifp.. insert * after that. c) end of the list.. insert at the tail. 
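 *
 * The net effect (a summary, not new behavior): nets stay grouped by
 * outgoing interface and route-less entries drift toward the tail, so
 * alternate-path selection probes other interfaces before unrouted
 * destinations.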
 */
        struct sctp_nets *netlook;

        do {
            netlook = TAILQ_NEXT(netfirst, sctp_next);
            if (netlook == NULL) {
                /* End of the list */
                TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
                break;
            } else if (netlook->ro.ro_rt == NULL) {
                /* next one has NO route */
                TAILQ_INSERT_BEFORE(netfirst, net, sctp_next);
                break;
            } else if (netlook->ro.ro_rt->rt_ifp != net->ro.ro_rt->rt_ifp) {
                TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook,
                    net, sctp_next);
                break;
            }
            /* Shift forward */
            netfirst = netlook;
        } while (netlook != NULL);
    }
    /* got to have a primary set */
    if (stcb->asoc.primary_destination == 0) {
        stcb->asoc.primary_destination = net;
    } else if ((stcb->asoc.primary_destination->ro.ro_rt == NULL) &&
        (net->ro.ro_rt) &&
        ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) {
        /* No route to current primary; adopt new primary */
        stcb->asoc.primary_destination = net;
    }
    /* Validate primary is first */
    net = TAILQ_FIRST(&stcb->asoc.nets);
    if ((net != stcb->asoc.primary_destination) &&
        (stcb->asoc.primary_destination)) {
        /*
         * The first one on the list is NOT the primary.
         * sctp_cmpaddr() is much more efficient if the primary is
         * the first on the list, so make it so.
         */
        TAILQ_REMOVE(&stcb->asoc.nets,
            stcb->asoc.primary_destination, sctp_next);
        TAILQ_INSERT_HEAD(&stcb->asoc.nets,
            stcb->asoc.primary_destination, sctp_next);
    }
    return (0);
}

static uint32_t
sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb)
{
    uint32_t id;
    struct sctpasochead *head;
    struct sctp_tcb *lstcb;

    SCTP_INP_WLOCK(inp);
try_again:
    if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
        /* TSNH */
        SCTP_INP_WUNLOCK(inp);
        return (0);
    }
    /*
     * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC,
     * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC.
     */
    if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) {
        inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1;
    }
    id = inp->sctp_associd_counter;
    inp->sctp_associd_counter++;
    lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t) id, 0);
    if (lstcb) {
        goto try_again;
    }
    head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)];
    LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash);
    stcb->asoc.in_asocid_hash = 1;
    SCTP_INP_WUNLOCK(inp);
    return id;
}

/*
 * Allocate an association and add it to the endpoint. The caller must be
 * careful to add all additional addresses right away, once they are known,
 * or else the assoc may experience a blackout scenario.
 */
struct sctp_tcb *
sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr,
    int *error, uint32_t override_tag, uint32_t vrf_id, uint16_t o_streams,
    struct thread *p)
{
    /* note the p argument is only valid in unbound sockets */
    struct sctp_tcb *stcb;
    struct sctp_association *asoc;
    struct sctpasochead *head;
    uint16_t rport;
    int err;

    /*
     * Assumption made here: Caller has done a
     * sctp_findassociation_ep_addr(ep, addr's); to make sure the
     * address does not exist already.
     */
    if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) {
        /* Hit max assoc, sorry no more */
        SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
        *error = ENOBUFS;
        return (NULL);
    }
    if (firstaddr == NULL) {
        SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
        *error = EINVAL;
        return (NULL);
    }
    SCTP_INP_RLOCK(inp);
    if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
        ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) ||
        (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) {
        /*
         * If it's in the TCP pool, it's NOT allowed to create an
         * association. The parent listener needs to call
         * sctp_aloc_assoc.. or the one-2-many socket.
If a peeled * off, or connected one does this.. its an error. */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED)) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } } SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:"); #ifdef SCTP_DEBUG if (firstaddr) { SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr); switch (firstaddr->sa_family) { #ifdef INET case AF_INET: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in *)firstaddr)->sin_port)); break; #endif #ifdef INET6 case AF_INET6: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in6 *)firstaddr)->sin6_port)); break; #endif default: break; } } else { SCTPDBG(SCTP_DEBUG_PCB3, "None\n"); } #endif /* SCTP_DEBUG */ switch (firstaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)firstaddr; if ((ntohs(sin->sin_port) == 0) || (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* Invalid address */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin->sin_port; break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)firstaddr; if ((ntohs(sin6->sin6_port) == 0) || IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* Invalid address */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin6->sin6_port; break; } #endif default: /* not supported family type */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } SCTP_INP_RUNLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* * If you have not performed a bind, then we need to do the * ephemeral bind for you. */ if ((err = sctp_inpcb_bind(inp->sctp_socket, (struct sockaddr *)NULL, (struct sctp_ifa *)NULL, p ))) { /* bind error, probably perm */ *error = err; return (NULL); } } stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb); if (stcb == NULL) { /* out of memory? 
*/ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); *error = ENOMEM; return (NULL); } SCTP_INCR_ASOC_COUNT(); bzero(stcb, sizeof(*stcb)); asoc = &stcb->asoc; asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); SCTP_TCB_LOCK_INIT(stcb); SCTP_TCB_SEND_LOCK_INIT(stcb); stcb->rport = rport; /* setup back pointer's */ stcb->sctp_ep = inp; stcb->sctp_socket = inp->sctp_socket; if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id, o_streams))) { /* failed */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); *error = err; return (NULL); } /* and the port */ SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* inpcb freed while alloc going on */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_DECR_ASOC_COUNT(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } SCTP_TCB_LOCK(stcb); /* now that my_vtag is set, add it to the hash */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* put it in the bucket in the vtag hash of assoc's for the system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_INP_INFO_WUNLOCK(); if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) { /* failure.. memory error? */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } SCTP_DECR_ASOC_COUNT(); SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } /* Init all the timers */ SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer); SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer); SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer); SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer); SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer); SCTP_OS_TIMER_INIT(&asoc->delayed_event_timer.timer); SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer); LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist); /* now file the port under the hash as well */ if (inp->sctp_tcbhash != NULL) { head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport, inp->sctp_hashmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbhash); } SCTP_INP_WUNLOCK(inp); SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", (void *)stcb); return (stcb); } void sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; asoc = &stcb->asoc; asoc->numnets--; TAILQ_REMOVE(&asoc->nets, net, sctp_next); if (net == asoc->primary_destination) { /* Reset primary */ struct sctp_nets *lnet; lnet = TAILQ_FIRST(&asoc->nets); /* * Mobility adaptation Ideally, if deleted destination is * the primary, it becomes a fast retransmission trigger by * the subsequent SET PRIMARY. 
(by micchie) */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n"); if (asoc->deleted_primary != NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n"); goto out; } asoc->deleted_primary = net; atomic_add_int(&net->ref_count, 1); memset(&net->lastsa, 0, sizeof(net->lastsa)); memset(&net->lastsv, 0, sizeof(net->lastsv)); sctp_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED); sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL); } out: /* Try to find a confirmed primary */ asoc->primary_destination = sctp_find_alternate_net(stcb, lnet, 0); } if (net == asoc->last_data_chunk_from) { /* Reset primary */ asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets); } if (net == asoc->last_control_chunk_from) { /* Clear net */ asoc->last_control_chunk_from = NULL; } if (net == stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } sctp_free_remote_addr(net); } /* * remove a remote endpoint address from an association, it will fail if the * address does not exist. */ int sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr) { /* * Here we need to remove a remote address. This is quite simple, we * first find it in the list of address for the association * (tasoc->asoc.nets) and then if it is there, we do a LIST_REMOVE * on that item. Note we do not allow it to be removed if there are * no other addresses. */ struct sctp_association *asoc; struct sctp_nets *net, *nnet; asoc = &stcb->asoc; /* locate the address */ TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { if (net->ro._l_addr.sa.sa_family != remaddr->sa_family) { continue; } if (sctp_cmpaddr((struct sockaddr *)&net->ro._l_addr, remaddr)) { /* we found the guy */ if (asoc->numnets < 2) { /* Must have at LEAST two remote addresses */ return (-1); } else { sctp_remove_net(stcb, net); return (0); } } } /* not found. 
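 * Return codes let callers tell the two failures apart: -1 means the
 * address exists but is one of fewer than two remote addresses, -2
 * means it was never on the association.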
 */
    return (-2);
}

void
sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
{
    struct sctpvtaghead *chain;
    struct sctp_tagblock *twait_block;
    int found = 0;
    int i;

    chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
    LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
        for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
            if ((twait_block->vtag_block[i].v_tag == tag) &&
                (twait_block->vtag_block[i].lport == lport) &&
                (twait_block->vtag_block[i].rport == rport)) {
                twait_block->vtag_block[i].tv_sec_at_expire = 0;
                twait_block->vtag_block[i].v_tag = 0;
                twait_block->vtag_block[i].lport = 0;
                twait_block->vtag_block[i].rport = 0;
                found = 1;
                break;
            }
        }
        if (found)
            break;
    }
}

int
sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
{
    struct sctpvtaghead *chain;
    struct sctp_tagblock *twait_block;
    int found = 0;
    int i;

    SCTP_INP_INFO_WLOCK();
    chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
    LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
        for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
            if ((twait_block->vtag_block[i].v_tag == tag) &&
                (twait_block->vtag_block[i].lport == lport) &&
                (twait_block->vtag_block[i].rport == rport)) {
                found = 1;
                break;
            }
        }
        if (found)
            break;
    }
    SCTP_INP_INFO_WUNLOCK();
    return (found);
}

void
sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport)
{
    struct sctpvtaghead *chain;
    struct sctp_tagblock *twait_block;
    struct timeval now;
    int set, i;

    if (time == 0) {
        /* It's disabled */
        return;
    }
    (void)SCTP_GETTIME_TIMEVAL(&now);
    chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
    set = 0;
    LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
        /* Block(s) present, let's find space, and expire on the fly */
        for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
            if ((twait_block->vtag_block[i].v_tag == 0) && !set) {
                twait_block->vtag_block[i].tv_sec_at_expire =
                    now.tv_sec + time;
                twait_block->vtag_block[i].v_tag = tag;
                twait_block->vtag_block[i].lport = lport;
                twait_block->vtag_block[i].rport = rport;
                set = 1;
            } else if ((twait_block->vtag_block[i].v_tag) &&
                ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) {
                /* Audit expires this guy */
                twait_block->vtag_block[i].tv_sec_at_expire = 0;
                twait_block->vtag_block[i].v_tag = 0;
                twait_block->vtag_block[i].lport = 0;
                twait_block->vtag_block[i].rport = 0;
                if (set == 0) {
                    /* Reuse it for my new tag */
                    twait_block->vtag_block[i].tv_sec_at_expire =
                        now.tv_sec + time;
                    twait_block->vtag_block[i].v_tag = tag;
                    twait_block->vtag_block[i].lport = lport;
                    twait_block->vtag_block[i].rport = rport;
                    set = 1;
                }
            }
        }
        if (set) {
            /*
             * We only do up to the block where we can place our
             * tag for audits
             */
            break;
        }
    }
    /* Need to add a new block to chain */
    if (!set) {
        SCTP_MALLOC(twait_block, struct sctp_tagblock *,
            sizeof(struct sctp_tagblock), SCTP_M_TIMW);
        if (twait_block == NULL) {
#ifdef INVARIANTS
            panic("Can not alloc tagblock");
#endif
            return;
        }
        memset(twait_block, 0, sizeof(struct sctp_tagblock));
        LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock);
        twait_block->vtag_block[0].tv_sec_at_expire = now.tv_sec + time;
        twait_block->vtag_block[0].v_tag = tag;
        twait_block->vtag_block[0].lport = lport;
        twait_block->vtag_block[0].rport = rport;
    }
}

/*-
 * Free the association after un-hashing the remote port. This
 * function ALWAYS returns holding NO LOCK on the stcb. It DOES
 * expect that the input to this function IS a locked TCB.
 * It will return 0 if it did NOT destroy the association (instead
 * it unlocks it).
It will return NON-zero if it either destroyed the * association OR the association is already destroyed. */ int sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location) { int i; struct sctp_association *asoc; struct sctp_nets *net, *nnet; struct sctp_laddr *laddr, *naddr; struct sctp_tmit_chunk *chk, *nchk; struct sctp_asconf_addr *aparam, *naparam; struct sctp_asconf_ack *aack, *naack; struct sctp_stream_reset_list *strrst, *nstrrst; struct sctp_queued_to_read *sq, *nsq; struct sctp_stream_queue_pending *sp, *nsp; sctp_sharedkey_t *shared_key, *nshared_key; struct socket *so; /* first, lets purge the entry from the hash table. */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 6); #endif if (stcb->asoc.state == 0) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 7); #endif /* there is no asoc, really TSNH :-0 */ return (1); } if (stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } /* TEMP CODE */ if (stcb->freed_from_where == 0) { /* Only record the first place free happened from */ stcb->freed_from_where = from_location; } /* TEMP CODE */ asoc = &stcb->asoc; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; else so = inp->sctp_socket; /* * We used timer based freeing if a reader or writer is in the way. * So we first check if we are actually being called from a timer, * if so we abort early if a reader or writer is still in the way. */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) && (from_inpcbfree == SCTP_NORMAL_PROC)) { /* * is it the timer driving us? if so are the reader/writers * gone? */ if (stcb->asoc.refcnt) { /* nope, reader or writer in the way */ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); /* no asoc destroyed */ SCTP_TCB_UNLOCK(stcb); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 8); #endif return (0); } } /* now clean up any other timers */ (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer); asoc->dack_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer); /*- * For stream reset we don't blast this unless * it is a str-reset timer, it might be the * free-asoc timer which we DON'T want to * disturb. */ if (asoc->strreset_timer.type == SCTP_TIMER_TYPE_STRRESET) asoc->strreset_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer); asoc->asconf_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer); asoc->autoclose_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&asoc->shut_guard_timer.timer); asoc->shut_guard_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer); asoc->delayed_event_timer.self = NULL; /* Mobility adaptation */ (void)SCTP_OS_TIMER_STOP(&asoc->delete_prim_timer.timer); asoc->delete_prim_timer.self = NULL; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { (void)SCTP_OS_TIMER_STOP(&net->rxt_timer.timer); net->rxt_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer); net->pmtu_timer.self = NULL; (void)SCTP_OS_TIMER_STOP(&net->hb_timer.timer); net->hb_timer.self = NULL; } /* Now the read queue needs to be cleaned up (only once) */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) { stcb->asoc.state |= SCTP_STATE_ABOUT_TO_BE_FREED; SCTP_INP_READ_LOCK(inp); TAILQ_FOREACH(sq, &inp->read_queue, next) { if (sq->stcb == stcb) { sq->do_not_ref_stcb = 1; sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn; /* * If there is no end, there never will be * now. 
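 * Flag the entry as a PD-API abort below so a blocked reader is woken,
 * and, if the application enabled partial-delivery events, queue the
 * aborted indication directly behind this message via control_pdapi.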
*/ if (sq->end_added == 0) { /* Held for PD-API clear that. */ sq->pdapi_aborted = 1; sq->held_length = 0; if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { /* * Need to add a PD-API * aborted indication. * Setting the control_pdapi * assures that it will be * added right after this * msg. */ uint32_t strseq; stcb->asoc.control_pdapi = sq; strseq = (sq->sinfo_stream << 16) | sq->sinfo_ssn; sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, stcb, SCTP_PARTIAL_DELIVERY_ABORTED, (void *)&strseq, SCTP_SO_LOCKED); stcb->asoc.control_pdapi = NULL; } } /* Add an end to wake them */ sq->end_added = 1; } } SCTP_INP_READ_UNLOCK(inp); if (stcb->block_entry) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET); stcb->block_entry->error = ECONNRESET; stcb->block_entry = NULL; } } if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) { /* * Someone holds a reference OR the socket is unaccepted * yet. */ if ((stcb->asoc.refcnt) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } SCTP_TCB_UNLOCK(stcb); if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; if (so) { /* Wake any reader/writers */ sctp_sorwakeup(inp, so); sctp_sowwakeup(inp, so); } #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 9); #endif /* no asoc destroyed */ return (0); } #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 10); #endif /* * When I reach here, no others want to kill the assoc yet.. and I * own the lock. Now its possible an abort comes in when I do the * lock exchange below to grab all the locks to do the final take * out. to prevent this we increment the count, which will start a * timer and blow out above thus assuring us that we hold exclusive * killing of the asoc. Note that after getting back the TCB lock we * will go ahead and increment the counter back up and stop any * timer a passing stranger may have started :-S */ if (from_inpcbfree == SCTP_NORMAL_PROC) { atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); SCTP_TCB_LOCK(stcb); } /* Double check the GONE flag */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* * For TCP type we need special handling when we are * connected. We also include the peel'ed off ones to. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED; inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED; if (so) { SOCK_LOCK(so); if (so->so_rcv.sb_cc == 0) { so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING | SS_ISCONNECTED); } socantrcvmore_locked(so); sctp_sowwakeup(inp, so); sctp_sorwakeup(inp, so); SCTP_SOWAKEUP(so); } } } /* * Make it invalid too, that way if its about to run it will abort * and return. 
 */
    /* re-increment the lock */
    if (from_inpcbfree == SCTP_NORMAL_PROC) {
        atomic_add_int(&stcb->asoc.refcnt, -1);
    }
    if (stcb->asoc.refcnt) {
        stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
        sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
        if (from_inpcbfree == SCTP_NORMAL_PROC) {
            SCTP_INP_INFO_WUNLOCK();
            SCTP_INP_WUNLOCK(inp);
        }
        SCTP_TCB_UNLOCK(stcb);
        return (0);
    }
    asoc->state = 0;
    if (inp->sctp_tcbhash) {
        LIST_REMOVE(stcb, sctp_tcbhash);
    }
    if (stcb->asoc.in_asocid_hash) {
        LIST_REMOVE(stcb, sctp_tcbasocidhash);
    }
    /* Now let's remove it from the list of ALL associations in the EP */
    LIST_REMOVE(stcb, sctp_tcblist);
    if (from_inpcbfree == SCTP_NORMAL_PROC) {
        SCTP_INP_INCR_REF(inp);
        SCTP_INP_WUNLOCK(inp);
    }
    /* pull from vtag hash */
    LIST_REMOVE(stcb, sctp_asocs);
    sctp_add_vtag_to_timewait(asoc->my_vtag,
        SCTP_BASE_SYSCTL(sctp_vtag_time_wait),
        inp->sctp_lport, stcb->rport);

    /*
     * Now restop the timers to be sure; this is paranoia at its
     * finest!
     */
    (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer);
    (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer);
    (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer);
    (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer);
    (void)SCTP_OS_TIMER_STOP(&asoc->shut_guard_timer.timer);
    (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer);
    (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer);
    TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
        (void)SCTP_OS_TIMER_STOP(&net->rxt_timer.timer);
        (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer);
        (void)SCTP_OS_TIMER_STOP(&net->hb_timer.timer);
    }
    asoc->strreset_timer.type = SCTP_TIMER_TYPE_NONE;
    /*
     * The chunk lists and such SHOULD be empty but we check them just
     * in case.
     */
    /* anything on the wheel needs to be removed */
    for (i = 0; i < asoc->streamoutcnt; i++) {
        struct sctp_stream_out *outs;

        outs = &asoc->strmout[i];
        /* now clean up any chunks here */
        TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
            TAILQ_REMOVE(&outs->outqueue, sp, next);
            sctp_free_spbufspace(stcb, asoc, sp);
            if (sp->data) {
                if (so) {
                    /* Still an open socket - report */
                    sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
                        0, (void *)sp, SCTP_SO_LOCKED);
                }
                if (sp->data) {
                    sctp_m_freem(sp->data);
                    sp->data = NULL;
                    sp->tail_mbuf = NULL;
                    sp->length = 0;
                }
            }
            if (sp->net) {
                sctp_free_remote_addr(sp->net);
                sp->net = NULL;
            }
            sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED);
        }
    }
    /* sa_ignore FREED_MEMORY */
    TAILQ_FOREACH_SAFE(strrst, &asoc->resetHead, next_resp, nstrrst) {
        TAILQ_REMOVE(&asoc->resetHead, strrst, next_resp);
        SCTP_FREE(strrst, SCTP_M_STRESET);
    }
    TAILQ_FOREACH_SAFE(sq, &asoc->pending_reply_queue, next, nsq) {
        TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next);
        if (sq->data) {
            sctp_m_freem(sq->data);
            sq->data = NULL;
        }
        sctp_free_remote_addr(sq->whoFrom);
        sq->whoFrom = NULL;
        sq->stcb = NULL;
        /* Free the ctl entry */
        SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq);
        SCTP_DECR_READQ_COUNT();
        /* sa_ignore FREED_MEMORY */
    }
    TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) {
        TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next);
        if (chk->data) {
            sctp_m_freem(chk->data);
            chk->data = NULL;
        }
        if (chk->holds_key_ref)
            sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
        SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
        SCTP_DECR_CHK_COUNT();
        atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1);
        asoc->free_chunk_cnt--;
        /* sa_ignore FREED_MEMORY */
    }
    /* pending send queue SHOULD be empty */
    TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
        if (asoc->strmout[chk->rec.data.stream_number].chunks_on_queues > 0) {
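            /*
             * Per-stream accounting: every DATA chunk still queued
             * holds a count against its stream; INVARIANTS kernels
             * panic on underflow in the else branch.
             */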
asoc->strmout[chk->rec.data.stream_number].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.stream_number); #endif } TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); if (chk->data) { if (so) { /* Still a socket? */ sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, chk, SCTP_SO_LOCKED); } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); if (chk->whoTo) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* sent queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) { if (chk->sent != SCTP_DATAGRAM_NR_ACKED) { if (asoc->strmout[chk->rec.data.stream_number].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.stream_number].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.stream_number); #endif } } TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); if (chk->data) { if (so) { /* Still a socket? */ sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, chk, SCTP_SO_LOCKED); } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } #ifdef INVARIANTS for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].chunks_on_queues > 0) { panic("%u chunks left for stream %u.", stcb->asoc.strmout[i].chunks_on_queues, i); } } #endif /* control queue MAY not be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* ASCONF queue MAY not be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } /* the stream outs */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } asoc->strm_realoutsize = asoc->streamoutcnt = 0; if (asoc->strmin) { struct sctp_queued_to_read *ctl, *nctl; for (i = 0; i < asoc->streamincnt; i++) { TAILQ_FOREACH_SAFE(ctl, 
&asoc->strmin[i].inqueue, next, nctl) { TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next); sctp_free_remote_addr(ctl->whoFrom); if (ctl->data) { sctp_m_freem(ctl->data); ctl->data = NULL; } /* * We don't free the address here since all * the net's were freed above. */ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl); SCTP_DECR_READQ_COUNT(); } } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); asoc->strmin = NULL; } asoc->streamincnt = 0; TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { #ifdef INVARIANTS if (SCTP_BASE_INFO(ipi_count_raddr) == 0) { panic("no net's left alloc'ed, or list points to itself"); } #endif TAILQ_REMOVE(&asoc->nets, net, sctp_next); sctp_free_remote_addr(net); } LIST_FOREACH_SAFE(laddr, &asoc->sctp_restricted_addrs, sctp_nxt_addr, naddr) { /* sa_ignore FREED_MEMORY */ sctp_remove_laddr(laddr); } /* pending asconf (address) parameters */ TAILQ_FOREACH_SAFE(aparam, &asoc->asconf_queue, next, naparam) { /* sa_ignore FREED_MEMORY */ TAILQ_REMOVE(&asoc->asconf_queue, aparam, next); SCTP_FREE(aparam, SCTP_M_ASC_ADDR); } TAILQ_FOREACH_SAFE(aack, &asoc->asconf_ack_sent, next, naack) { /* sa_ignore FREED_MEMORY */ TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next); if (aack->data != NULL) { sctp_m_freem(aack->data); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack); } /* clean up auth stuff */ if (asoc->local_hmacs) sctp_free_hmaclist(asoc->local_hmacs); if (asoc->peer_hmacs) sctp_free_hmaclist(asoc->peer_hmacs); if (asoc->local_auth_chunks) sctp_free_chunklist(asoc->local_auth_chunks); if (asoc->peer_auth_chunks) sctp_free_chunklist(asoc->peer_auth_chunks); sctp_free_authinfo(&asoc->authinfo); LIST_FOREACH_SAFE(shared_key, &asoc->shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* Insert new items here :> */ /* Get rid of LOCK */ SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INFO_WUNLOCK(); SCTP_INP_RLOCK(inp); } #ifdef SCTP_TRACK_FREED_ASOCS if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* now clean up the tasoc itself */ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); } else { LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist); } #else SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); #endif if (from_inpcbfree == SCTP_NORMAL_PROC) { if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* * If its NOT the inp_free calling us AND sctp_close * as been called, we call back... */ SCTP_INP_RUNLOCK(inp); /* * This will start the kill timer (if we are the * last one) since we hold an increment yet. But * this is the only safe way to do this since * otherwise if the socket closes at the same time * we are here we might collide in the cleanup. */ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, SCTP_CALLED_DIRECTLY_NOCMPSET); SCTP_INP_DECR_REF(inp); goto out_of; } else { /* The socket is still open. */ SCTP_INP_DECR_REF(inp); } } if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_RUNLOCK(inp); } out_of: /* destroyed the asoc */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 11); #endif return (1); } /* * determine if a destination is "reachable" based upon the addresses bound * to the current endpoint (e.g. 
only v4 or v6 currently bound) */ /* * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use * assoc level v4/v6 flags, as the assoc *may* not have the same address * types bound as its endpoint */ int sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr) { struct sctp_inpcb *inp; int answer; /* * No locks here, the TCB, in all cases is already locked and an * assoc is up. There is either a INP lock by the caller applied (in * asconf case when deleting an address) or NOT in the HB case, * however if HB then the INP increment is up and the INP will not * be removed (on top of the fact that we have a TCB lock). So we * only want to read the sctp_flags, which is either bound-all or * not.. no protection needed since once an assoc is up you can't be * changing your binding. */ inp = stcb->sctp_ep; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* if bound all, destination is not restricted */ /* * RRS: Question during lock work: Is this correct? If you * are bound-all you still might need to obey the V4--V6 * flags??? IMO this bound-all stuff needs to be removed! */ return (1); } /* NOTE: all "scope" checks are done when local addresses are added */ switch (destaddr->sa_family) { #ifdef INET6 case AF_INET6: answer = inp->ip_inp.inp.inp_vflag & INP_IPV6; break; #endif #ifdef INET case AF_INET: answer = inp->ip_inp.inp.inp_vflag & INP_IPV4; break; #endif default: /* invalid family, so it's unreachable */ answer = 0; break; } return (answer); } /* * update the inp_vflags on an endpoint */ static void sctp_update_ep_vflag(struct sctp_inpcb *inp) { struct sctp_laddr *laddr; /* first clear the flag */ inp->ip_inp.inp.inp_vflag = 0; /* set the flag based on addresses on the ep list */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } switch (laddr->ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } } } /* * Add the address to the endpoint local address list There is nothing to be * done if we are bound to all addresses */ void sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action) { struct sctp_laddr *laddr; int fnd, error = 0; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-useable addr. */ return; } } #endif /* first, is it already present? 
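 * (A linear walk of the endpoint's sctp_addr_list; an ifa that is
 * already present is never inserted a second time.)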
*/ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd == 0) { /* Not in the ep list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action); if (error != 0) return; inp->laddr_count++; /* update inp_vflag flags */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } } return; } /* * select a new (hopefully reachable) destination net (should only be used * when we deleted an ep addr that is the only usable source address to reach * the destination net) */ static void sctp_select_primary_destination(struct sctp_tcb *stcb) { struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* for now, we'll just pick the first reachable one we find */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) continue; if (sctp_destination_is_reachable(stcb, (struct sockaddr *)&net->ro._l_addr)) { /* found a reachable destination */ stcb->asoc.primary_destination = net; } } /* I can't there from here! ...we're gonna die shortly... */ } /* * Delete the address from the endpoint local address list There is nothing * to be done if we are bound to all addresses */ void sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; int fnd; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd && (inp->laddr_count < 2)) { /* can't delete unless there are at LEAST 2 addresses */ return; } if (fnd) { /* * clean up any use of this address go through our * associations and clear any last_used_address that match * this one for each assoc, see if a new primary_destination * is needed */ struct sctp_tcb *stcb; /* clean up "next_addr_touse" */ if (inp->next_addr_touse == laddr) /* delete this address */ inp->next_addr_touse = NULL; /* clean up "last_used_address" */ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { struct sctp_nets *net; SCTP_TCB_LOCK(stcb); if (stcb->asoc.last_used_address == laddr) /* delete this address */ stcb->asoc.last_used_address = NULL; /* * Now spin through all the nets and purge any ref * to laddr */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._s_addr && (net->ro._s_addr->ifa == laddr->ifa)) { /* Yep, purge src address selected */ sctp_rtentry_t *rt; /* delete this address if cached */ rt = net->ro.ro_rt; if (rt != NULL) { RTFREE(rt); net->ro.ro_rt = NULL; } sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } } SCTP_TCB_UNLOCK(stcb); } /* for each tcb */ /* remove it from the ep list */ sctp_remove_laddr(laddr); inp->laddr_count--; /* update inp_vflag flags */ sctp_update_ep_vflag(inp); } return; } /* * Add the address to the TCB local address restricted list. * This is a "pending" address list (eg. addresses waiting for an * ASCONF-ACK response) and cannot be used as a valid source address. */ void sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; struct sctpladdr *list; /* * Assumes TCB is locked.. and possibly the INP. May need to * confirm/fix that if we need it and is not the case. 
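 * Entries added here are taken back off by
 * sctp_del_local_addr_restricted() once the ASCONF exchange for the
 * address has completed.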
*/ list = &stcb->asoc.sctp_restricted_addrs; #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. */ return; } } #endif /* does the address already exist? */ LIST_FOREACH(laddr, list, sctp_nxt_addr) { if (laddr->ifa == ifa) { return; } } /* add to the list */ (void)sctp_insert_laddr(list, ifa, 0); return; } /* * insert an laddr entry with the given ifa for the desired list */ int sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) { struct sctp_laddr *laddr; laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* out of memory? */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } SCTP_INCR_LADDR_COUNT(); bzero(laddr, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = ifa; laddr->action = act; atomic_add_int(&ifa->refcount, 1); /* insert it */ LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); return (0); } /* * Remove an laddr entry from the local address list (on an assoc) */ void sctp_remove_laddr(struct sctp_laddr *laddr) { /* remove from the list */ LIST_REMOVE(laddr, sctp_nxt_addr); sctp_free_ifa(laddr->ifa); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); SCTP_DECR_LADDR_COUNT(); } /* * Remove a local address from the TCB local address restricted list */ void sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; /* * This is called by asconf work. It is assumed that a) The TCB is * locked and b) The INP is locked. This is true in as much as I can * trace through the entry asconf code where I did these locks. * Again, the ASCONF code is a bit different in that it does lock * the INP during its work often times. This must be since we don't * want other proc's looking up things while what they are looking * up is changing :-D */ inp = stcb->sctp_ep; /* if subset bound and don't allow ASCONF's, can't delete last */ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { if (stcb->sctp_ep->laddr_count < 2) { /* can't delete last address */ return; } } LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { /* remove the address if it exists */ if (laddr->ifa == NULL) continue; if (laddr->ifa == ifa) { sctp_remove_laddr(laddr); return; } } /* address not found! 
*/ return; } /* * Temporarily remove for __APPLE__ until we use the Tiger equivalents */ /* sysctl */ static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC; static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR; #if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) struct sctp_mcore_ctrl *sctp_mcore_workers = NULL; int *sctp_cpuarry = NULL; void sctp_queue_to_mcore(struct mbuf *m, int off, int cpu_to_use) { /* Queue a packet to a processor for the specified core */ struct sctp_mcore_queue *qent; struct sctp_mcore_ctrl *wkq; int need_wake = 0; if (sctp_mcore_workers == NULL) { /* Something went way bad during setup */ sctp_input_with_port(m, off, 0); return; } SCTP_MALLOC(qent, struct sctp_mcore_queue *, (sizeof(struct sctp_mcore_queue)), SCTP_M_MCORE); if (qent == NULL) { /* This is trouble */ sctp_input_with_port(m, off, 0); return; } qent->vn = curvnet; qent->m = m; qent->off = off; qent->v6 = 0; wkq = &sctp_mcore_workers[cpu_to_use]; SCTP_MCORE_QLOCK(wkq); TAILQ_INSERT_TAIL(&wkq->que, qent, next); if (wkq->running == 0) { need_wake = 1; } SCTP_MCORE_QUNLOCK(wkq); if (need_wake) { wakeup(&wkq->running); } } static void sctp_mcore_thread(void *arg) { struct sctp_mcore_ctrl *wkq; struct sctp_mcore_queue *qent; wkq = (struct sctp_mcore_ctrl *)arg; struct mbuf *m; int off, v6; /* Wait for first tickle */ SCTP_MCORE_LOCK(wkq); wkq->running = 0; msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); SCTP_MCORE_UNLOCK(wkq); /* Bind to our cpu */ thread_lock(curthread); sched_bind(curthread, wkq->cpuid); thread_unlock(curthread); /* Now lets start working */ SCTP_MCORE_LOCK(wkq); /* Now grab lock and go */ for (;;) { SCTP_MCORE_QLOCK(wkq); skip_sleep: wkq->running = 1; qent = TAILQ_FIRST(&wkq->que); if (qent) { TAILQ_REMOVE(&wkq->que, qent, next); SCTP_MCORE_QUNLOCK(wkq); CURVNET_SET(qent->vn); m = qent->m; off = qent->off; v6 = qent->v6; SCTP_FREE(qent, SCTP_M_MCORE); if (v6 == 0) { sctp_input_with_port(m, off, 0); } else { SCTP_PRINTF("V6 not yet supported\n"); sctp_m_freem(m); } CURVNET_RESTORE(); SCTP_MCORE_QLOCK(wkq); } wkq->running = 0; if (!TAILQ_EMPTY(&wkq->que)) { goto skip_sleep; } SCTP_MCORE_QUNLOCK(wkq); msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); } } static void sctp_startup_mcore_threads(void) { int i, cpu; if (mp_ncpus == 1) return; if (sctp_mcore_workers != NULL) { /* * Already been here in some previous vnet? */ return; } SCTP_MALLOC(sctp_mcore_workers, struct sctp_mcore_ctrl *, ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl)), SCTP_M_MCORE); if (sctp_mcore_workers == NULL) { /* TSNH I hope */ return; } memset(sctp_mcore_workers, 0, ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl))); /* Init the structures */ for (i = 0; i <= mp_maxid; i++) { TAILQ_INIT(&sctp_mcore_workers[i].que); SCTP_MCORE_LOCK_INIT(&sctp_mcore_workers[i]); SCTP_MCORE_QLOCK_INIT(&sctp_mcore_workers[i]); sctp_mcore_workers[i].cpuid = i; } if (sctp_cpuarry == NULL) { SCTP_MALLOC(sctp_cpuarry, int *, (mp_ncpus * sizeof(int)), SCTP_M_MCORE); i = 0; CPU_FOREACH(cpu) { sctp_cpuarry[i] = cpu; i++; } } /* Now start them all */ CPU_FOREACH(cpu) { (void)kproc_create(sctp_mcore_thread, (void *)&sctp_mcore_workers[cpu], &sctp_mcore_workers[cpu].thread_proc, RFPROC, SCTP_KTHREAD_PAGES, SCTP_MCORE_NAME); } } #endif void sctp_pcb_init() { /* * SCTP initialization for the PCB structures should be called by * the sctp_init() funciton. 
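 * It must run exactly once per stack instance (guarded by
 * sctp_pcb_initialized): it seeds the global stats, builds the
 * endpoint, association and vtag hash tables, creates the allocator
 * zones and locks, and finally starts the iterator thread.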
*/ int i; struct timeval tv; if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) { /* error I was called twice */ return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 1; #if defined(SCTP_LOCAL_TRACE_BUF) bzero(&SCTP_BASE_SYSCTL(sctp_log), sizeof(struct sctp_log)); #endif #if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_MALLOC(SCTP_BASE_STATS, struct sctpstat *, ((mp_maxid + 1) * sizeof(struct sctpstat)), SCTP_M_MCORE); #endif (void)SCTP_GETTIME_TIMEVAL(&tv); #if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) bzero(SCTP_BASE_STATS, (sizeof(struct sctpstat) * (mp_maxid + 1))); SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t) tv.tv_sec; SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t) tv.tv_usec; #else bzero(&SCTP_BASE_STATS, sizeof(struct sctpstat)); SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t) tv.tv_sec; SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t) tv.tv_usec; #endif /* init the empty list of (All) Endpoints */ LIST_INIT(&SCTP_BASE_INFO(listhead)); /* init the hash table of endpoints */ TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale)); SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31), &SCTP_BASE_INFO(hashasocmark)); SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashmark)); SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashtcpmark)); SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize); SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH, &SCTP_BASE_INFO(hashvrfmark)); SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE, &SCTP_BASE_INFO(vrf_ifn_hashmark)); /* init the zones */ /* * FIX ME: Should check for NULL returns, but if it does fail we are * doomed to panic anyways... add later maybe. 
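 * Zone sizing below: endpoints scale with maxsockets, associations
 * with sctp_max_number_of_assoc, local and remote address entries
 * additionally with sctp_scale_up_for_address, and the chunk, readq,
 * strmoq and asconf zones with the net.inet.sctp.chunkscale tunable.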
*/ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep", sizeof(struct sctp_inpcb), maxsockets); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc", sizeof(struct sctp_tcb), sctp_max_number_of_assoc); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr", sizeof(struct sctp_laddr), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr", sizeof(struct sctp_nets), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk", sizeof(struct sctp_tmit_chunk), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq", sizeof(struct sctp_queued_to_read), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out", sizeof(struct sctp_stream_queue_pending), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf", sizeof(struct sctp_asconf), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack", sizeof(struct sctp_asconf_ack), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); /* Master Lock INIT for info structure */ SCTP_INP_INFO_LOCK_INIT(); SCTP_STATLOG_INIT_LOCK(); SCTP_IPI_COUNT_INIT(); SCTP_IPI_ADDR_INIT(); #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_INIT(); #endif LIST_INIT(&SCTP_BASE_INFO(addr_wq)); SCTP_WQ_ADDR_INIT(); /* not sure if we need all the counts */ SCTP_BASE_INFO(ipi_count_ep) = 0; /* assoc/tcb zone info */ SCTP_BASE_INFO(ipi_count_asoc) = 0; /* local addrlist zone info */ SCTP_BASE_INFO(ipi_count_laddr) = 0; /* remote addrlist zone info */ SCTP_BASE_INFO(ipi_count_raddr) = 0; /* chunk info */ SCTP_BASE_INFO(ipi_count_chunk) = 0; /* socket queue zone info */ SCTP_BASE_INFO(ipi_count_readq) = 0; /* stream out queue cont */ SCTP_BASE_INFO(ipi_count_strmoq) = 0; SCTP_BASE_INFO(ipi_free_strmoq) = 0; SCTP_BASE_INFO(ipi_free_chunks) = 0; SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer)); /* Init the TIMEWAIT list */ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]); } sctp_startup_iterator(); #if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) sctp_startup_mcore_threads(); #endif /* * INIT the default VRF which for BSD is the only one, other O/S's * may have more. But initially they must start with one and then * add the VRF's as addresses are added. */ sctp_init_vrf_list(SCTP_DEFAULT_VRF); } /* * Assumes that the SCTP_BASE_INFO() lock is NOT held. */ void sctp_pcb_finish(void) { struct sctp_vrflist *vrf_bucket; struct sctp_vrf *vrf, *nvrf; struct sctp_ifn *ifn, *nifn; struct sctp_ifa *ifa, *nifa; struct sctpvtaghead *chain; struct sctp_tagblock *twait_block, *prev_twait_block; struct sctp_laddr *wi, *nwi; int i; struct sctp_iterator *it, *nit; + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { + printf("%s: race condition on teardown.\n", __func__); + return; + } + SCTP_BASE_VAR(sctp_pcb_initialized) = 0; + /* * In FreeBSD the iterator thread never exits but we do clean up. * The only way FreeBSD reaches here is if we have VRF's but we * still add the ifdef to make it compile on old versions. */ +retry: + while (sctp_it_ctl.iterator_running != 0) + DELAY(1); SCTP_IPI_ITERATOR_WQ_LOCK(); + /* + * sctp_iterator_worker() might be working on an it entry without + * holding the lock. 
+	 * We won't find it on the list either and continue and
+	 * free/destroy it.  While holding the lock, spin, to avoid the
+	 * race condition as sctp_iterator_worker() will have to wait to
+	 * re-acquire the lock.
+	 */
+	if (sctp_it_ctl.cur_it != NULL || sctp_it_ctl.iterator_running != 0) {
+		SCTP_IPI_ITERATOR_WQ_UNLOCK();
+		printf("%s: Iterator running while we held the lock. Retry.\n",
+		    __func__);
+		goto retry;
+	}
 	TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) {
 		if (it->vn != curvnet) {
 			continue;
 		}
 		TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
 		if (it->function_atend != NULL) {
 			(*it->function_atend) (it->pointer, it->val);
 		}
 		SCTP_FREE(it, SCTP_M_ITER);
 	}
 	SCTP_IPI_ITERATOR_WQ_UNLOCK();
 	SCTP_ITERATOR_LOCK();
 	if ((sctp_it_ctl.cur_it) &&
 	    (sctp_it_ctl.cur_it->vn == curvnet)) {
 		sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT;
 	}
 	SCTP_ITERATOR_UNLOCK();
-	SCTP_OS_TIMER_STOP(&SCTP_BASE_INFO(addr_wq_timer.timer));
+	SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer));
 	SCTP_WQ_ADDR_LOCK();
 	LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) {
 		LIST_REMOVE(wi, sctp_nxt_addr);
 		SCTP_DECR_LADDR_COUNT();
 		if (wi->action == SCTP_DEL_IP_ADDRESS) {
 			SCTP_FREE(wi->ifa, SCTP_M_IFA);
 		}
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi);
 	}
 	SCTP_WQ_ADDR_UNLOCK();
 	/*
 	 * free the vrf/ifn/ifa lists and hashes (be sure address monitor is
 	 * destroyed first).
 	 */
 	vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))];
 	LIST_FOREACH_SAFE(vrf, vrf_bucket, next_vrf, nvrf) {
 		LIST_FOREACH_SAFE(ifn, &vrf->ifnlist, next_ifn, nifn) {
 			LIST_FOREACH_SAFE(ifa, &ifn->ifalist, next_ifa, nifa) {
 				/* free the ifa */
 				LIST_REMOVE(ifa, next_bucket);
 				LIST_REMOVE(ifa, next_ifa);
 				SCTP_FREE(ifa, SCTP_M_IFA);
 			}
 			/* free the ifn */
 			LIST_REMOVE(ifn, next_bucket);
 			LIST_REMOVE(ifn, next_ifn);
 			SCTP_FREE(ifn, SCTP_M_IFN);
 		}
 		SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark);
 		/* free the vrf */
 		LIST_REMOVE(vrf, next_vrf);
 		SCTP_FREE(vrf, SCTP_M_VRF);
 	}
 	/* free the vrf hashes */
 	SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark));
 	SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark));
 	/*
 	 * free the TIMEWAIT list elements malloc'd in the function
 	 * sctp_add_vtag_to_timewait()...
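 	 * Only the tagblocks themselves are freed here; the vtag
 	 * entries inside them expire lazily and need no per-entry
 	 * cleanup.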
*/ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { chain = &SCTP_BASE_INFO(vtag_timewait)[i]; if (!LIST_EMPTY(chain)) { prev_twait_block = NULL; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { if (prev_twait_block) { SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } prev_twait_block = twait_block; } SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } } /* free the locks and mutexes */ #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_DESTROY(); #endif SCTP_IPI_ADDR_DESTROY(); SCTP_STATLOG_DESTROY(); SCTP_INP_INFO_LOCK_DESTROY(); SCTP_WQ_ADDR_DESTROY(); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack)); /* Get rid of other stuff to */ if (SCTP_BASE_INFO(sctp_asochash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); if (SCTP_BASE_INFO(sctp_ephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); #if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE); #endif } int sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, int offset, int limit, struct sockaddr *src, struct sockaddr *dst, struct sockaddr *altsa) { /* * grub through the INIT pulling addresses and loading them to the * nets structure in the asoc. The from address in the mbuf should * also be loaded (if it is not already). This routine can be called * with either INIT or INIT-ACK's as long as the m points to the IP * packet and the offset points to the beginning of the parameters. */ struct sctp_inpcb *inp; struct sctp_nets *net, *nnet, *net_tmp; struct sctp_paramhdr *phdr, parm_buf; struct sctp_tcb *stcb_tmp; uint16_t ptype, plen; struct sockaddr *sa; uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_random *p_random = NULL; uint16_t random_len = 0; uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs = NULL; uint16_t hmacs_len = 0; uint8_t saw_asconf = 0; uint8_t saw_asconf_ack = 0; uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_chunk_list *chunks = NULL; uint16_t num_chunks = 0; sctp_key_t *new_key; uint32_t keylen; int got_random = 0, got_hmacs = 0, got_chklist = 0; uint8_t peer_supports_ecn; uint8_t peer_supports_prsctp; uint8_t peer_supports_auth; uint8_t peer_supports_asconf; uint8_t peer_supports_asconf_ack; uint8_t peer_supports_reconfig; uint8_t peer_supports_nrsack; uint8_t peer_supports_pktdrop; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* First get the destination address setup too. 
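The timewait teardown just above frees each block one step behind the list cursor, so LIST_FOREACH never reads memory that has already been freed. The same idiom in a small, self-contained userland form, with hypothetical types:

#include <sys/queue.h>
#include <stdlib.h>

struct blk {
	LIST_ENTRY(blk) link;
};
LIST_HEAD(blkhead, blk);

static void
free_all_blocks(struct blkhead *head)
{
	struct blk *cur, *prev = NULL;

	LIST_FOREACH(cur, head, link) {
		free(prev);	/* free(NULL) is a no-op */
		prev = cur;	/* stay one element behind the cursor */
	}
	free(prev);		/* release the final element */
}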
*/ #ifdef INET memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = stcb->rport; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; #endif if (altsa) { sa = altsa; } else { sa = src; } peer_supports_ecn = 0; peer_supports_prsctp = 0; peer_supports_auth = 0; peer_supports_asconf = 0; peer_supports_reconfig = 0; peer_supports_nrsack = 0; peer_supports_pktdrop = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* mark all addresses that we have currently on the list */ net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC; } /* does the source address already exist? if so skip it */ inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, dst, stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* no scope set here since we have a tcb already. */ switch (sa->sa_family) { #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { return (-1); } } break; #endif #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { return (-2); } } break; #endif default: break; } } else { if (net_tmp != NULL && stcb_tmp == stcb) { net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } else if (stcb_tmp != stcb) { /* It belongs to another association? */ if (stcb_tmp) SCTP_TCB_UNLOCK(stcb_tmp); return (-3); } } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-4); } /* now we must go through each of the params. */ phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); while (phdr) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); /* * SCTP_PRINTF("ptype => %0x, plen => %d\n", * (uint32_t)ptype, (int)plen); */ if (offset + plen > limit) { break; } if (plen == 0) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS) { if (stcb->asoc.scope.ipv4_addr_legal) { struct sctp_ipv4addr_param *p4, p4_buf; /* ok get the v4 address and check/add */ phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); if (plen != sizeof(struct sctp_ipv4addr_param) || phdr == NULL) { return (-5); } p4 = (struct sctp_ipv4addr_param *)phdr; sin.sin_addr.s_addr = p4->addr; if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { /* Skip multi-cast addresses */ goto next_param; } if ((sin.sin_addr.s_addr == INADDR_BROADCAST) || (sin.sin_addr.s_addr == INADDR_ANY)) { goto next_param; } sa = (struct sockaddr *)&sin; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* * no scope set since we have a tcb * already */ /* * we must validate the state again * here */ add_it_now: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-7); } if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { return (-8); } } else if (stcb_tmp == stcb) { if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-10); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? 
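The parameter loop entered above is a conventional TLV walk over the INIT/INIT-ACK body. A simplified, self-contained restatement of its bounds and stop rules follows; known_param() and handle_param() are hypothetical helpers, and the 0x8000 test mirrors the unrecognized-parameter rule applied later in the loop:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct phdr { uint16_t type; uint16_t len; };
#define PAD4(x)	(((x) + 3U) & ~3U)	/* parameters are 32-bit padded */

extern int known_param(uint16_t type);
extern void handle_param(uint16_t type, const uint8_t *p, uint16_t plen);

static void
walk_params(const uint8_t *buf, uint32_t offset, uint32_t limit)
{
	struct phdr ph;
	uint16_t type, plen;

	while (offset + sizeof(ph) <= limit) {
		memcpy(&ph, buf + offset, sizeof(ph));
		type = ntohs(ph.type);
		plen = ntohs(ph.len);
		if (plen < sizeof(ph) || offset + plen > limit)
			break;	/* zero-length, short, or truncated */
		if (!known_param(type) && (type & 0x8000) == 0)
			break;	/* high bit clear: stop on unknown type */
		handle_param(type, buf + offset, plen);
		offset += PAD4(plen);
	}
}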
straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, SCTP_SO_NOT_LOCKED); goto add_it_now; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-12); } return (-13); } } } else #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS) { if (stcb->asoc.scope.ipv6_addr_legal) { /* ok get the v6 address and check/add */ struct sctp_ipv6addr_param *p6, p6_buf; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); if (plen != sizeof(struct sctp_ipv6addr_param) || phdr == NULL) { return (-14); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy((caddr_t)&sin6.sin6_addr, p6->addr, sizeof(p6->addr)); if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { /* Skip multi-cast addresses */ goto next_param; } if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { /* * Link local make no sense without * scope */ goto next_param; } sa = (struct sockaddr *)&sin6; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb_tmp == NULL && (inp == stcb->sctp_ep || inp == NULL)) { /* * we must validate the state again * here */ add_it_now6: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-16); } /* * we must add the address, no scope * set */ if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { return (-17); } } else if (stcb_tmp == stcb) { /* * we must validate the state again * here */ if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-19); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, SCTP_SO_NOT_LOCKED); goto add_it_now6; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? 
*/ return (-21); } return (-22); } } } else #endif if (ptype == SCTP_ECN_CAPABLE) { peer_supports_ecn = 1; } else if (ptype == SCTP_ULP_ADAPTATION) { if (stcb->asoc.state != SCTP_STATE_OPEN) { struct sctp_adaptation_layer_indication ai, *aip; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ai, sizeof(ai)); aip = (struct sctp_adaptation_layer_indication *)phdr; if (aip) { stcb->asoc.peers_adaptation = ntohl(aip->indication); stcb->asoc.adaptation_needed = 1; } } } else if (ptype == SCTP_SET_PRIM_ADDR) { struct sctp_asconf_addr_param lstore, *fee; int lptype; struct sockaddr *lsa = NULL; #ifdef INET struct sctp_asconf_addrv4_param *fii; #endif if (stcb->asoc.asconf_supported == 0) { return (-100); } if (plen > sizeof(lstore)) { return (-23); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&lstore, min(plen, sizeof(lstore))); if (phdr == NULL) { return (-24); } fee = (struct sctp_asconf_addr_param *)phdr; lptype = ntohs(fee->addrp.ph.param_type); switch (lptype) { #ifdef INET case SCTP_IPV4_ADDRESS: if (plen != sizeof(struct sctp_asconf_addrv4_param)) { SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addrv4_param), plen); } else { fii = (struct sctp_asconf_addrv4_param *)fee; sin.sin_addr.s_addr = fii->addrp.addr; lsa = (struct sockaddr *)&sin; } break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (plen != sizeof(struct sctp_asconf_addr_param)) { SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addr_param), plen); } else { memcpy(sin6.sin6_addr.s6_addr, fee->addrp.addr, sizeof(fee->addrp.addr)); lsa = (struct sockaddr *)&sin6; } break; #endif default: break; } if (lsa) { (void)sctp_set_primary_addr(stcb, sa, NULL); } } else if (ptype == SCTP_HAS_NAT_SUPPORT) { stcb->asoc.peer_supports_nat = 1; } else if (ptype == SCTP_PRSCTP_SUPPORTED) { /* Peer supports pr-sctp */ peer_supports_prsctp = 1; } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; int num_ent, i; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&local_store, min(sizeof(local_store), plen)); if (phdr == NULL) { return (-25); } pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: peer_supports_asconf = 1; break; case SCTP_ASCONF_ACK: peer_supports_asconf_ack = 1; break; case SCTP_FORWARD_CUM_TSN: peer_supports_prsctp = 1; break; case SCTP_PACKET_DROPPED: peer_supports_pktdrop = 1; break; case SCTP_NR_SELECTIVE_ACK: peer_supports_nrsack = 1; break; case SCTP_STREAM_RESET: peer_supports_reconfig = 1; break; case SCTP_AUTHENTICATION: peer_supports_auth = 1; break; default: /* one I have not learned yet */ break; } } } else if (ptype == SCTP_RANDOM) { if (plen > sizeof(random_store)) break; if (got_random) { /* already processed a RANDOM */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)random_store, min(sizeof(random_store), plen)); if (phdr == NULL) return (-26); p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); /* enforce the random length */ if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); return (-27); } got_random = 1; } else if (ptype == SCTP_HMAC_LIST) { uint16_t num_hmacs; 
uint16_t i; if (plen > sizeof(hmacs_store)) break; if (got_hmacs) { /* already processed a HMAC list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)hmacs_store, min(plen, sizeof(hmacs_store))); if (phdr == NULL) return (-28); hmacs = (struct sctp_auth_hmac_algo *)phdr; hmacs_len = plen - sizeof(*hmacs); num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); /* validate the hmac list */ if (sctp_verify_hmac_param(hmacs, num_hmacs)) { return (-29); } if (stcb->asoc.peer_hmacs != NULL) sctp_free_hmaclist(stcb->asoc.peer_hmacs); stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs); if (stcb->asoc.peer_hmacs != NULL) { for (i = 0; i < num_hmacs; i++) { (void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs, ntohs(hmacs->hmac_ids[i])); } } got_hmacs = 1; } else if (ptype == SCTP_CHUNK_LIST) { int i; if (plen > sizeof(chunks_store)) break; if (got_chklist) { /* already processed a Chunks list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, min(plen, sizeof(chunks_store))); if (phdr == NULL) return (-30); chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); if (stcb->asoc.peer_auth_chunks != NULL) sctp_clear_chunklist(stcb->asoc.peer_auth_chunks); else stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist(); for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(chunks->chunk_types[i], stcb->asoc.peer_auth_chunks); /* record asconf/asconf-ack if listed */ if (chunks->chunk_types[i] == SCTP_ASCONF) saw_asconf = 1; if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) saw_asconf_ack = 1; } got_chklist = 1; } else if ((ptype == SCTP_HEARTBEAT_INFO) || (ptype == SCTP_STATE_COOKIE) || (ptype == SCTP_UNRECOG_PARAM) || (ptype == SCTP_COOKIE_PRESERVE) || (ptype == SCTP_SUPPORTED_ADDRTYPE) || (ptype == SCTP_ADD_IP_ADDRESS) || (ptype == SCTP_DEL_IP_ADDRESS) || (ptype == SCTP_ERROR_CAUSE_IND) || (ptype == SCTP_SUCCESS_REPORT)) { /* don't care */ ; } else { if ((ptype & 0x8000) == 0x0000) { /* * must stop processing the rest of the * param's. Any report bits were handled * with the call to * sctp_arethere_unrecognized_parameters() * when the INIT or INIT-ACK was first seen. 
*/ break; } } next_param: offset += SCTP_SIZE32(plen); if (offset >= limit) { break; } phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); } /* Now check to see if we need to purge any addresses */ TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) == SCTP_ADDR_NOT_IN_ASSOC) { /* This address has been removed from the asoc */ /* remove and free it */ stcb->asoc.numnets--; TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); sctp_free_remote_addr(net); if (net == stcb->asoc.primary_destination) { stcb->asoc.primary_destination = NULL; sctp_select_primary_destination(stcb); } } } if ((stcb->asoc.ecn_supported == 1) && (peer_supports_ecn == 0)) { stcb->asoc.ecn_supported = 0; } if ((stcb->asoc.prsctp_supported == 1) && (peer_supports_prsctp == 0)) { stcb->asoc.prsctp_supported = 0; } if ((stcb->asoc.auth_supported == 1) && ((peer_supports_auth == 0) || (got_random == 0) || (got_hmacs == 0))) { stcb->asoc.auth_supported = 0; } if ((stcb->asoc.asconf_supported == 1) && ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) || (stcb->asoc.auth_supported == 0) || (saw_asconf == 0) || (saw_asconf_ack == 0))) { stcb->asoc.asconf_supported = 0; } if ((stcb->asoc.reconfig_supported == 1) && (peer_supports_reconfig == 0)) { stcb->asoc.reconfig_supported = 0; } if ((stcb->asoc.nrsack_supported == 1) && (peer_supports_nrsack == 0)) { stcb->asoc.nrsack_supported = 0; } if ((stcb->asoc.pktdrop_supported == 1) && (peer_supports_pktdrop == 0)) { stcb->asoc.pktdrop_supported = 0; } /* validate authentication required parameters */ if ((peer_supports_auth == 0) && (got_chklist == 1)) { /* peer does not support auth but sent a chunks list? */ return (-31); } if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) { /* peer supports asconf but not auth? */ return (-32); } else if ((peer_supports_asconf == 1) && (peer_supports_auth == 1) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-33); } /* concatenate the full random key */ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; if (chunks != NULL) { keylen += sizeof(*chunks) + num_chunks; } new_key = sctp_alloc_key(keylen); if (new_key != NULL) { /* copy in the RANDOM */ if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; bcopy(p_random, new_key->key, keylen); } /* append in the AUTH chunks */ if (chunks != NULL) { bcopy(chunks, new_key->key + keylen, sizeof(*chunks) + num_chunks); keylen += sizeof(*chunks) + num_chunks; } /* append in the HMACs */ if (hmacs != NULL) { bcopy(hmacs, new_key->key + keylen, sizeof(*hmacs) + hmacs_len); } } else { /* failed to get memory for the key */ return (-34); } if (stcb->asoc.authinfo.peer_random != NULL) sctp_free_key(stcb->asoc.authinfo.peer_random); stcb->asoc.authinfo.peer_random = new_key; sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); return (0); } int sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa, struct sctp_nets *net) { /* make sure the requested primary address exists in the assoc */ if (net == NULL && sa) net = sctp_findnet(stcb, sa); if (net == NULL) { /* didn't find the requested primary address! 
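The keying material assembled above is the byte-wise concatenation of the complete RANDOM, CHUNKS, and HMAC-ALGO parameters, headers included, with CHUNKS skipped when the peer sent none (this is the peer half of the RFC 4895 shared key). The same construction with hypothetical flat buffers:

static size_t
build_peer_key(uint8_t *key, const uint8_t *rnd, size_t rnd_len,
    const uint8_t *chks, size_t chks_len,
    const uint8_t *hmacs, size_t hmacs_len)
{
	size_t off = 0;

	memcpy(key + off, rnd, rnd_len);	/* RANDOM first */
	off += rnd_len;
	if (chks != NULL) {			/* CHUNKS is optional */
		memcpy(key + off, chks, chks_len);
		off += chks_len;
	}
	memcpy(key + off, hmacs, hmacs_len);	/* HMAC-ALGO last */
	off += hmacs_len;
	return (off);
}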
*/ return (-1); } else { /* set the primary address */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* Must be confirmed, so queue to set */ net->dest_state |= SCTP_ADDR_REQ_PRIMARY; return (0); } stcb->asoc.primary_destination = net; if (!(net->dest_state & SCTP_ADDR_PF) && (stcb->asoc.alternate)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } net = TAILQ_FIRST(&stcb->asoc.nets); if (net != stcb->asoc.primary_destination) { /* * The first one on the list is NOT the primary. * sctp_cmpaddr() is much more efficient if the * primary is the first on the list, so make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } } int sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now) { /* * This function serves two purposes. It will see if a TAG can be * re-used, returning 1 if reuse is OK and 0 if the tag must not be * used. A secondary function is to purge out old tags that can be * removed. */ struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; struct sctpasochead *head; struct sctp_tcb *stcb; int i; SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { /* * We choose not to lock anything here. TCB's can't be * removed since we have the read lock, so they can't be * freed on us, same thing for the INP. I may be wrong with * this assumption, but we will go with it for now :-) */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (stcb->asoc.my_vtag == tag) { /* candidate */ if (stcb->rport != rport) { continue; } if (stcb->sctp_ep->sctp_lport != lport) { continue; } /* It's a tag that is in use */ SCTP_INP_INFO_RUNLOCK(); return (0); } } chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; /* Now what about timed wait? */ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* * Block(s) are present, let's see if we have this tag in the * list */ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if (twait_block->vtag_block[i].v_tag == 0) { /* not used */ continue; } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire < now->tv_sec) { /* Audit expires this guy */ twait_block->vtag_block[i].tv_sec_at_expire = 0; twait_block->vtag_block[i].v_tag = 0; twait_block->vtag_block[i].lport = 0; twait_block->vtag_block[i].rport = 0; } else if ((twait_block->vtag_block[i].v_tag == tag) && (twait_block->vtag_block[i].lport == lport) && (twait_block->vtag_block[i].rport == rport)) { /* Bad tag, sorry :< */ SCTP_INP_INFO_RUNLOCK(); return (0); } } } SCTP_INP_INFO_RUNLOCK(); return (1); } static void sctp_drain_mbufs(struct sctp_tcb *stcb) { /* * We must hunt this association for MBUF's past the cumack (i.e. * out of order data that we can renege on). */ struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *nchk; uint32_t cumulative_tsn_p1; struct sctp_queued_to_read *ctl, *nctl; int cnt, strmat; uint32_t gap, i; int fnd = 0; /* We look for anything larger than the cum-ack + 1 */ asoc = &stcb->asoc; if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) { /* none we can reneg on.
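The timewait scan in sctp_is_vtag_good() above mixes lookup with garbage collection: expired slots are wiped as they are passed over, and a live match means the tag is still quarantined. Reduced to its core (the memset stands in for the field-by-field clear in the real code):

for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
	if (twait_block->vtag_block[i].v_tag == 0)
		continue;	/* empty slot */
	if ((long)twait_block->vtag_block[i].tv_sec_at_expire < now->tv_sec) {
		/* lazy expiry: reclaim the slot in passing */
		memset(&twait_block->vtag_block[i], 0,
		    sizeof(twait_block->vtag_block[i]));
	} else if (twait_block->vtag_block[i].v_tag == tag &&
	    twait_block->vtag_block[i].lport == lport &&
	    twait_block->vtag_block[i].rport == rport) {
		return (0);	/* tag still in timewait; do not reuse */
	}
}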
*/ return; } SCTP_STAT_INCR(sctps_protocol_drains_done); cumulative_tsn_p1 = asoc->cumulative_tsn + 1; cnt = 0; /* First look in the re-assembly queue */ TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { if (SCTP_TSN_GT(chk->rec.data.TSN_seq, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn); asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size); sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } } /* Ok that was fun, now we will drain all the inbound streams? */ for (strmat = 0; strmat < asoc->streamincnt; strmat++) { TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].inqueue, next, nctl) { if (SCTP_TSN_GT(ctl->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, ctl->sinfo_tsn, asoc->mapping_array_base_tsn); asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length); sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next); if (ctl->data) { sctp_m_freem(ctl->data); ctl->data = NULL; } sctp_free_remote_addr(ctl->whoFrom); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl); SCTP_DECR_READQ_COUNT(); } } } if (cnt) { /* We must back down to see what the new highest is */ for (i = asoc->highest_tsn_inside_map; SCTP_TSN_GE(i, asoc->mapping_array_base_tsn); i--) { SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { asoc->highest_tsn_inside_map = i; fnd = 1; break; } } if (!fnd) { asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; } /* * Question, should we go through the delivery queue? The * only reason things are on here is the app not reading OR * a p-d-api up. An attacker COULD send enough in to * initiate the PD-API and then send a bunch of stuff to * other streams... these would wind up on the delivery * queue.. and then we would not get to them. But in order * to do this I then have to back-track and un-deliver * sequence numbers in streams.. el-yucko. I think for now * we will NOT look at the delivery queue and leave it to be * something to consider later. An alternative would be to * abort the P-D-API with a notification and then deliver * the data.... Or another method might be to keep track of * how many times the situation occurs and if we see a * possible attack underway just abort the association. */ #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt); #endif /* * Now do we need to find a new * asoc->highest_tsn_inside_map? */ asoc->last_revoke_count = cnt; (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer); /* sa_ignore NO_NULL_CHK */ sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED); } /* * Another issue, in un-setting the TSN's in the mapping array we * DID NOT adjust the highest_tsn marker. This will cause one of * two things to occur. It may cause us to do extra work in checking * for our mapping array movement. More importantly it may cause us * to SACK every datagram. This may not be a bad thing though since * we will recover once we get our cum-ack above and all this stuff * we dumped recovered. 
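When chunks are reneged above, bits are cleared in the mapping array, so the highest mapped TSN has to be recomputed by scanning backwards from the old maximum. Ignoring serial-number wrap (which SCTP_CALC_TSN_TO_GAP handles in the real code), the scan looks like this:

uint32_t tsn, gap, highest = base_tsn - 1;	/* default: map is empty */

for (tsn = old_highest; SCTP_TSN_GE(tsn, base_tsn); tsn--) {
	gap = tsn - base_tsn;		/* simplified gap computation */
	if (SCTP_IS_TSN_PRESENT(mapping_array, gap)) {
		highest = tsn;		/* first set bit from the top */
		break;
	}
}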
*/ } void sctp_drain() { /* * We must walk the PCB lists for ALL associations here. The system * is LOW on MBUF's and needs help. This is where reneging will * occur. We really hope this does NOT happen! */ VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct sctp_inpcb *inp; struct sctp_tcb *stcb; SCTP_STAT_INCR(sctps_protocol_drain_calls); if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { #ifdef VIMAGE continue; #else return; #endif } SCTP_INP_INFO_RLOCK(); LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { /* For each endpoint */ SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { /* For each association */ SCTP_TCB_LOCK(stcb); sctp_drain_mbufs(stcb); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } SCTP_INP_INFO_RUNLOCK(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * start a new iterator * iterates through all endpoints and associations based on the pcb_state * flags and asoc_state. "af" (mandatory) is executed for all matching * assocs and "ef" (optional) is executed when the iterator completes. * "inpf" (optional) is executed for each new endpoint as it is being * iterated through. inpe (optional) is called when the inp completes * its way through all the stcbs. */ int sctp_initiate_iterator(inp_func inpf, asoc_func af, inp_func inpe, uint32_t pcb_state, uint32_t pcb_features, uint32_t asoc_state, void *argp, uint32_t argi, end_func ef, struct sctp_inpcb *s_inp, uint8_t chunk_output_off) { struct sctp_iterator *it = NULL; if (af == NULL) { return (-1); } SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator), SCTP_M_ITER); if (it == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); return (ENOMEM); } memset(it, 0, sizeof(*it)); it->function_assoc = af; it->function_inp = inpf; if (inpf) it->done_current_ep = 0; else it->done_current_ep = 1; it->function_atend = ef; it->pointer = argp; it->val = argi; it->pcb_flags = pcb_state; it->pcb_features = pcb_features; it->asoc_state = asoc_state; it->function_inp_end = inpe; it->no_chunk_output = chunk_output_off; it->vn = curvnet; if (s_inp) { /* Assume lock is held here */ it->inp = s_inp; SCTP_INP_INCR_REF(it->inp); it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP; } else { SCTP_INP_INFO_RLOCK(); it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead)); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } SCTP_INP_INFO_RUNLOCK(); it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP; } SCTP_IPI_ITERATOR_WQ_LOCK(); TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (sctp_it_ctl.iterator_running == 0) { sctp_wakeup_iterator(); } SCTP_IPI_ITERATOR_WQ_UNLOCK(); /* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */ return (0); } Index: projects/vnet/sys/netinet/sctp_usrreq.c =================================================================== --- projects/vnet/sys/netinet/sctp_usrreq.c (revision 294565) +++ projects/vnet/sys/netinet/sctp_usrreq.c (revision 294566) @@ -1,7419 +1,7419 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. 
* * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include extern struct sctp_cc_functions sctp_cc_functions[]; extern struct sctp_ss_functions sctp_ss_functions[]; void sctp_init(void) { u_long sb_max_adj; /* Initialize and modify the sysctled variables */ sctp_init_sysctls(); if ((nmbclusters / 8) > SCTP_ASOC_MAX_CHUNKS_ON_QUEUE) SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = (nmbclusters / 8); /* * Allow a user to take no more than 1/2 the number of clusters or * the SB_MAX whichever is smaller for the send window. */ sb_max_adj = (u_long)((u_quad_t) (SB_MAX) * MCLBYTES / (MSIZE + MCLBYTES)); SCTP_BASE_SYSCTL(sctp_sendspace) = min(sb_max_adj, (((uint32_t) nmbclusters / 2) * SCTP_DEFAULT_MAXSEGMENT)); /* * Now for the recv window, should we take the same amount? or * should I do 1/2 the SB_MAX instead in the SB_MAX min above. For * now I will just copy. 
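The clamp computed above splits the socket-buffer byte limit between cluster payload and mbuf bookkeeping. A worked example, assuming the common defaults SB_MAX = 2097152 (2 MB), MSIZE = 256, and MCLBYTES = 2048 (all three vary with kernel configuration):

u_long sb_max_adj;

sb_max_adj = (u_long)((u_quad_t)2097152 * 2048 / (256 + 2048));
/* = 1864135: of every 2304 accounted bytes, 2048 are usable payload */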
*/ SCTP_BASE_SYSCTL(sctp_recvspace) = SCTP_BASE_SYSCTL(sctp_sendspace); SCTP_BASE_VAR(first_time) = 0; SCTP_BASE_VAR(sctp_pcb_initialized) = 0; sctp_pcb_init(); #if defined(SCTP_PACKET_LOGGING) SCTP_BASE_VAR(packet_log_writers) = 0; SCTP_BASE_VAR(packet_log_end) = 0; bzero(&SCTP_BASE_VAR(packet_log_buffer), SCTP_PACKET_LOG_SIZE); #endif } -void +static void sctp_finish(void) { sctp_pcb_finish(); } - +VNET_SYSUNINIT(sctp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, sctp_finish, NULL); void sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz) { struct sctp_tmit_chunk *chk; uint16_t overhead; /* Adjust that too */ stcb->asoc.smallest_mtu = nxtsz; /* now off to subtract IP_DF flag if needed */ overhead = IP_HDR_SIZE; if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { overhead += sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } TAILQ_FOREACH(chk, &stcb->asoc.send_queue, sctp_next) { if ((chk->send_size + overhead) > nxtsz) { chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } } TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if ((chk->send_size + overhead) > nxtsz) { /* * For this guy we also mark it for immediate resend, * since we sent too big a chunk. */ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; if (chk->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); chk->sent = SCTP_DATAGRAM_RESEND; sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); chk->rec.data.doing_fast_retransmit = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU, chk->whoTo->flight_size, chk->book_size, (uintptr_t) chk->whoTo, chk->rec.data.TSN_seq); } /* Clear any time so NO RTT is being done */ chk->do_rtt = 0; } } } } #ifdef INET static void sctp_notify_mbuf(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, struct ip *ip, struct sctphdr *sh) { struct icmp *icmph; int totsz, tmr_stopped = 0; uint16_t nxtsz; /* protection */ if ((inp == NULL) || (stcb == NULL) || (net == NULL) || (ip == NULL) || (sh == NULL)) { if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } return; } /* First job is to verify the vtag matches what I would send */ if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) { SCTP_TCB_UNLOCK(stcb); return; } icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) - sizeof(struct ip))); if (icmph->icmp_type != ICMP_UNREACH) { /* We only care about unreachable */ SCTP_TCB_UNLOCK(stcb); return; } if (icmph->icmp_code != ICMP_UNREACH_NEEDFRAG) { /* not an unreachable message due to fragmentation */ SCTP_TCB_UNLOCK(stcb); return; } totsz = ntohs(ip->ip_len); nxtsz = ntohs(icmph->icmp_nextmtu); if (nxtsz == 0) { /* * an old-style router that does not tell us what the next * MTU size is; rats, we will have to guess (in an educated * fashion, of course) */ nxtsz = sctp_get_prev_mtu(totsz); } /* Stop any PMTU timer */ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { tmr_stopped = 1; sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1); } /* Adjust destination size limit */ if (net->mtu > nxtsz) { net->mtu = nxtsz; if (net->port) { net->mtu -= sizeof(struct udphdr); } } /* now what about the ep?
*/ if (stcb->asoc.smallest_mtu > nxtsz) { sctp_pathmtu_adjustment(stcb, nxtsz); } if (tmr_stopped) sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); SCTP_TCB_UNLOCK(stcb); } void sctp_notify(struct sctp_inpcb *inp, struct ip *ip, struct sctphdr *sh, struct sockaddr *to, struct sctp_tcb *stcb, struct sctp_nets *net) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif struct icmp *icmph; /* protection */ if ((inp == NULL) || (stcb == NULL) || (net == NULL) || (sh == NULL) || (to == NULL)) { if (stcb) SCTP_TCB_UNLOCK(stcb); return; } /* First job is to verify the vtag matches what I would send */ if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) { SCTP_TCB_UNLOCK(stcb); return; } icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) - sizeof(struct ip))); if (icmph->icmp_type != ICMP_UNREACH) { /* We only care about unreachable */ SCTP_TCB_UNLOCK(stcb); return; } if ((icmph->icmp_code == ICMP_UNREACH_NET) || (icmph->icmp_code == ICMP_UNREACH_HOST) || (icmph->icmp_code == ICMP_UNREACH_NET_UNKNOWN) || (icmph->icmp_code == ICMP_UNREACH_HOST_UNKNOWN) || (icmph->icmp_code == ICMP_UNREACH_ISOLATED) || (icmph->icmp_code == ICMP_UNREACH_NET_PROHIB) || (icmph->icmp_code == ICMP_UNREACH_HOST_PROHIB) || (icmph->icmp_code == ICMP_UNREACH_FILTER_PROHIB)) { /* * Hmm, reachability problems we must examine closely. If it * is not reachable, we may have lost a network. Or if there * is NO protocol at the other end named SCTP, we consider it * an OOTB abort. */ if (net->dest_state & SCTP_ADDR_REACHABLE) { /* OK, that destination is NOT reachable */ net->dest_state &= ~SCTP_ADDR_REACHABLE; net->dest_state &= ~SCTP_ADDR_PF; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED); } SCTP_TCB_UNLOCK(stcb); } else if ((icmph->icmp_code == ICMP_UNREACH_PROTOCOL) || (icmph->icmp_code == ICMP_UNREACH_PORT)) { /* * Here the peer is either playing tricks on us, including * an address that belongs to someone who does not support * SCTP, OR it was a userland implementation that shut down * and is now dead. In either case treat it like an OOTB * abort with no TCB. */ sctp_abort_notification(stcb, 1, 0, NULL, SCTP_SO_NOT_LOCKED); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); /* SCTP_TCB_UNLOCK(stcb); MT: I think this is not needed. */ #endif /* no need to unlock here, since the TCB is gone */ } else { SCTP_TCB_UNLOCK(stcb); } } #endif #ifdef INET void sctp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct sctphdr *sh; uint32_t vrf_id; /* FIX, for non-bsd is this right?
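The path-MTU handling in sctp_notify_mbuf() above reduces to three steps: take the router-supplied next-hop MTU (guessing the next smaller well-known MTU when an old router reports zero), clamp the destination, and shrink anything queued that no longer fits. Condensed, with the same field names:

nxtsz = ntohs(icmph->icmp_nextmtu);
if (nxtsz == 0)		/* old router: no next-hop MTU supplied */
	nxtsz = sctp_get_prev_mtu(ntohs(ip->ip_len));
if (net->mtu > nxtsz) {
	net->mtu = nxtsz;
	if (net->port)	/* UDP-encapsulated: leave room for the header */
		net->mtu -= sizeof(struct udphdr);
}
if (stcb->asoc.smallest_mtu > nxtsz)
	sctp_pathmtu_adjustment(stcb, nxtsz);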
*/ vrf_id = SCTP_DEFAULT_VRFID; if (sa->sa_family != AF_INET || ((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) { return; } if (PRC_IS_REDIRECT(cmd)) { ip = 0; } else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) { return; } if (ip) { struct sctp_inpcb *inp = NULL; struct sctp_tcb *stcb = NULL; struct sctp_nets *net = NULL; struct sockaddr_in to, from; sh = (struct sctphdr *)((caddr_t)ip + (ip->ip_hl << 2)); bzero(&to, sizeof(to)); bzero(&from, sizeof(from)); from.sin_family = to.sin_family = AF_INET; from.sin_len = to.sin_len = sizeof(to); from.sin_port = sh->src_port; from.sin_addr = ip->ip_src; to.sin_port = sh->dest_port; to.sin_addr = ip->ip_dst; /* * 'to' holds the dest of the packet that failed to be sent. * 'from' holds our local endpoint address. Thus we reverse * the to and the from in the lookup. */ stcb = sctp_findassociation_addr_sa((struct sockaddr *)&to, (struct sockaddr *)&from, &inp, &net, 1, vrf_id); if (stcb != NULL && inp && (inp->sctp_socket != NULL)) { if (cmd != PRC_MSGSIZE) { sctp_notify(inp, ip, sh, (struct sockaddr *)&to, stcb, net); } else { /* handle possible ICMP size messages */ sctp_notify_mbuf(inp, stcb, net, ip, sh); } } else { if ((stcb == NULL) && (inp != NULL)) { /* reduce ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } if (stcb) { SCTP_TCB_UNLOCK(stcb); } } } return; } #endif static int sctp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct sctp_inpcb *inp; struct sctp_nets *net; struct sctp_tcb *stcb; int error; uint32_t vrf_id; /* FIX, for non-bsd is this right? */ vrf_id = SCTP_DEFAULT_VRFID; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); stcb = sctp_findassociation_addr_sa(sintosa(&addrs[1]), sintosa(&addrs[0]), &inp, &net, 1, vrf_id); if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) { if ((inp != NULL) && (stcb == NULL)) { /* reduce ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); goto cred_can_cont; } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; goto out; } SCTP_TCB_UNLOCK(stcb); /* * We use the write lock here, only since in the error leg we need * it. If we used RLOCK, then we would have to * wlock/decr/unlock/rlock. Which in theory could create a hole. * Better to use higher wlock. 
*/ SCTP_INP_WLOCK(inp); cred_can_cont: error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket); if (error) { SCTP_INP_WUNLOCK(inp); goto out; } cru2x(inp->sctp_socket->so_cred, &xuc); SCTP_INP_WUNLOCK(inp); error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); out: return (error); } SYSCTL_PROC(_net_inet_sctp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW, 0, 0, sctp_getcred, "S,ucred", "Get the ucred of a SCTP connection"); #ifdef INET static void sctp_abort(struct socket *so) { struct sctp_inpcb *inp; uint32_t flags; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { return; } sctp_must_try_again: flags = inp->sctp_flags; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 17); #endif if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 16); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); SOCK_LOCK(so); SCTP_SB_CLEAR(so->so_snd); /* * same for the rcv ones, they are only here for the * accounting/select. */ SCTP_SB_CLEAR(so->so_rcv); /* Now null out the reference, we are completely detached. */ so->so_pcb = NULL; SOCK_UNLOCK(so); } else { flags = inp->sctp_flags; if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { goto sctp_must_try_again; } } return; } static int sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED) { struct sctp_inpcb *inp; struct inpcb *ip_inp; int error; uint32_t vrf_id = SCTP_DEFAULT_VRFID; inp = (struct sctp_inpcb *)so->so_pcb; if (inp != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace)); if (error) { return (error); } } error = sctp_inpcb_alloc(so, vrf_id); if (error) { return (error); } inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_WLOCK(inp); inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUND_V6; /* I'm not v6! */ ip_inp = &inp->ip_inp.inp; ip_inp->inp_vflag |= INP_IPV4; ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl); SCTP_INP_WUNLOCK(inp); return (0); } static int sctp_bind(struct socket *so, struct sockaddr *addr, struct thread *p) { struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (addr != NULL) { if ((addr->sa_family != AF_INET) || (addr->sa_len != sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } } return (sctp_inpcb_bind(so, addr, NULL, p)); } #endif void sctp_close(struct socket *so) { struct sctp_inpcb *inp; uint32_t flags; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) return; /* * Inform all the lower layer assoc that we are done. 
*/ sctp_must_try_again: flags = inp->sctp_flags; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 17); #endif if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) || (so->so_rcv.sb_cc > 0)) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 13); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); } else { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 14); #endif sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, SCTP_CALLED_AFTER_CMPSET_OFCLOSE); } /* * The socket is now detached, no matter what the state of * the SCTP association. */ SOCK_LOCK(so); SCTP_SB_CLEAR(so->so_snd); /* * same for the rcv ones, they are only here for the * accounting/select. */ SCTP_SB_CLEAR(so->so_rcv); /* Now null out the reference, we are completely detached. */ so->so_pcb = NULL; SOCK_UNLOCK(so); } else { flags = inp->sctp_flags; if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { goto sctp_must_try_again; } } return; } int sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p); int sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { struct sctp_inpcb *inp; int error; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { if (control) { sctp_m_freem(control); control = NULL; } SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); sctp_m_freem(m); return (EINVAL); } /* Got to have an to address if we are NOT a connected socket */ if ((addr == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE))) { goto connected_type; } else if (addr == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); error = EDESTADDRREQ; sctp_m_freem(m); if (control) { sctp_m_freem(control); control = NULL; } return (error); } #ifdef INET6 if (addr->sa_family != AF_INET) { /* must be a v4 address! */ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); sctp_m_freem(m); if (control) { sctp_m_freem(control); control = NULL; } error = EDESTADDRREQ; return (error); } #endif /* INET6 */ connected_type: /* now what about control */ if (control) { if (inp->control) { SCTP_PRINTF("huh? control set?\n"); sctp_m_freem(inp->control); inp->control = NULL; } inp->control = control; } /* Place the data */ if (inp->pkt) { SCTP_BUF_NEXT(inp->pkt_last) = m; inp->pkt_last = m; } else { inp->pkt_last = inp->pkt = m; } if ( /* FreeBSD uses a flag passed */ ((flags & PRUS_MORETOCOME) == 0) ) { /* * note with the current version this code will only be used * by OpenBSD-- NetBSD, FreeBSD, and MacOS have methods for * re-defining sosend to use the sctp_sosend. One can * optionally switch back to this code (by changing back the * definitions) but this is not advisable. This code is used * by FreeBSD when sending a file with sendfile() though. 
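Both sctp_abort() and sctp_close() above rely on the same compare-and-swap idiom so that exactly one caller wins the right to free the PCB, however close() and abort() race. Stripped of the logging, the idiom is:

uint32_t flags;

do {
	flags = inp->sctp_flags;
	if (flags & SCTP_PCB_FLAGS_SOCKET_GONE)
		return;		/* another thread is already tearing down */
} while (!atomic_cmpset_int(&inp->sctp_flags, flags,
    flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP));
/* sole winner: sctp_inpcb_free() runs exactly once */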
*/ int ret; ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags); inp->pkt = NULL; inp->control = NULL; return (ret); } else { return (0); } } int sctp_disconnect(struct socket *so) { struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); return (ENOTCONN); } SCTP_INP_RLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { if (LIST_EMPTY(&inp->sctp_asoc_list)) { /* No connection */ SCTP_INP_RUNLOCK(inp); return (0); } else { struct sctp_association *asoc; struct sctp_tcb *stcb; stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_TCB_LOCK(stcb); asoc = &stcb->asoc; if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* We are about to be freed, out of here */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) || (so->so_rcv.sb_cc > 0)) { if (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) { /* Left with Data unread */ struct mbuf *err; err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); if (err) { /* * Fill in the user * initiated abort */ struct sctp_paramhdr *ph; ph = mtod(err, struct sctp_paramhdr *); SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons(SCTP_BUF_LEN(err)); } sctp_send_abort_tcb(stcb, err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } SCTP_INP_RUNLOCK(inp); if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3); /* No unlock tcb assoc is gone */ return (0); } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { /* there is nothing queued to send, so done */ if (asoc->locked_on_sending) { goto abort_anyway; } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* only send SHUTDOWN 1st time thru */ struct sctp_nets *netp; if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED); } } else { /* * we still got (or just got) data to send, * so set SHUTDOWN_PENDING */ /* * XXX sockets draft says that SCTP_EOF * should be sent with no data. 
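The error cause built in sctp_disconnect() above, for a lingering close that finds unread data, is just a bare cause header, User-Initiated Abort (cause code 12), so the mbuf carries exactly four bytes, 00 0c 00 04, on the wire:

ph = mtod(err, struct sctp_paramhdr *);
SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr);
ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);	/* 0x000c */
ph->param_length = htons(SCTP_BUF_LEN(err));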
currently, * we will allow user data to be sent first * and move to SHUTDOWN-PENDING */ struct sctp_nets *netp; if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); if (asoc->locked_on_sending) { /* Locked to send out the data */ struct sctp_stream_queue_pending *sp; sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); if (sp == NULL) { SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n", asoc->locked_on_sending->stream_no); } else { if ((sp->length == 0) && (sp->msg_is_complete == 0)) asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_INP_RUNLOCK(inp); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5); return (0); } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); } } soisdisconnecting(so); SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } /* not reached */ } else { /* UDP model does not support this */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } } int sctp_flush(struct socket *so, int how) { /* * We will just clear out the values and let subsequent close clear * out the data, if any. Note if the user did a shutdown(SHUT_RD) * they will not be able to read the data, the socket will block * that from happening. */ struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_INP_RLOCK(inp); /* For the 1 to many model this does nothing */ if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { SCTP_INP_RUNLOCK(inp); return (0); } SCTP_INP_RUNLOCK(inp); if ((how == PRU_FLUSH_RD) || (how == PRU_FLUSH_RDWR)) { /* * First make sure the sb will be happy, we don't use these * except maybe the count */ SCTP_INP_WLOCK(inp); SCTP_INP_READ_LOCK(inp); inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_CANT_READ; SCTP_INP_READ_UNLOCK(inp); SCTP_INP_WUNLOCK(inp); so->so_rcv.sb_cc = 0; so->so_rcv.sb_mbcnt = 0; so->so_rcv.sb_mb = NULL; } if ((how == PRU_FLUSH_WR) || (how == PRU_FLUSH_RDWR)) { /* * First make sure the sb will be happy, we don't use these * except maybe the count */ so->so_snd.sb_cc = 0; so->so_snd.sb_mbcnt = 0; so->so_snd.sb_mb = NULL; } return (0); } int sctp_shutdown(struct socket *so) { struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_INP_RLOCK(inp); /* For UDP model this is a invalid call */ if (!((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { /* Restore the flags that the soshutdown took away. 
*/ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_state &= ~SBS_CANTRCVMORE; SOCKBUF_UNLOCK(&so->so_rcv); /* This proc will wakeup for read and do nothing (I hope) */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } else { /* * Ok, if we reach here its the TCP model and it is either a * SHUT_WR or SHUT_RDWR. This means we put the shutdown flag * against it. */ struct sctp_tcb *stcb; struct sctp_association *asoc; struct sctp_nets *netp; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { SCTP_INP_RUNLOCK(inp); return (ENOTCONN); } socantsendmore(so); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { /* * Ok, we hit the case that the shutdown call was * made after an abort or something. Nothing to do * now. */ SCTP_INP_RUNLOCK(inp); return (0); } SCTP_TCB_LOCK(stcb); asoc = &stcb->asoc; if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_ECHOED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN)) { /* * If we are not in or before ESTABLISHED, there is * no protocol action required. */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { if (asoc->locked_on_sending) { goto abort_anyway; } /* there is nothing queued to send, so I'm done... */ SCTP_STAT_DECR_GAUGE32(sctps_currestab); SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); } else { /* * We still got (or just got) data to send, so set * SHUTDOWN_PENDING. */ SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); if (asoc->locked_on_sending) { /* Locked to send out the data */ struct sctp_stream_queue_pending *sp; sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); if (sp == NULL) { SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n", asoc->locked_on_sending->stream_no); } else { if ((sp->length == 0) && (sp->msg_is_complete == 0)) { SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT); } } } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_LOCKED); SCTP_INP_RUNLOCK(inp); return (0); } } sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); /* * XXX: Why do this in the case where we have still data * queued? */ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); return (0); } } /* * copies a "user" presentable address and removes embedded scope, etc. 
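Once the TCB is pinned, the TCP-model shutdown above comes down to a single decision; in outline (ignoring the locked-sender, partial-message, and guard-timer cases handled in the full code):

if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) &&
    (asoc->stream_queue_cnt == 0)) {
	/* nothing queued: enter SHUTDOWN-SENT and emit SHUTDOWN now */
	SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
	sctp_send_shutdown(stcb, netp);
} else {
	/* data still queued: mark SHUTDOWN-PENDING and drain first */
	SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
}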
* returns 0 on success, 1 on error */ static uint32_t sctp_fill_user_address(struct sockaddr_storage *ss, struct sockaddr *sa) { #ifdef INET6 struct sockaddr_in6 lsa6; sa = (struct sockaddr *)sctp_recover_scope((struct sockaddr_in6 *)sa, &lsa6); #endif memcpy(ss, sa, sa->sa_len); return (0); } /* * NOTE: assumes addr lock is held */ static size_t sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, struct sctp_tcb *stcb, size_t limit, struct sockaddr_storage *sas, uint32_t vrf_id) { struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; size_t actual; int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; actual = 0; if (limit <= 0) return (actual); if (stcb) { /* Turn on all the appropriate scope */ loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif } else { /* Use generic values for endpoints. */ loopback_scope = 1; #if defined(INET) ipv4_local_scope = 1; #endif #if defined(INET6) local_scope = 1; site_scope = 1; #endif if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { #if defined(INET6) ipv6_addr_legal = 1; #endif #if defined(INET) if (SCTP_IPV6_V6ONLY(inp)) { ipv4_addr_legal = 0; } else { ipv4_addr_legal = 1; } #endif } else { #if defined(INET6) ipv6_addr_legal = 0; #endif #if defined(INET) ipv4_addr_legal = 1; #endif } } vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { return (0); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* Skip loopback if loopback_scope not set */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (stcb) { /* * For the BOUND-ALL case, the list * associated with a TCB is Always * considered a reverse list.. i.e. * it lists addresses that are NOT * part of the association. If this * is one of those we must skip it. 
*/ if (sctp_is_addr_restricted(stcb, sctp_ifa)) { continue; } } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin; sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* * we skip * unspecified * addresses */ continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if ((ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { continue; } #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { in6_sin_2_v4mapsin6(sin, (struct sockaddr_in6 *)sas); ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in6)); actual += sizeof(struct sockaddr_in6); } else { #endif memcpy(sas, sin, sizeof(*sin)); ((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin)); actual += sizeof(*sin); #ifdef INET6 } #endif if (actual >= limit) { return (actual); } } else { continue; } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * we skip * unspecified * addresses */ continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) /* * bad link * local * address */ continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } memcpy(sas, sin6, sizeof(*sin6)); ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin6)); actual += sizeof(*sin6); if (actual >= limit) { return (actual); } } else { continue; } break; #endif default: /* TSNH */ break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (stcb) { if (sctp_is_addr_restricted(stcb, laddr->ifa)) { continue; } } if (sctp_fill_user_address(sas, &laddr->ifa->address.sa)) continue; switch (laddr->ifa->address.sa.sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; break; #endif default: /* TSNH */ break; } sas = (struct sockaddr_storage *)((caddr_t)sas + laddr->ifa->address.sa.sa_len); actual += laddr->ifa->address.sa.sa_len; if (actual >= limit) { return (actual); } } } return (actual); } static size_t sctp_fill_up_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, size_t limit, struct sockaddr_storage *sas) { size_t size = 0; SCTP_IPI_ADDR_RLOCK(); /* fill up addresses for the endpoint's default vrf */ size = sctp_fill_up_addresses_vrf(inp, stcb, limit, sas, inp->def_vrf_id); SCTP_IPI_ADDR_RUNLOCK(); return (size); } /* * NOTE: assumes addr lock is held */ static int sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id) { int cnt = 0; struct sctp_vrf *vrf = NULL; /* * In both sub-set bound and bound_all cases we return the MAXIMUM * number of addresses that you COULD get. In reality the sub-set * bound may have an exclusion list for a given TCB OR in the * bound-all case a TCB may NOT include the loopback or other * addresses as well.
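 * The value is therefore only an upper bound, intended for sizing
 * a buffer (see SCTP_GET_LOCAL_ADDR_SIZE below), not an exact count.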
*/ vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { return (0); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { /* Count them if they are the right type */ switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) cnt += sizeof(struct sockaddr_in6); else cnt += sizeof(struct sockaddr_in); #else cnt += sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: cnt += sizeof(struct sockaddr_in6); break; #endif default: break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { switch (laddr->ifa->address.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) cnt += sizeof(struct sockaddr_in6); else cnt += sizeof(struct sockaddr_in); #else cnt += sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: cnt += sizeof(struct sockaddr_in6); break; #endif default: break; } } } return (cnt); } static int sctp_count_max_addresses(struct sctp_inpcb *inp) { int cnt = 0; SCTP_IPI_ADDR_RLOCK(); /* count addresses for the endpoint's default VRF */ cnt = sctp_count_max_addresses_vrf(inp, inp->def_vrf_id); SCTP_IPI_ADDR_RUNLOCK(); return (cnt); } static int sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, size_t optsize, void *p, int delay) { int error = 0; int creat_lock_on = 0; struct sctp_tcb *stcb = NULL; struct sockaddr *sa; int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; uint32_t vrf_id; int bad_addresses = 0; sctp_assoc_t *a_id; SCTPDBG(SCTP_DEBUG_PCB1, "Connectx called\n"); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); SCTP_INP_RUNLOCK(inp); } if (stcb) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); return (EALREADY); } SCTP_INP_INCR_REF(inp); SCTP_ASOC_CREATE_LOCK(inp); creat_lock_on = 1; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT); error = EFAULT; goto out_now; } totaddrp = (int *)optval; totaddr = *totaddrp; sa = (struct sockaddr *)(totaddrp + 1); stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (optsize - sizeof(int)), &bad_addresses); if ((stcb != NULL) || bad_addresses) { /* Already have or am bringing up an association */ SCTP_ASOC_CREATE_UNLOCK(inp); creat_lock_on = 0; if (stcb) SCTP_TCB_UNLOCK(stcb); if (bad_addresses == 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; } goto out_now; } #ifdef INET6 if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && (num_v6 > 0)) { error = EINVAL; goto out_now; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (num_v4 > 0)) { struct in6pcb *inp6; inp6 = (struct in6pcb *)inp; if (SCTP_IPV6_V6ONLY(inp6)) { /* * If the
IPV6_V6ONLY flag is set, reject connection attempts * destined to a v4 addr or a v4-mapped addr. */ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_now; } } #endif /* INET6 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == SCTP_PCB_FLAGS_UNBOUND) { /* Bind an ephemeral port */ error = sctp_inpcb_bind(so, NULL, NULL, p); if (error) { goto out_now; } } /* FIX ME: do we want to pass in a vrf on the connect call? */ vrf_id = inp->def_vrf_id; /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, (struct thread *)p ); if (stcb == NULL) { /* Gak! no memory */ goto out_now; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* Set the connected flag so we can queue data */ soisconnecting(so); } SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); /* move to second address */ switch (sa->sa_family) { #ifdef INET case AF_INET: sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in)); break; #endif #ifdef INET6 case AF_INET6: sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in6)); break; #endif default: break; } error = 0; sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); /* Fill in the return id */ if (error) { (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); goto out_now; } a_id = (sctp_assoc_t *) optval; *a_id = sctp_get_associd(stcb); /* initialize authentication parameters for the assoc */ sctp_initialize_auth_params(inp, stcb); if (delay) { /* doing delayed connection */ stcb->asoc.delayed_connection = 1; sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination); } else { (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); } SCTP_TCB_UNLOCK(stcb); out_now: if (creat_lock_on) { SCTP_ASOC_CREATE_UNLOCK(inp); } SCTP_INP_DECR_REF(inp); return (error); } #define SCTP_FIND_STCB(inp, stcb, assoc_id) { \ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||\ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { \ SCTP_INP_RLOCK(inp); \ stcb = LIST_FIRST(&inp->sctp_asoc_list); \ if (stcb) { \ SCTP_TCB_LOCK(stcb); \ } \ SCTP_INP_RUNLOCK(inp); \ } else if (assoc_id > SCTP_ALL_ASSOC) { \ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \ if (stcb == NULL) { \ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \ error = ENOENT; \ break; \ } \ } else { \ stcb = NULL; \ } \ } #define SCTP_CHECK_AND_CAST(destp, srcp, type, size) {\ if (size < sizeof(type)) { \ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); \ error = EINVAL; \ break; \ } else { \ destp = (type *)srcp; \ } \ } static int sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize, void *p) { struct sctp_inpcb *inp = NULL; int error, val = 0; struct sctp_tcb *stcb = NULL; if (optval == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return EINVAL; } error = 0; switch (optname) { case SCTP_NODELAY: case SCTP_AUTOCLOSE: case SCTP_EXPLICIT_EOR: case SCTP_AUTO_ASCONF: case SCTP_DISABLE_FRAGMENTS: case SCTP_I_WANT_MAPPED_V4_ADDR: case
SCTP_USE_EXT_RCVINFO: SCTP_INP_RLOCK(inp); switch (optname) { case SCTP_DISABLE_FRAGMENTS: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT); break; case SCTP_I_WANT_MAPPED_V4_ADDR: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4); break; case SCTP_AUTO_ASCONF: if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* only valid for bound all sockets */ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto flags_out; } break; case SCTP_EXPLICIT_EOR: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); break; case SCTP_NODELAY: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY); break; case SCTP_USE_EXT_RCVINFO: val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO); break; case SCTP_AUTOCLOSE: if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) val = TICKS_TO_SEC(inp->sctp_ep.auto_close_time); else val = 0; break; default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } /* end switch (sopt->sopt_name) */ if (*optsize < sizeof(val)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } flags_out: SCTP_INP_RUNLOCK(inp); if (error == 0) { /* return the option value */ *(int *)optval = val; *optsize = sizeof(val); } break; case SCTP_GET_PACKET_LOG: { #ifdef SCTP_PACKET_LOGGING uint8_t *target; int ret; SCTP_CHECK_AND_CAST(target, optval, uint8_t, *optsize); ret = sctp_copy_out_packet_log(target, (int)*optsize); *optsize = ret; #else SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; #endif break; } case SCTP_REUSE_PORT: { uint32_t *value; if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { /* Can't do this for a 1-m socket */ error = EINVAL; break; } SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); *optsize = sizeof(uint32_t); break; } case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = inp->partial_delivery_point; *optsize = sizeof(uint32_t); break; } case SCTP_FRAGMENT_INTERLEAVE: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) { if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) { *value = SCTP_FRAG_LEVEL_2; } else { *value = SCTP_FRAG_LEVEL_1; } } else { *value = SCTP_FRAG_LEVEL_0; } *optsize = sizeof(uint32_t); break; } case SCTP_CMT_ON_OFF: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.sctp_cmt_on_off; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_cmt_on_off; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.congestion_control_module; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) 
|| (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_ep.sctp_default_cc_module; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, *optsize); SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id); if (stcb == NULL) { error = EINVAL; } else { if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; } else { error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, cc_opt); *optsize = sizeof(struct sctp_cc_option); } SCTP_TCB_UNLOCK(stcb); } break; } case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.stream_scheduling_module; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_ep.sctp_default_ss_module; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_SS_VALUE: { struct sctp_stream_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_stream_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { if ((av->stream_id >= stcb->asoc.streamoutcnt) || (stcb->asoc.ss_functions.sctp_ss_get_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id], &av->stream_value) < 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { *optsize = sizeof(struct sctp_stream_value); } SCTP_TCB_UNLOCK(stcb); } else { /* * Can't get stream value without * association */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_GET_ADDR_LEN: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); error = EINVAL; #ifdef INET if (av->assoc_value == AF_INET) { av->assoc_value = sizeof(struct sockaddr_in); error = 0; } #endif #ifdef INET6 if (av->assoc_value == AF_INET6) { av->assoc_value = sizeof(struct sockaddr_in6); error = 0; } #endif if (error) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_GET_ASSOC_NUMBER: { uint32_t *value, cnt; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); cnt = 0; SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { cnt++; } SCTP_INP_RUNLOCK(inp); *value = cnt; *optsize = sizeof(uint32_t); break; } case SCTP_GET_ASSOC_ID_LIST: { struct sctp_assoc_ids *ids; unsigned int at, limit; SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize); at = 0; limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t); SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { if (at < limit) { ids->gaids_assoc_id[at++] = sctp_get_associd(stcb); } else { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } SCTP_INP_RUNLOCK(inp); if (error == 0) { ids->gaids_number_of_ids = at; *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); } break; } case SCTP_CONTEXT: { 
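/*
 * Like most getters in this switch: if SCTP_FIND_STCB() found an
 * association, return the value stored on the stcb and unlock it;
 * otherwise fall back to the endpoint default, which is only legal
 * for one-to-one style sockets or an SCTP_FUTURE_ASSOC request.
 */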
struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.context; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_context; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_VRF_ID: { uint32_t *default_vrfid; SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize); *default_vrfid = inp->def_vrf_id; *optsize = sizeof(uint32_t); break; } case SCTP_GET_ASOC_VRF: { struct sctp_assoc_value *id; SCTP_CHECK_AND_CAST(id, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, id->assoc_id); if (stcb == NULL) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { id->assoc_value = stcb->asoc.vrf_id; *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_GET_VRF_IDS: { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; } case SCTP_GET_NONCE_VALUES: { struct sctp_get_nonce_values *gnv; SCTP_CHECK_AND_CAST(gnv, optval, struct sctp_get_nonce_values, *optsize); SCTP_FIND_STCB(inp, stcb, gnv->gn_assoc_id); if (stcb) { gnv->gn_peers_tag = stcb->asoc.peer_vtag; gnv->gn_local_tag = stcb->asoc.my_vtag; SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_get_nonce_values); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } break; } case SCTP_DELAYED_SACK: { struct sctp_sack_info *sack; SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, *optsize); SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id); if (stcb) { sack->sack_delay = stcb->asoc.delayed_ack; sack->sack_freq = stcb->asoc.sack_freq; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sack->sack_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); sack->sack_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); sack->sack_freq = inp->sctp_ep.sctp_sack_freq; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_sack_info); } break; } case SCTP_GET_SNDBUF_USE: { struct sctp_sockstat *ss; SCTP_CHECK_AND_CAST(ss, optval, struct sctp_sockstat, *optsize); SCTP_FIND_STCB(inp, stcb, ss->ss_assoc_id); if (stcb) { ss->ss_total_sndbuf = stcb->asoc.total_output_queue_size; ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue + stcb->asoc.size_on_all_streams); SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_sockstat); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } break; } case SCTP_MAX_BURST: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.max_burst; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->sctp_ep.max_burst; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, 
SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_MAXSEG: { struct sctp_assoc_value *av; int ovh; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MED_OVERHEAD; } else { ovh = SCTP_MED_V4_OVERHEAD; } if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) av->assoc_value = 0; else av->assoc_value = inp->sctp_frag_point - ovh; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_GET_STAT_LOG: error = sctp_fill_stat_log(optval, optsize); break; case SCTP_EVENTS: { struct sctp_event_subscribe *events; SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize); memset(events, 0, sizeof(struct sctp_event_subscribe)); SCTP_INP_RLOCK(inp); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) events->sctp_data_io_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT)) events->sctp_association_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT)) events->sctp_address_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) events->sctp_send_failure_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR)) events->sctp_peer_error_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) events->sctp_shutdown_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT)) events->sctp_partial_delivery_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) events->sctp_adaptation_layer_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT)) events->sctp_authentication_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT)) events->sctp_sender_dry_event = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) events->sctp_stream_reset_event = 1; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(struct sctp_event_subscribe); break; } case SCTP_ADAPTATION_LAYER: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); *value = inp->sctp_ep.adaptation_layer_indicator; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); break; } case SCTP_SET_INITIAL_DBG_SEQ: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); *value = inp->sctp_ep.initial_sequence_debug; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); break; } case SCTP_GET_LOCAL_ADDR_SIZE: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); SCTP_INP_RLOCK(inp); *value = sctp_count_max_addresses(inp); SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); break; } case SCTP_GET_REMOTE_ADDR_SIZE: { uint32_t *value; size_t size; struct sctp_nets *net; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); /* FIXME MT: change to sctp_assoc_value? 
*/ SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t) * value); if (stcb) { size = 0; /* Count the sizes */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { size += sizeof(struct sockaddr_in6); } else { size += sizeof(struct sockaddr_in); } #else size += sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: size += sizeof(struct sockaddr_in6); break; #endif default: break; } } SCTP_TCB_UNLOCK(stcb); *value = (uint32_t) size; *optsize = sizeof(uint32_t); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } break; } case SCTP_GET_PEER_ADDRESSES: /* * Get the address information; an array is passed in to * fill up, and we pack it with the addresses. */ { size_t cpsz, left; struct sockaddr_storage *sas; struct sctp_nets *net; struct sctp_getaddresses *saddr; SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize); SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id); if (stcb) { left = (*optsize) - sizeof(struct sctp_getaddresses); *optsize = sizeof(struct sctp_getaddresses); sas = (struct sockaddr_storage *)&saddr->addr[0]; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { cpsz = sizeof(struct sockaddr_in6); } else { cpsz = sizeof(struct sockaddr_in); } #else cpsz = sizeof(struct sockaddr_in); #endif break; #endif #ifdef INET6 case AF_INET6: cpsz = sizeof(struct sockaddr_in6); break; #endif default: cpsz = 0; break; } if (cpsz == 0) { break; } if (left < cpsz) { /* not enough room. */ break; } #if defined(INET) && defined(INET6) if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) && (net->ro._l_addr.sa.sa_family == AF_INET)) { /* Must map the address */ in6_sin_2_v4mapsin6(&net->ro._l_addr.sin, (struct sockaddr_in6 *)sas); } else { memcpy(sas, &net->ro._l_addr, cpsz); } #else memcpy(sas, &net->ro._l_addr, cpsz); #endif ((struct sockaddr_in *)sas)->sin_port = stcb->rport; sas = (struct sockaddr_storage *)((caddr_t)sas + cpsz); left -= cpsz; *optsize += cpsz; } SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } break; } case SCTP_GET_LOCAL_ADDRESSES: { size_t limit, actual; struct sockaddr_storage *sas; struct sctp_getaddresses *saddr; SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize); SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id); sas = (struct sockaddr_storage *)&saddr->addr[0]; limit = *optsize - sizeof(sctp_assoc_t); actual = sctp_fill_up_addresses(inp, stcb, limit, sas); if (stcb) { SCTP_TCB_UNLOCK(stcb); } *optsize = sizeof(struct sockaddr_storage) + actual; break; } case SCTP_PEER_ADDR_PARAMS: { struct sctp_paddrparams *paddrp; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize); SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); #if defined(INET) && defined(INET6) if (paddrp->spp_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&paddrp->spp_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&paddrp->spp_address; } } else { addr = (struct sockaddr *)&paddrp->spp_address; } #else
addr = (struct sockaddr *)&paddrp->spp_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB, aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { /* Applies to the specific association */ paddrp->spp_flags = 0; if (net != NULL) { paddrp->spp_hbinterval = net->heart_beat_delay; paddrp->spp_pathmaxrxt = net->failure_threshold; paddrp->spp_pathmtu = net->mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: paddrp->spp_pathmtu -= SCTP_MIN_OVERHEAD; break; #endif default: break; } /* get flags for HB */ if (net->dest_state & SCTP_ADDR_NOHB) { paddrp->spp_flags |= SPP_HB_DISABLE; } else { paddrp->spp_flags |= SPP_HB_ENABLE; } /* get flags for PMTU */ if (net->dest_state & SCTP_ADDR_NO_PMTUD) { paddrp->spp_flags |= SPP_PMTUD_DISABLE; } else { paddrp->spp_flags |= SPP_PMTUD_ENABLE; } if (net->dscp & 0x01) { paddrp->spp_dscp = net->dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; } #ifdef INET6 if ((net->ro._l_addr.sa.sa_family == AF_INET6) && (net->flowlabel & 0x80000000)) { paddrp->spp_ipv6_flowlabel = net->flowlabel & 0x000fffff; paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; } #endif } else { /* * No destination so return default * value */ paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure; paddrp->spp_pathmtu = 0; if (stcb->asoc.default_dscp & 0x01) { paddrp->spp_dscp = stcb->asoc.default_dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; } #ifdef INET6 if (stcb->asoc.default_flowlabel & 0x80000000) { paddrp->spp_ipv6_flowlabel = stcb->asoc.default_flowlabel & 0x000fffff; paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; } #endif /* default settings should be these */ if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { paddrp->spp_flags |= SPP_HB_DISABLE; } else { paddrp->spp_flags |= SPP_HB_ENABLE; } if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { paddrp->spp_flags |= SPP_PMTUD_DISABLE; } else { paddrp->spp_flags |= SPP_PMTUD_ENABLE; } paddrp->spp_hbinterval = stcb->asoc.heart_beat_delay; } paddrp->spp_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC)) { /* Use endpoint defaults */ SCTP_INP_RLOCK(inp); paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); paddrp->spp_assoc_id =
SCTP_FUTURE_ASSOC; /* get inp's default */ if (inp->sctp_ep.default_dscp & 0x01) { paddrp->spp_dscp = inp->sctp_ep.default_dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; } #ifdef INET6 if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (inp->sctp_ep.default_flowlabel & 0x80000000)) { paddrp->spp_ipv6_flowlabel = inp->sctp_ep.default_flowlabel & 0x000fffff; paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; } #endif /* can't return this */ paddrp->spp_pathmtu = 0; if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { paddrp->spp_flags |= SPP_HB_ENABLE; } else { paddrp->spp_flags |= SPP_HB_DISABLE; } if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { paddrp->spp_flags |= SPP_PMTUD_ENABLE; } else { paddrp->spp_flags |= SPP_PMTUD_DISABLE; } SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_paddrparams); } break; } case SCTP_GET_PEER_ADDR_INFO: { struct sctp_paddrinfo *paddri; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize); SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id); #if defined(INET) && defined(INET6) if (paddri->spinfo_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&paddri->spinfo_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&paddri->spinfo_address; } } else { addr = (struct sockaddr *)&paddri->spinfo_address; } #else addr = (struct sockaddr *)&paddri->spinfo_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB, aka NULL.
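 * If no stcb is returned, the lookup has left our reference in
 * place, which is why the SCTP_INP_DECR_REF() just below undoes it.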
*/ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net != NULL)) { if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* It's unconfirmed */ paddri->spinfo_state = SCTP_UNCONFIRMED; } else if (net->dest_state & SCTP_ADDR_REACHABLE) { /* It's active */ paddri->spinfo_state = SCTP_ACTIVE; } else { /* It's inactive */ paddri->spinfo_state = SCTP_INACTIVE; } paddri->spinfo_cwnd = net->cwnd; paddri->spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; paddri->spinfo_rto = net->RTO; paddri->spinfo_assoc_id = sctp_get_associd(stcb); paddri->spinfo_mtu = net->mtu; switch (addr->sa_family) { #if defined(INET) case AF_INET: paddri->spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; break; #endif #if defined(INET6) case AF_INET6: paddri->spinfo_mtu -= SCTP_MIN_OVERHEAD; break; #endif default: break; } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_paddrinfo); } else { if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } break; } case SCTP_PCB_STATUS: { struct sctp_pcbinfo *spcb; SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize); sctp_fill_pcbinfo(spcb); *optsize = sizeof(struct sctp_pcbinfo); break; } case SCTP_STATUS: { struct sctp_nets *net; struct sctp_status *sstat; SCTP_CHECK_AND_CAST(sstat, optval, struct sctp_status, *optsize); SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } sstat->sstat_state = sctp_map_assoc_state(stcb->asoc.state); sstat->sstat_assoc_id = sctp_get_associd(stcb); sstat->sstat_rwnd = stcb->asoc.peers_rwnd; sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt; /* * We can't include chunks that have been passed to * the socket layer. Only things in queue. */ sstat->sstat_penddata = (stcb->asoc.cnt_on_reasm_queue + stcb->asoc.cnt_on_all_streams); sstat->sstat_instrms = stcb->asoc.streamincnt; sstat->sstat_outstrms = stcb->asoc.streamoutcnt; sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb, &stcb->asoc); memcpy(&sstat->sstat_primary.spinfo_address, &stcb->asoc.primary_destination->ro._l_addr, ((struct sockaddr *)(&stcb->asoc.primary_destination->ro._l_addr))->sa_len); net = stcb->asoc.primary_destination; ((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport; /* * Again the user can get info from sctp_constants.h * for what the state of the network is. 
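 * (the code below reports SCTP_UNCONFIRMED, SCTP_ACTIVE or
 * SCTP_INACTIVE)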
*/ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* It's unconfirmed */ sstat->sstat_primary.spinfo_state = SCTP_UNCONFIRMED; } else if (net->dest_state & SCTP_ADDR_REACHABLE) { /* It's active */ sstat->sstat_primary.spinfo_state = SCTP_ACTIVE; } else { /* It's inactive */ sstat->sstat_primary.spinfo_state = SCTP_INACTIVE; } sstat->sstat_primary.spinfo_cwnd = net->cwnd; sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; sstat->sstat_primary.spinfo_rto = net->RTO; sstat->sstat_primary.spinfo_mtu = net->mtu; switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) { #if defined(INET) case AF_INET: sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; break; #endif #if defined(INET6) case AF_INET6: sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD; break; #endif default: break; } sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_status); break; } case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, *optsize); SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id); if (stcb) { srto->srto_initial = stcb->asoc.initial_rto; srto->srto_max = stcb->asoc.maxrto; srto->srto_min = stcb->asoc.minrto; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (srto->srto_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); srto->srto_initial = inp->sctp_ep.initial_rto; srto->srto_max = inp->sctp_ep.sctp_maxrto; srto->srto_min = inp->sctp_ep.sctp_minrto; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_rtoinfo); } break; } case SCTP_TIMEOUTS: { struct sctp_timeouts *stimo; SCTP_CHECK_AND_CAST(stimo, optval, struct sctp_timeouts, *optsize); SCTP_FIND_STCB(inp, stcb, stimo->stimo_assoc_id); if (stcb) { stimo->stimo_init = stcb->asoc.timoinit; stimo->stimo_data = stcb->asoc.timodata; stimo->stimo_sack = stcb->asoc.timosack; stimo->stimo_shutdown = stcb->asoc.timoshutdown; stimo->stimo_heartbeat = stcb->asoc.timoheartbeat; stimo->stimo_cookie = stcb->asoc.timocookie; stimo->stimo_shutdownack = stcb->asoc.timoshutdownack; SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_timeouts); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize); SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); if (stcb) { sasoc->sasoc_cookie_life = TICKS_TO_MSEC(stcb->asoc.cookie_life); sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times; sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; sasoc->sasoc_peer_rwnd = stcb->asoc.peers_rwnd; sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; sasoc->sasoc_number_peer_destinations = 0; sasoc->sasoc_peer_rwnd = 0; sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assocparams); } break; } 
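/*
 * Illustrative sketch (not part of this file): the getters in this
 * switch are driven from userland via getsockopt(2).  For example,
 * assuming a connected one-to-one style SCTP socket "fd", the
 * SCTP_STATUS case above would be exercised like this:
 *
 *	struct sctp_status status;
 *	socklen_t len = sizeof(status);
 *
 *	memset(&status, 0, sizeof(status));
 *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_STATUS,
 *	    &status, &len) == 0)
 *		printf("state %d, streams in/out %u/%u\n",
 *		    (int)status.sstat_state,
 *		    (unsigned)status.sstat_instrms,
 *		    (unsigned)status.sstat_outstrms);
 */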
case SCTP_DEFAULT_SEND_PARAM: { struct sctp_sndrcvinfo *s_info; SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, *optsize); SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); if (stcb) { memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send)); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_sndrcvinfo); } break; } case SCTP_INITMSG: { struct sctp_initmsg *sinit; SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, *optsize); SCTP_INP_RLOCK(inp); sinit->sinit_num_ostreams = inp->sctp_ep.pre_open_stream_count; sinit->sinit_max_instreams = inp->sctp_ep.max_open_streams_intome; sinit->sinit_max_attempts = inp->sctp_ep.max_init_times; sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(struct sctp_initmsg); break; } case SCTP_PRIMARY_ADDR: /* we allow a "get" operation on this */ { struct sctp_setprim *ssp; SCTP_CHECK_AND_CAST(ssp, optval, struct sctp_setprim, *optsize); SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id); if (stcb) { union sctp_sockstore *addr; addr = &stcb->asoc.primary_destination->ro._l_addr; switch (addr->sa.sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { in6_sin_2_v4mapsin6(&addr->sin, (struct sockaddr_in6 *)&ssp->ssp_addr); } else { memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in)); } #else memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in)); #endif break; #endif #ifdef INET6 case AF_INET6: memcpy(&ssp->ssp_addr, &addr->sin6, sizeof(struct sockaddr_in6)); break; #endif default: break; } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_setprim); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_HMAC_IDENT: { struct sctp_hmacalgo *shmac; sctp_hmaclist_t *hmaclist; uint32_t size; int i; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, *optsize); SCTP_INP_RLOCK(inp); hmaclist = inp->sctp_ep.local_hmacs; if (hmaclist == NULL) { /* no HMACs to return */ *optsize = sizeof(*shmac); SCTP_INP_RUNLOCK(inp); break; } /* is there room for all of the hmac ids? 
*/ size = sizeof(*shmac) + (hmaclist->num_algo * sizeof(shmac->shmac_idents[0])); if ((size_t)(*optsize) < size) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_INP_RUNLOCK(inp); break; } /* copy in the list */ shmac->shmac_number_of_idents = hmaclist->num_algo; for (i = 0; i < hmaclist->num_algo; i++) { shmac->shmac_idents[i] = hmaclist->hmac[i]; } SCTP_INP_RUNLOCK(inp); *optsize = size; break; } case SCTP_AUTH_ACTIVE_KEY: { struct sctp_authkeyid *scact; SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, *optsize); SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); if (stcb) { /* get the active key on the assoc */ scact->scact_keynumber = stcb->asoc.authinfo.active_keyid; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (scact->scact_assoc_id == SCTP_FUTURE_ASSOC)) { /* get the endpoint active key */ SCTP_INP_RLOCK(inp); scact->scact_keynumber = inp->sctp_ep.default_keyid; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_authkeyid); } break; } case SCTP_LOCAL_AUTH_CHUNKS: { struct sctp_authchunks *sac; sctp_auth_chklist_t *chklist = NULL; size_t size = 0; SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize); SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id); if (stcb) { /* get off the assoc */ chklist = stcb->asoc.local_auth_chunks; /* is there enough space? */ size = sctp_auth_get_chklist_size(chklist); if (*optsize < (sizeof(struct sctp_authchunks) + size)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); sac->gauth_number_of_chunks = (uint32_t) size; *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sac->gauth_assoc_id == SCTP_FUTURE_ASSOC)) { /* get off the endpoint */ SCTP_INP_RLOCK(inp); chklist = inp->sctp_ep.local_auth_chunks; /* is there enough space? */ size = sctp_auth_get_chklist_size(chklist); if (*optsize < (sizeof(struct sctp_authchunks) + size)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); sac->gauth_number_of_chunks = (uint32_t) size; *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_PEER_AUTH_CHUNKS: { struct sctp_authchunks *sac; sctp_auth_chklist_t *chklist = NULL; size_t size = 0; SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize); SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id); if (stcb) { /* get off the assoc */ chklist = stcb->asoc.peer_auth_chunks; /* is there enough space? 
*/ size = sctp_auth_get_chklist_size(chklist); if (*optsize < (sizeof(struct sctp_authchunks) + size)) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); sac->gauth_number_of_chunks = (uint32_t) size; *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } break; } case SCTP_EVENT: { struct sctp_event *event; uint32_t event_type; SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, *optsize); SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); switch (event->se_type) { case SCTP_ASSOC_CHANGE: event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; break; case SCTP_PEER_ADDR_CHANGE: event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; break; case SCTP_REMOTE_ERROR: event_type = SCTP_PCB_FLAGS_RECVPEERERR; break; case SCTP_SEND_FAILED: event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; break; case SCTP_SHUTDOWN_EVENT: event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; break; case SCTP_ADAPTATION_INDICATION: event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; break; case SCTP_PARTIAL_DELIVERY_EVENT: event_type = SCTP_PCB_FLAGS_PDAPIEVNT; break; case SCTP_AUTHENTICATION_EVENT: event_type = SCTP_PCB_FLAGS_AUTHEVNT; break; case SCTP_STREAM_RESET_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; break; case SCTP_SENDER_DRY_EVENT: event_type = SCTP_PCB_FLAGS_DRYEVNT; break; case SCTP_NOTIFICATIONS_STOPPED_EVENT: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); error = ENOTSUP; break; case SCTP_ASSOC_RESET_EVENT: event_type = SCTP_PCB_FLAGS_ASSOC_RESETEVNT; break; case SCTP_STREAM_CHANGE_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_CHANGEEVNT; break; case SCTP_SEND_FAILED_EVENT: event_type = SCTP_PCB_FLAGS_RECVNSENDFAILEVNT; break; default: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (event_type > 0) { if (stcb) { event->se_on = sctp_stcb_is_feature_on(inp, stcb, event_type); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (event->se_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); event->se_on = sctp_is_feature_on(inp, event_type); SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } } if (error == 0) { *optsize = sizeof(struct sctp_event); } break; } case SCTP_RECVRCVINFO: { int onoff; if (*optsize < sizeof(int)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_RLOCK(inp); onoff = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); SCTP_INP_RUNLOCK(inp); } if (error == 0) { /* return the option value */ *(int *)optval = onoff; *optsize = sizeof(int); } break; } case SCTP_RECVNXTINFO: { int onoff; if (*optsize < sizeof(int)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_RLOCK(inp); onoff = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO); SCTP_INP_RUNLOCK(inp); } if (error == 0) { /* return the option value */ *(int *)optval = onoff; *optsize = sizeof(int); } break; } case SCTP_DEFAULT_SNDINFO: { struct sctp_sndinfo *info; SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, *optsize); SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); if (stcb) { info->snd_sid = stcb->asoc.def_send.sinfo_stream; info->snd_flags = 
stcb->asoc.def_send.sinfo_flags; info->snd_flags &= 0xfff0; info->snd_ppid = stcb->asoc.def_send.sinfo_ppid; info->snd_context = stcb->asoc.def_send.sinfo_context; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->snd_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); info->snd_sid = inp->def_send.sinfo_stream; info->snd_flags = inp->def_send.sinfo_flags; info->snd_flags &= 0xfff0; info->snd_ppid = inp->def_send.sinfo_ppid; info->snd_context = inp->def_send.sinfo_context; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_sndinfo); } break; } case SCTP_DEFAULT_PRINFO: { struct sctp_default_prinfo *info; SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, *optsize); SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); if (stcb) { info->pr_policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); info->pr_value = stcb->asoc.def_send.sinfo_timetolive; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->pr_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); info->pr_policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags); info->pr_value = inp->def_send.sinfo_timetolive; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_default_prinfo); } break; } case SCTP_PEER_ADDR_THLDS: { struct sctp_paddrthlds *thlds; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, *optsize); SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id); #if defined(INET) && defined(INET6) if (thlds->spt_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&thlds->spt_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&thlds->spt_address; } } else { addr = (struct sockaddr *)&thlds->spt_address; } #else addr = (struct sockaddr *)&thlds->spt_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB, aka NULL.
*/ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net != NULL) { thlds->spt_pathmaxrxt = net->failure_threshold; thlds->spt_pathpfthld = net->pf_threshold; } else { thlds->spt_pathmaxrxt = stcb->asoc.def_net_failure; thlds->spt_pathpfthld = stcb->asoc.def_net_pf_threshold; } thlds->spt_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (thlds->spt_assoc_id == SCTP_FUTURE_ASSOC)) { /* Use endpoint defaults */ SCTP_INP_RLOCK(inp); thlds->spt_pathmaxrxt = inp->sctp_ep.def_net_failure; thlds->spt_pathpfthld = inp->sctp_ep.def_net_pf_threshold; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_paddrthlds); } break; } case SCTP_REMOTE_UDP_ENCAPS_PORT: { struct sctp_udpencaps *encaps; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, *optsize); SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id); #if defined(INET) && defined(INET6) if (encaps->sue_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&encaps->sue_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&encaps->sue_address; } } else { addr = (struct sockaddr *)&encaps->sue_address; } #else addr = (struct sockaddr *)&encaps->sue_address; #endif if (stcb) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB, aka NULL.
*/ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net) { encaps->sue_port = net->port; } else { encaps->sue_port = stcb->asoc.port; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (encaps->sue_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); encaps->sue_port = inp->sctp_ep.port; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_udpencaps); } break; } case SCTP_ECN_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.ecn_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->ecn_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PR_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.prsctp_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->prsctp_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_AUTH_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.auth_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->auth_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_ASCONF_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { 
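/* report whether ASCONF (dynamic address reconfiguration, RFC 5061) support was negotiated with the peer */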
av->assoc_value = stcb->asoc.asconf_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->asconf_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_RECONFIG_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.reconfig_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->reconfig_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_NRSACK_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.nrsack_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->nrsack_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PKTDROP_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.pktdrop_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->pktdrop_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_ENABLE_STREAM_RESET: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = (uint32_t) stcb->asoc.local_strreset_support; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = (uint32_t) inp->local_strreset_support; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } case SCTP_PR_STREAM_STATUS: { struct sctp_prstatus *sprstat; uint16_t sid; uint16_t policy; SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize); SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id); sid = sprstat->sprstat_sid; policy = sprstat->sprstat_policy; #if defined(SCTP_DETAILED_STR_STATS) if ((stcb != NULL) && (sid < stcb->asoc.streamoutcnt) && (policy != SCTP_PR_SCTP_NONE) && ((policy <= 
SCTP_PR_SCTP_MAX) || (policy == SCTP_PR_SCTP_ALL))) { if (policy == SCTP_PR_SCTP_ALL) { sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0]; sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0]; } else { sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[policy]; sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[policy]; } #else if ((stcb != NULL) && (sid < stcb->asoc.streamoutcnt) && (policy == SCTP_PR_SCTP_ALL)) { sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0]; sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0]; #endif SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_prstatus); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_PR_ASSOC_STATUS: { struct sctp_prstatus *sprstat; uint16_t policy; SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize); SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id); policy = sprstat->sprstat_policy; if ((stcb != NULL) && (policy != SCTP_PR_SCTP_NONE) && ((policy <= SCTP_PR_SCTP_MAX) || (policy == SCTP_PR_SCTP_ALL))) { if (policy == SCTP_PR_SCTP_ALL) { sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[0]; sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[0]; } else { sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[policy]; sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[policy]; } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_prstatus); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_MAX_CWND: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { av->assoc_value = stcb->asoc.max_cwnd; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); av->assoc_value = inp->max_cwnd; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } if (error == 0) { *optsize = sizeof(struct sctp_assoc_value); } break; } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; break; } /* end switch (sopt->sopt_name) */ if (error) { *optsize = 0; } return (error); } static int sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, void *p) { int error, set_opt; uint32_t *mopt; struct sctp_tcb *stcb = NULL; struct sctp_inpcb *inp = NULL; uint32_t vrf_id; if (optval == NULL) { SCTP_PRINTF("optval is NULL\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_PRINTF("inp is NULL?\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } vrf_id = inp->def_vrf_id; error = 0; switch (optname) { case SCTP_NODELAY: case SCTP_AUTOCLOSE: case SCTP_AUTO_ASCONF: case SCTP_EXPLICIT_EOR: case SCTP_DISABLE_FRAGMENTS: case SCTP_USE_EXT_RCVINFO: case SCTP_I_WANT_MAPPED_V4_ADDR: /* copy in the option value */ SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); set_opt = 0; if (error) break; switch (optname) { case SCTP_DISABLE_FRAGMENTS: set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT; break; case SCTP_AUTO_ASCONF: /* * NOTE: we don't really support 
this flag */ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* only valid for bound-all sockets */ if ((SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) && (*mopt != 0)) { /* forbidden by admin */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPERM); return (EPERM); } set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } break; case SCTP_EXPLICIT_EOR: set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR; break; case SCTP_USE_EXT_RCVINFO: set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO; break; case SCTP_I_WANT_MAPPED_V4_ADDR: if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } break; case SCTP_NODELAY: set_opt = SCTP_PCB_FLAGS_NODELAY; break; case SCTP_AUTOCLOSE: if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } set_opt = SCTP_PCB_FLAGS_AUTOCLOSE; /* * The value is given in seconds and stored internally in * ticks. Note this does not affect old associations, only * new ones. */ inp->sctp_ep.auto_close_time = SEC_TO_TICKS(*mopt); break; } SCTP_INP_WLOCK(inp); if (*mopt != 0) { sctp_feature_on(inp, set_opt); } else { sctp_feature_off(inp, set_opt); } SCTP_INP_WUNLOCK(inp); break; case SCTP_REUSE_PORT: { SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { /* Can't set it after we are bound */ error = EINVAL; break; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { /* Can't do this for a 1-to-many socket */ error = EINVAL; break; } if (*mopt != 0) sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); else sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE); break; } case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize); if (*value > SCTP_SB_LIMIT_RCV(so)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } inp->partial_delivery_point = *value; break; } case SCTP_FRAGMENT_INTERLEAVE: /* not yet until we re-write sctp_recvmsg() */ { uint32_t *level; SCTP_CHECK_AND_CAST(level, optval, uint32_t, optsize); if (*level == SCTP_FRAG_LEVEL_2) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (*level == SCTP_FRAG_LEVEL_1) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (*level == SCTP_FRAG_LEVEL_0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_CMT_ON_OFF: if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if (av->assoc_value > SCTP_CMT_MAX) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.sctp_cmt_on_off = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_cmt_on_off = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id
== SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.sctp_cmt_on_off = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } break; case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; struct sctp_nets *net; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if ((av->assoc_value != SCTP_CC_RFC2581) && (av->assoc_value != SCTP_CC_HSTCP) && (av->assoc_value != SCTP_CC_HTCP) && (av->assoc_value != SCTP_CC_RTCC)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; stcb->asoc.congestion_control_module = av->assoc_value; if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); } } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_cc_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; stcb->asoc.congestion_control_module = av->assoc_value; if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); } } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, optsize); SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id); if (stcb == NULL) { if (cc_opt->aid_value.assoc_id == SCTP_CURRENT_ASSOC) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (stcb->asoc.cc_functions.sctp_cwnd_socket_option) { (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { error = EINVAL; } } else { if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; } else { error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt); } SCTP_TCB_UNLOCK(stcb); } break; } case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if ((av->assoc_value != SCTP_SS_DEFAULT) && (av->assoc_value != SCTP_SS_ROUND_ROBIN) && (av->assoc_value != SCTP_SS_ROUND_ROBIN_PACKET) && (av->assoc_value != SCTP_SS_PRIORITY) && (av->assoc_value != SCTP_SS_FAIR_BANDWITH) && (av->assoc_value != SCTP_SS_FIRST_COME)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; stcb->asoc.stream_scheduling_module = av->assoc_value; stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || 
(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_ss_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; stcb->asoc.stream_scheduling_module = av->assoc_value; stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_SS_VALUE: { struct sctp_stream_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_stream_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { if ((av->stream_id >= stcb->asoc.streamoutcnt) || (stcb->asoc.ss_functions.sctp_ss_set_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id], av->stream_value) < 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if (av->assoc_id == SCTP_CURRENT_ASSOC) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (av->stream_id < stcb->asoc.streamoutcnt) { stcb->asoc.ss_functions.sctp_ss_set_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id], av->stream_value); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { /* * Can't set stream value without * association */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_CLR_STAT_LOG: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; case SCTP_CONTEXT: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.context = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_context = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.context = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_VRF_ID: { uint32_t *default_vrfid; SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, optsize); if (*default_vrfid > SCTP_MAX_VRF_ID) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } inp->def_vrf_id = *default_vrfid; break; } case SCTP_DEL_VRF_ID: { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; } case SCTP_ADD_VRF_ID: { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; break; } case SCTP_DELAYED_SACK: { struct sctp_sack_info *sack; SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, optsize); SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id); if (sack->sack_delay) { if (sack->sack_delay > SCTP_MAX_SACK_DELAY) sack->sack_delay = SCTP_MAX_SACK_DELAY; if (MSEC_TO_TICKS(sack->sack_delay) < 1) { sack->sack_delay = TICKS_TO_MSEC(1); } } if (stcb) { if (sack->sack_delay) { stcb->asoc.delayed_ack = 
sack->sack_delay; } if (sack->sack_freq) { stcb->asoc.sack_freq = sack->sack_freq; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sack->sack_assoc_id == SCTP_FUTURE_ASSOC) || (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sack->sack_delay) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); } if (sack->sack_freq) { inp->sctp_ep.sctp_sack_freq = sack->sack_freq; } SCTP_INP_WUNLOCK(inp); } if ((sack->sack_assoc_id == SCTP_CURRENT_ASSOC) || (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (sack->sack_delay) { stcb->asoc.delayed_ack = sack->sack_delay; } if (sack->sack_freq) { stcb->asoc.sack_freq = sack->sack_freq; } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_AUTH_CHUNK: { struct sctp_authchunk *sauth; SCTP_CHECK_AND_CAST(sauth, optval, struct sctp_authchunk, optsize); SCTP_INP_WLOCK(inp); if (sctp_auth_add_chunk(sauth->sauth_chunk, inp->sctp_ep.local_auth_chunks)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); break; } case SCTP_AUTH_KEY: { struct sctp_authkey *sca; struct sctp_keyhead *shared_keys; sctp_sharedkey_t *shared_key; sctp_key_t *key = NULL; size_t size; SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize); if (sca->sca_keylength == 0) { size = optsize - sizeof(struct sctp_authkey); } else { if (sca->sca_keylength + sizeof(struct sctp_authkey) <= optsize) { size = sca->sca_keylength; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } } SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id); if (stcb) { shared_keys = &stcb->asoc.shared_keys; /* clear the cached keys for this key id */ sctp_clear_cachedkeys(stcb, sca->sca_keynumber); /* * create the new shared key and * insert/replace it */ if (size > 0) { key = sctp_set_key(sca->sca_key, (uint32_t) size); if (key == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_TCB_UNLOCK(stcb); break; } } shared_key = sctp_alloc_sharedkey(); if (shared_key == NULL) { sctp_free_key(key); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_TCB_UNLOCK(stcb); break; } shared_key->key = key; shared_key->keyid = sca->sca_keynumber; error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sca->sca_assoc_id == SCTP_FUTURE_ASSOC) || (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); shared_keys = &inp->sctp_ep.shared_keys; /* * clear the cached keys on all * assocs for this key id */ sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); /* * create the new shared key and * insert/replace it */ if (size > 0) { key = sctp_set_key(sca->sca_key, (uint32_t) size); if (key == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_INP_WUNLOCK(inp); break; } } shared_key = sctp_alloc_sharedkey(); if (shared_key == NULL) { sctp_free_key(key); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_INP_WUNLOCK(inp); break; } shared_key->key = key; shared_key->keyid = sca->sca_keynumber; error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_INP_WUNLOCK(inp); } if ((sca->sca_assoc_id == 
SCTP_CURRENT_ASSOC) || (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); shared_keys = &stcb->asoc.shared_keys; /* * clear the cached keys for * this key id */ sctp_clear_cachedkeys(stcb, sca->sca_keynumber); /* * create the new shared key * and insert/replace it */ if (size > 0) { key = sctp_set_key(sca->sca_key, (uint32_t) size); if (key == NULL) { SCTP_TCB_UNLOCK(stcb); continue; } } shared_key = sctp_alloc_sharedkey(); if (shared_key == NULL) { sctp_free_key(key); SCTP_TCB_UNLOCK(stcb); continue; } shared_key->key = key; shared_key->keyid = sca->sca_keynumber; error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_HMAC_IDENT: { struct sctp_hmacalgo *shmac; sctp_hmaclist_t *hmaclist; uint16_t hmacid; uint32_t i; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize); if ((optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) || (shmac->shmac_number_of_idents > 0xffff)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } hmaclist = sctp_alloc_hmaclist((uint16_t) shmac->shmac_number_of_idents); if (hmaclist == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; break; } for (i = 0; i < shmac->shmac_number_of_idents; i++) { hmacid = shmac->shmac_idents[i]; if (sctp_auth_add_hmacid(hmaclist, hmacid)) { /* invalid HMACs were found */ ; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; sctp_free_hmaclist(hmaclist); goto sctp_set_hmac_done; } } for (i = 0; i < hmaclist->num_algo; i++) { if (hmaclist->hmac[i] == SCTP_AUTH_HMAC_ID_SHA1) { /* already in list */ break; } } if (i == hmaclist->num_algo) { /* not found in list */ sctp_free_hmaclist(hmaclist); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } /* set it on the endpoint */ SCTP_INP_WLOCK(inp); if (inp->sctp_ep.local_hmacs) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); inp->sctp_ep.local_hmacs = hmaclist; SCTP_INP_WUNLOCK(inp); sctp_set_hmac_done: break; } case SCTP_AUTH_ACTIVE_KEY: { struct sctp_authkeyid *scact; SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); /* set the active key on the right place */ if (stcb) { /* set the active key on the assoc */ if (sctp_auth_setactivekey(stcb, scact->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (scact->scact_assoc_id == SCTP_FUTURE_ASSOC) || (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sctp_auth_setactivekey_ep(inp, scact->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } if ((scact->scact_assoc_id == SCTP_CURRENT_ASSOC) || (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_auth_setactivekey(stcb, scact->scact_keynumber); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_AUTH_DELETE_KEY: { struct sctp_authkeyid *scdel; SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id); /* delete the 
key from the right place */ if (stcb) { if (sctp_delete_sharedkey(stcb, scdel->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (scdel->scact_assoc_id == SCTP_FUTURE_ASSOC) || (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sctp_delete_sharedkey_ep(inp, scdel->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } if ((scdel->scact_assoc_id == SCTP_CURRENT_ASSOC) || (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_delete_sharedkey(stcb, scdel->scact_keynumber); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_AUTH_DEACTIVATE_KEY: { struct sctp_authkeyid *keyid; SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id); /* deactivate the key from the right place */ if (stcb) { if (sctp_deact_sharedkey(stcb, keyid->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (keyid->scact_assoc_id == SCTP_FUTURE_ASSOC) || (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (sctp_deact_sharedkey_ep(inp, keyid->scact_keynumber)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } if ((keyid->scact_assoc_id == SCTP_CURRENT_ASSOC) || (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_deact_sharedkey(stcb, keyid->scact_keynumber); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_ENABLE_STREAM_RESET: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); if (av->assoc_value & (~SCTP_ENABLE_VALUE_MASK)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.local_strreset_support = (uint8_t) av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->local_strreset_support = (uint8_t) av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.local_strreset_support = (uint8_t) av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_RESET_STREAMS: { struct sctp_reset_streams *strrst; int i, send_out = 0; int send_in = 0; SCTP_CHECK_AND_CAST(strrst, optval, struct sctp_reset_streams, optsize); SCTP_FIND_STCB(inp, stcb, strrst->srs_assoc_id); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. 
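 * Stream reconfiguration rides on the RE-CONFIG chunk (RFC
 * 6525); a peer that never negotiated it could not answer the
 * request, so fail early. A sketch of the (hypothetical)
 * userland call this services, with fd and id illustrative:
 *
 *   struct sctp_reset_streams srs = { 0 };
 *   srs.srs_assoc_id = id;
 *   srs.srs_flags = SCTP_STREAM_RESET_INCOMING;
 *   (void)setsockopt(fd, IPPROTO_SCTP, SCTP_RESET_STREAMS,
 *       &srs, sizeof(srs));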
*/ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; SCTP_TCB_UNLOCK(stcb); break; } if (sizeof(struct sctp_reset_streams) + strrst->srs_number_streams * sizeof(uint16_t) > optsize) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); break; } if (strrst->srs_flags & SCTP_STREAM_RESET_INCOMING) { send_in = 1; if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; SCTP_TCB_UNLOCK(stcb); break; } } if (strrst->srs_flags & SCTP_STREAM_RESET_OUTGOING) { send_out = 1; } if ((strrst->srs_number_streams > SCTP_MAX_STREAMS_AT_ONCE_RESET) && send_in) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_TCB_UNLOCK(stcb); break; } if ((send_in == 0) && (send_out == 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); break; } for (i = 0; i < strrst->srs_number_streams; i++) { if ((send_in) && (strrst->srs_stream_list[i] > stcb->asoc.streamincnt)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if ((send_out) && (strrst->srs_stream_list[i] > stcb->asoc.streamoutcnt)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } } if (error) { SCTP_TCB_UNLOCK(stcb); break; } if (send_out) { int cnt; uint16_t strm; if (strrst->srs_number_streams) { for (i = 0, cnt = 0; i < strrst->srs_number_streams; i++) { strm = strrst->srs_stream_list[i]; if (stcb->asoc.strmout[strm].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[strm].state = SCTP_STREAM_RESET_PENDING; cnt++; } } } else { /* Its all */ for (i = 0, cnt = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; cnt++; } } } } if (send_in) { error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams, strrst->srs_stream_list, send_in, 0, 0, 0, 0, 0); } else { error = sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_LOCKED); } if (error == 0) { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); } else { /* * For outgoing streams don't report any * problems in sending the request to the * application. XXX: Double check resetting * incoming streams. */ error = 0; } SCTP_TCB_UNLOCK(stcb); break; } case SCTP_ADD_STREAMS: { struct sctp_add_streams *stradd; uint8_t addstream = 0; uint16_t add_o_strmcnt = 0; uint16_t add_i_strmcnt = 0; SCTP_CHECK_AND_CAST(stradd, optval, struct sctp_add_streams, optsize); SCTP_FIND_STCB(inp, stcb, stradd->sas_assoc_id); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. 
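 * Adding streams travels in the same RE-CONFIG chunk, so the
 * identical capability check applies here.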
*/ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; SCTP_TCB_UNLOCK(stcb); break; } if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; SCTP_TCB_UNLOCK(stcb); break; } if ((stradd->sas_outstrms == 0) && (stradd->sas_instrms == 0)) { error = EINVAL; goto skip_stuff; } if (stradd->sas_outstrms) { addstream = 1; /* We allocate here */ add_o_strmcnt = stradd->sas_outstrms; if ((((int)add_o_strmcnt) + ((int)stcb->asoc.streamoutcnt)) > 0x0000ffff) { /* You can't have more than 64k */ error = EINVAL; goto skip_stuff; } } if (stradd->sas_instrms) { int cnt; addstream |= 2; /* * We allocate inside * sctp_send_str_reset_req() */ add_i_strmcnt = stradd->sas_instrms; cnt = add_i_strmcnt; cnt += stcb->asoc.streamincnt; if (cnt > 0x0000ffff) { /* You can't have more than 64k */ error = EINVAL; goto skip_stuff; } if (cnt > (int)stcb->asoc.max_inbound_streams) { /* More than you are allowed */ error = EINVAL; goto skip_stuff; } } error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); skip_stuff: SCTP_TCB_UNLOCK(stcb); break; } case SCTP_RESET_ASSOC: { int i; uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize); SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t) * value); if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; SCTP_TCB_UNLOCK(stcb); break; } if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; SCTP_TCB_UNLOCK(stcb); break; } /* * Is there any data pending in the send or sent * queues? */ if (!TAILQ_EMPTY(&stcb->asoc.send_queue) || !TAILQ_EMPTY(&stcb->asoc.sent_queue)) { busy_out: error = EBUSY; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); SCTP_TCB_UNLOCK(stcb); break; } /* Do any streams have data queued? */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { goto busy_out; } } error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 1, 0, 0, 0, 0); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); break; } case SCTP_CONNECT_X: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } error = sctp_do_connect_x(so, inp, optval, optsize, p, 0); break; case SCTP_CONNECT_X_DELAYED: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } error = sctp_do_connect_x(so, inp, optval, optsize, p, 1); break; case SCTP_CONNECT_X_COMPLETE: { struct sockaddr *sa; /* FIXME MT: check correct? */ SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize); /* find tcb */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); } SCTP_INP_RUNLOCK(inp); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. 
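 * (the usual convention: take a reference up front, and drop
 * it ourselves if the lookup comes back empty)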
*/ SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if (stcb == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; break; } if (stcb->asoc.delayed_connection == 1) { stcb->asoc.delayed_connection = 0; (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); } else { /* * already expired or did not use delayed * connectx */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; } SCTP_TCB_UNLOCK(stcb); break; } case SCTP_MAX_BURST: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.max_burst = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.max_burst = av->assoc_value; SCTP_INP_WUNLOCK(inp); } if ((av->assoc_id == SCTP_CURRENT_ASSOC) || (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.max_burst = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_MAXSEG: { struct sctp_assoc_value *av; int ovh; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MED_OVERHEAD; } else { ovh = SCTP_MED_V4_OVERHEAD; } if (stcb) { if (av->assoc_value) { stcb->asoc.sctp_frag_point = (av->assoc_value + ovh); } else { stcb->asoc.sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); /* * FIXME MT: I think this is not in * tune with the API ID */ if (av->assoc_value) { inp->sctp_frag_point = (av->assoc_value + ovh); } else { inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_EVENTS: { struct sctp_event_subscribe *events; SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, optsize); SCTP_INP_WLOCK(inp); if (events->sctp_data_io_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT); } if (events->sctp_association_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT); } if (events->sctp_address_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPADDREVNT); } if (events->sctp_send_failure_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } if (events->sctp_peer_error_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPEERERR); } if (events->sctp_shutdown_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } if 
(events->sctp_partial_delivery_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_PDAPIEVNT); } if (events->sctp_adaptation_layer_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } if (events->sctp_authentication_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTHEVNT); } if (events->sctp_sender_dry_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT); } if (events->sctp_stream_reset_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } SCTP_INP_WUNLOCK(inp); SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (events->sctp_association_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT); } if (events->sctp_address_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT); } if (events->sctp_send_failure_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); } if (events->sctp_peer_error_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR); } if (events->sctp_shutdown_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); } if (events->sctp_partial_delivery_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT); } if (events->sctp_adaptation_layer_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT); } if (events->sctp_authentication_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT); } if (events->sctp_sender_dry_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT); } if (events->sctp_stream_reset_event) { sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } else { sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } SCTP_TCB_UNLOCK(stcb); } /* * Send up the sender dry event only for 1-to-1 * style sockets. 
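 * (a one-to-many socket fronts an arbitrary set of
 * associations, so there is no single send queue whose
 * emptiness could meaningfully be reported here)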
*/ if (events->sctp_sender_dry_event) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); } SCTP_TCB_UNLOCK(stcb); } } } SCTP_INP_RUNLOCK(inp); break; } case SCTP_ADAPTATION_LAYER: { struct sctp_setadaptation *adap_bits; SCTP_CHECK_AND_CAST(adap_bits, optval, struct sctp_setadaptation, optsize); SCTP_INP_WLOCK(inp); inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind; inp->sctp_ep.adaptation_layer_indicator_provided = 1; SCTP_INP_WUNLOCK(inp); break; } #ifdef SCTP_DEBUG case SCTP_SET_INITIAL_DBG_SEQ: { uint32_t *vvv; SCTP_CHECK_AND_CAST(vvv, optval, uint32_t, optsize); SCTP_INP_WLOCK(inp); inp->sctp_ep.initial_sequence_debug = *vvv; SCTP_INP_WUNLOCK(inp); break; } #endif case SCTP_DEFAULT_SEND_PARAM: { struct sctp_sndrcvinfo *s_info; SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, optsize); SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); if (stcb) { if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) || (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); SCTP_INP_WUNLOCK(inp); } if ((s_info->sinfo_assoc_id == SCTP_CURRENT_ASSOC) || (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_PEER_ADDR_PARAMS: { struct sctp_paddrparams *paddrp; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize); SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); #if defined(INET) && defined(INET6) if (paddrp->spp_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&paddrp->spp_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&paddrp->spp_address; } } else { addr = (struct sockaddr *)&paddrp->spp_address; } #else addr = (struct sockaddr *)&paddrp->spp_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() wil do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. 
*/ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } /* sanity checks */ if ((paddrp->spp_flags & SPP_HB_ENABLE) && (paddrp->spp_flags & SPP_HB_DISABLE)) { if (stcb) SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if ((paddrp->spp_flags & SPP_PMTUD_ENABLE) && (paddrp->spp_flags & SPP_PMTUD_DISABLE)) { if (stcb) SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } if (stcb != NULL) { /************************TCB SPECIFIC SET ******************/ if (net != NULL) { /************************NET SPECIFIC SET ******************/ if (paddrp->spp_flags & SPP_HB_DISABLE) { if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED) && !(net->dest_state & SCTP_ADDR_NOHB)) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9); } net->dest_state |= SCTP_ADDR_NOHB; } if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval) { net->heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { net->heart_beat_delay = 0; } sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); net->dest_state &= ~SCTP_ADDR_NOHB; } if (paddrp->spp_flags & SPP_HB_DEMAND) { /* on demand HB */ sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11); } net->dest_state |= SCTP_ADDR_NO_PMTUD; net->mtu = paddrp->spp_pathmtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } if (net->mtu < stcb->asoc.smallest_mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { if (!SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); } net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } if (paddrp->spp_pathmaxrxt) { if (net->dest_state & SCTP_ADDR_PF) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count <= paddrp->spp_pathmaxrxt) && (net->error_count > net->pf_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); 
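/* Bounce the heartbeat timer so the now potentially-failed (PF) path keeps being probed on schedule. */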
sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_12); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= paddrp->spp_pathmaxrxt) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } net->failure_threshold = paddrp->spp_pathmaxrxt; } if (paddrp->spp_flags & SPP_DSCP) { net->dscp = paddrp->spp_dscp & 0xfc; net->dscp |= 0x01; } #ifdef INET6 if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { if (net->ro._l_addr.sa.sa_family == AF_INET6) { net->flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; net->flowlabel |= 0x80000000; } } #endif } else { /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/ if (paddrp->spp_pathmaxrxt != 0) { stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->dest_state & SCTP_ADDR_PF) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count <= paddrp->spp_pathmaxrxt) && (net->error_count > net->pf_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_13); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= paddrp->spp_pathmaxrxt) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } net->failure_threshold = paddrp->spp_pathmaxrxt; } } if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval != 0) { stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { stcb->asoc.heart_beat_delay = 0; } /* Turn back on the timer */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (paddrp->spp_hbinterval != 0) { net->heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { net->heart_beat_delay = 0; } if (net->dest_state & SCTP_ADDR_NOHB) { net->dest_state &= ~SCTP_ADDR_NOHB; } sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_14); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); } sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } if (paddrp->spp_flags & SPP_HB_DISABLE) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (!(net->dest_state & SCTP_ADDR_NOHB)) { net->dest_state |= SCTP_ADDR_NOHB; if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_15); } } } sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16); } net->dest_state |= 
SCTP_ADDR_NO_PMTUD; net->mtu = paddrp->spp_pathmtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } if (net->mtu < stcb->asoc.smallest_mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (!SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); } net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_DSCP) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->dscp = paddrp->spp_dscp & 0xfc; net->dscp |= 0x01; } stcb->asoc.default_dscp = paddrp->spp_dscp & 0xfc; stcb->asoc.default_dscp |= 0x01; } #ifdef INET6 if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._l_addr.sa.sa_family == AF_INET6) { net->flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; net->flowlabel |= 0x80000000; } } stcb->asoc.default_flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; stcb->asoc.default_flowlabel |= 0x80000000; } #endif } SCTP_TCB_UNLOCK(stcb); } else { /************************NO TCB, SET TO default stuff ******************/ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); /* * For the TOS/FLOWLABEL stuff you * set it with the options on the * socket */ if (paddrp->spp_pathmaxrxt != 0) { inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; } if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; else if (paddrp->spp_hbinterval != 0) { if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); } if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; } else if (paddrp->spp_hbinterval) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); } sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } else if (paddrp->spp_flags & SPP_HB_DISABLE) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } else if (paddrp->spp_flags & SPP_PMTUD_DISABLE) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_DSCP) { inp->sctp_ep.default_dscp = paddrp->spp_dscp & 0xfc; inp->sctp_ep.default_dscp |= 0x01; } #ifdef INET6 if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { inp->sctp_ep.default_flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff; inp->sctp_ep.default_flowlabel |= 0x80000000; } } #endif SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; uint32_t new_init, new_min, new_max; SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, optsize); SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id); if (stcb) { if (srto->srto_initial) new_init = srto->srto_initial; else new_init = 
stcb->asoc.initial_rto; if (srto->srto_max) new_max = srto->srto_max; else new_max = stcb->asoc.maxrto; if (srto->srto_min) new_min = srto->srto_min; else new_min = stcb->asoc.minrto; if ((new_min <= new_init) && (new_init <= new_max)) { stcb->asoc.initial_rto = new_init; stcb->asoc.maxrto = new_max; stcb->asoc.minrto = new_min; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (srto->srto_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (srto->srto_initial) new_init = srto->srto_initial; else new_init = inp->sctp_ep.initial_rto; if (srto->srto_max) new_max = srto->srto_max; else new_max = inp->sctp_ep.sctp_maxrto; if (srto->srto_min) new_min = srto->srto_min; else new_min = inp->sctp_ep.sctp_minrto; if ((new_min <= new_init) && (new_init <= new_max)) { inp->sctp_ep.initial_rto = new_init; inp->sctp_ep.sctp_maxrto = new_max; inp->sctp_ep.sctp_minrto = new_min; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, optsize); SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); if (sasoc->sasoc_cookie_life) { /* boundary check the cookie life */ if (sasoc->sasoc_cookie_life < 1000) sasoc->sasoc_cookie_life = 1000; if (sasoc->sasoc_cookie_life > SCTP_MAX_COOKIE_LIFE) { sasoc->sasoc_cookie_life = SCTP_MAX_COOKIE_LIFE; } } if (stcb) { if (sasoc->sasoc_asocmaxrxt) stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt; if (sasoc->sasoc_cookie_life) { stcb->asoc.cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (sasoc->sasoc_asocmaxrxt) inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; if (sasoc->sasoc_cookie_life) { inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_INITMSG: { struct sctp_initmsg *sinit; SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, optsize); SCTP_INP_WLOCK(inp); if (sinit->sinit_num_ostreams) inp->sctp_ep.pre_open_stream_count = sinit->sinit_num_ostreams; if (sinit->sinit_max_instreams) inp->sctp_ep.max_open_streams_intome = sinit->sinit_max_instreams; if (sinit->sinit_max_attempts) inp->sctp_ep.max_init_times = sinit->sinit_max_attempts; if (sinit->sinit_max_init_timeo) inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo; SCTP_INP_WUNLOCK(inp); break; } case SCTP_PRIMARY_ADDR: { struct sctp_setprim *spa; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize); SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id); #if defined(INET) && defined(INET6) if (spa->ssp_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&spa->ssp_addr; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr 
*)&spa->ssp_addr; } } else { addr = (struct sockaddr *)&spa->ssp_addr; } #else addr = (struct sockaddr *)&spa->ssp_addr; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net != NULL)) { if (net != stcb->asoc.primary_destination) { if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { /* Ok we need to set it */ if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { if ((stcb->asoc.alternate) && (!(net->dest_state & SCTP_ADDR_PF)) && (net->dest_state & SCTP_ADDR_REACHABLE)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } break; } case SCTP_SET_DYNAMIC_PRIMARY: { union sctp_sockstore *ss; error = priv_check(curthread, PRIV_NETINET_RESERVEDPORT); if (error) break; SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize); /* SUPER USER CHECK? */ error = sctp_dynamic_set_primary(&ss->sa, vrf_id); break; } case SCTP_SET_PEER_PRIMARY_ADDR: { struct sctp_setpeerprim *sspp; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize); SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id); if (stcb != NULL) { struct sctp_ifa *ifa; #if defined(INET) && defined(INET6) if (sspp->sspp_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&sspp->sspp_addr; } } else { addr = (struct sockaddr *)&sspp->sspp_addr; } #else addr = (struct sockaddr *)&sspp->sspp_addr; #endif ifa = sctp_find_ifa_by_addr(addr, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); if (ifa == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* * Must validate the ifa found is in * our ep */ struct sctp_laddr *laddr; int found = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa == ifa) { found = 1; break; } } if (!found) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } } else { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL;
goto out_of_it; } break; } #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_of_it; } } if (sctp_set_primary_ip_address_sa(stcb, addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } out_of_it: SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } break; } case SCTP_BINDX_ADD_ADDR: { struct sctp_getaddresses *addrs; struct thread *td; td = (struct thread *)p; SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, optsize); #ifdef INET if (addrs->addr->sa_family == AF_INET) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addrs->addr->sa_family == AF_INET6) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr), (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; break; } sctp_bindx_add_address(so, inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error, p); break; } case SCTP_BINDX_REM_ADDR: { struct sctp_getaddresses *addrs; struct thread *td; td = (struct thread *)p; SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, optsize); #ifdef INET if (addrs->addr->sa_family == AF_INET) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif #ifdef INET6 if (addrs->addr->sa_family == AF_INET6) { if (optsize < sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr), (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } else #endif { error = EAFNOSUPPORT; break; } sctp_bindx_delete_address(inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error); break; } case SCTP_EVENT: { struct sctp_event *event; uint32_t event_type; SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, optsize); SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); switch (event->se_type) { case SCTP_ASSOC_CHANGE: event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; break; case SCTP_PEER_ADDR_CHANGE: event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; break; case SCTP_REMOTE_ERROR: event_type = SCTP_PCB_FLAGS_RECVPEERERR; break; case SCTP_SEND_FAILED: event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; break; case SCTP_SHUTDOWN_EVENT: event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; break; case 
SCTP_ADAPTATION_INDICATION: event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; break; case SCTP_PARTIAL_DELIVERY_EVENT: event_type = SCTP_PCB_FLAGS_PDAPIEVNT; break; case SCTP_AUTHENTICATION_EVENT: event_type = SCTP_PCB_FLAGS_AUTHEVNT; break; case SCTP_STREAM_RESET_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; break; case SCTP_SENDER_DRY_EVENT: event_type = SCTP_PCB_FLAGS_DRYEVNT; break; case SCTP_NOTIFICATIONS_STOPPED_EVENT: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); error = ENOTSUP; break; case SCTP_ASSOC_RESET_EVENT: event_type = SCTP_PCB_FLAGS_ASSOC_RESETEVNT; break; case SCTP_STREAM_CHANGE_EVENT: event_type = SCTP_PCB_FLAGS_STREAM_CHANGEEVNT; break; case SCTP_SEND_FAILED_EVENT: event_type = SCTP_PCB_FLAGS_RECVNSENDFAILEVNT; break; default: event_type = 0; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (event_type > 0) { if (stcb) { if (event->se_on) { sctp_stcb_feature_on(inp, stcb, event_type); if (event_type == SCTP_PCB_FLAGS_DRYEVNT) { if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); } } } else { sctp_stcb_feature_off(inp, stcb, event_type); } SCTP_TCB_UNLOCK(stcb); } else { /* * We don't want to send up a storm * of events, so return an error for * sender dry events */ if ((event_type == SCTP_PCB_FLAGS_DRYEVNT) && ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) == 0) && ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) && ((event->se_assoc_id == SCTP_ALL_ASSOC) || (event->se_assoc_id == SCTP_CURRENT_ASSOC))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); error = ENOTSUP; break; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (event->se_assoc_id == SCTP_FUTURE_ASSOC) || (event->se_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); if (event->se_on) { sctp_feature_on(inp, event_type); } else { sctp_feature_off(inp, event_type); } SCTP_INP_WUNLOCK(inp); } if ((event->se_assoc_id == SCTP_CURRENT_ASSOC) || (event->se_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (event->se_on) { sctp_stcb_feature_on(inp, stcb, event_type); } else { sctp_stcb_feature_off(inp, stcb, event_type); } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } } break; } case SCTP_RECVRCVINFO: { int *onoff; SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); SCTP_INP_WLOCK(inp); if (*onoff != 0) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO); } SCTP_INP_WUNLOCK(inp); break; } case SCTP_RECVNXTINFO: { int *onoff; SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); SCTP_INP_WLOCK(inp); if (*onoff != 0) { sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO); } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO); } SCTP_INP_WUNLOCK(inp); break; } case SCTP_DEFAULT_SNDINFO: { struct sctp_sndinfo *info; uint16_t policy; SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, optsize); SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); if (stcb) { if (info->snd_sid < stcb->asoc.streamoutcnt) { stcb->asoc.def_send.sinfo_stream = info->snd_sid; policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); stcb->asoc.def_send.sinfo_flags = info->snd_flags; stcb->asoc.def_send.sinfo_flags |= policy; stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; stcb->asoc.def_send.sinfo_context = 
info->snd_context; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->snd_assoc_id == SCTP_FUTURE_ASSOC) || (info->snd_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->def_send.sinfo_stream = info->snd_sid; policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags); inp->def_send.sinfo_flags = info->snd_flags; inp->def_send.sinfo_flags |= policy; inp->def_send.sinfo_ppid = info->snd_ppid; inp->def_send.sinfo_context = info->snd_context; SCTP_INP_WUNLOCK(inp); } if ((info->snd_assoc_id == SCTP_CURRENT_ASSOC) || (info->snd_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); if (info->snd_sid < stcb->asoc.streamoutcnt) { stcb->asoc.def_send.sinfo_stream = info->snd_sid; policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); stcb->asoc.def_send.sinfo_flags = info->snd_flags; stcb->asoc.def_send.sinfo_flags |= policy; stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; stcb->asoc.def_send.sinfo_context = info->snd_context; } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_DEFAULT_PRINFO: { struct sctp_default_prinfo *info; SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, optsize); SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); if (info->pr_policy > SCTP_PR_SCTP_MAX) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if (stcb) { stcb->asoc.def_send.sinfo_flags &= 0xfff0; stcb->asoc.def_send.sinfo_flags |= info->pr_policy; stcb->asoc.def_send.sinfo_timetolive = info->pr_value; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (info->pr_assoc_id == SCTP_FUTURE_ASSOC) || (info->pr_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->def_send.sinfo_flags &= 0xfff0; inp->def_send.sinfo_flags |= info->pr_policy; inp->def_send.sinfo_timetolive = info->pr_value; SCTP_INP_WUNLOCK(inp); } if ((info->pr_assoc_id == SCTP_CURRENT_ASSOC) || (info->pr_assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); stcb->asoc.def_send.sinfo_flags &= 0xfff0; stcb->asoc.def_send.sinfo_flags |= info->pr_policy; stcb->asoc.def_send.sinfo_timetolive = info->pr_value; SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } } break; } case SCTP_PEER_ADDR_THLDS: /* Applies to the specific association */ { struct sctp_paddrthlds *thlds; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, optsize); SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id); #if defined(INET) && defined(INET6) if (thlds->spt_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&thlds->spt_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&thlds->spt_address; } } else { addr = (struct sockaddr *)&thlds->spt_address; } #else addr = (struct sockaddr *)&thlds->spt_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB..
aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net != NULL) { net->failure_threshold = thlds->spt_pathmaxrxt; net->pf_threshold = thlds->spt_pathpfthld; if (net->dest_state & SCTP_ADDR_PF) { if ((net->error_count > net->failure_threshold) || (net->error_count <= net->pf_threshold)) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count > net->pf_threshold) && (net->error_count <= net->failure_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_17); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > net->failure_threshold) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= net->failure_threshold) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } } else { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->failure_threshold = thlds->spt_pathmaxrxt; net->pf_threshold = thlds->spt_pathpfthld; if (net->dest_state & SCTP_ADDR_PF) { if ((net->error_count > net->failure_threshold) || (net->error_count <= net->pf_threshold)) { net->dest_state &= ~SCTP_ADDR_PF; } } else { if ((net->error_count > net->pf_threshold) && (net->error_count <= net->failure_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_18); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { if (net->error_count > net->failure_threshold) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { if (net->error_count <= net->failure_threshold) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } } stcb->asoc.def_net_failure = thlds->spt_pathmaxrxt; stcb->asoc.def_net_pf_threshold = thlds->spt_pathpfthld; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (thlds->spt_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.def_net_failure = thlds->spt_pathmaxrxt; inp->sctp_ep.def_net_pf_threshold = thlds->spt_pathpfthld; SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } 
break; } case SCTP_REMOTE_UDP_ENCAPS_PORT: { struct sctp_udpencaps *encaps; struct sctp_nets *net; struct sockaddr *addr; #if defined(INET) && defined(INET6) struct sockaddr_in sin_store; #endif SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, optsize); SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id); #if defined(INET) && defined(INET6) if (encaps->sue_address.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&encaps->sue_address; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin_store, sin6); addr = (struct sockaddr *)&sin_store; } else { addr = (struct sockaddr *)&encaps->sue_address; } } else { addr = (struct sockaddr *)&encaps->sue_address; } #else addr = (struct sockaddr *)&encaps->sue_address; #endif if (stcb != NULL) { net = sctp_findnet(stcb, addr); } else { /* * We increment here since * sctp_findassociation_ep_addr() will do a * decrement if it finds the stcb as long as * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ net = NULL; SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } if ((stcb != NULL) && (net == NULL)) { #ifdef INET if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; break; } } else #endif { error = EAFNOSUPPORT; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; } } if (stcb != NULL) { if (net != NULL) { net->port = encaps->sue_port; } else { stcb->asoc.port = encaps->sue_port; } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (encaps->sue_assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.port = encaps->sue_port; SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_ECN_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->ecn_supported = 0; } else { inp->ecn_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_PR_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->prsctp_supported =
0; } else { inp->prsctp_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_AUTH_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { if ((av->assoc_value == 0) && (inp->asconf_supported == 1)) { /* * AUTH is required for * ASCONF */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->auth_supported = 0; } else { inp->auth_supported = 1; } SCTP_INP_WUNLOCK(inp); } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_ASCONF_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { if ((av->assoc_value != 0) && (inp->auth_supported == 0)) { /* * AUTH is required for * ASCONF */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->asconf_supported = 0; sctp_auth_delete_chunk(SCTP_ASCONF, inp->sctp_ep.local_auth_chunks); sctp_auth_delete_chunk(SCTP_ASCONF_ACK, inp->sctp_ep.local_auth_chunks); } else { inp->asconf_supported = 1; sctp_auth_add_chunk(SCTP_ASCONF, inp->sctp_ep.local_auth_chunks); sctp_auth_add_chunk(SCTP_ASCONF_ACK, inp->sctp_ep.local_auth_chunks); } SCTP_INP_WUNLOCK(inp); } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_RECONFIG_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->reconfig_supported = 0; } else { inp->reconfig_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_NRSACK_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->nrsack_supported = 0; } else { inp->nrsack_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } 
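/*
 * Note: the *_SUPPORTED options handled in this run of cases all follow
 * the same pattern. They are rejected with EINVAL once an stcb exists,
 * because the corresponding features are negotiated with the peer when
 * the association is set up; they can only be changed on the endpoint
 * beforehand (for one-to-many sockets the id must be SCTP_FUTURE_ASSOC).
 * A minimal userland sketch of setting one of them follows; the
 * descriptor "fd" and the error handling are illustrative only:
 *
 *	struct sctp_assoc_value av;
 *
 *	memset(&av, 0, sizeof(av));
 *	av.assoc_id = SCTP_FUTURE_ASSOC;
 *	av.assoc_value = 1;
 *	if (setsockopt(fd, IPPROTO_SCTP, SCTP_NRSACK_SUPPORTED,
 *	    &av, sizeof(av)) < 0)
 *		err(1, "setsockopt");
 */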
case SCTP_PKTDROP_SUPPORTED: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); if (av->assoc_value == 0) { inp->pktdrop_supported = 0; } else { inp->pktdrop_supported = 1; } SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } case SCTP_MAX_CWND: { struct sctp_assoc_value *av; struct sctp_nets *net; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { stcb->asoc.max_cwnd = av->assoc_value; if (stcb->asoc.max_cwnd > 0) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if ((net->cwnd > stcb->asoc.max_cwnd) && (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) { net->cwnd = stcb->asoc.max_cwnd; if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) { net->cwnd = net->mtu - sizeof(struct sctphdr); } } } } SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_WLOCK(inp); inp->max_cwnd = av->assoc_value; SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } } break; } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; break; } /* end switch (opt) */ return (error); } int sctp_ctloutput(struct socket *so, struct sockopt *sopt) { void *optval = NULL; size_t optsize = 0; void *p; int error = 0; struct sctp_inpcb *inp; if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } SCTP_INP_WLOCK(inp); inp->fibnum = so->so_fibnum; SCTP_INP_WUNLOCK(inp); return (0); } if (sopt->sopt_level != IPPROTO_SCTP) { /* wrong proto level...
send back up to IP */ #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) error = ip6_ctloutput(so, sopt); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET error = ip_ctloutput(so, sopt); #endif return (error); } optsize = sopt->sopt_valsize; if (optsize) { SCTP_MALLOC(optval, void *, optsize, SCTP_M_SOCKOPT); if (optval == NULL) { SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS); return (ENOBUFS); } error = sooptcopyin(sopt, optval, optsize, optsize); if (error) { SCTP_FREE(optval, SCTP_M_SOCKOPT); goto out; } } p = (void *)sopt->sopt_td; if (sopt->sopt_dir == SOPT_SET) { error = sctp_setopt(so, sopt->sopt_name, optval, optsize, p); } else if (sopt->sopt_dir == SOPT_GET) { error = sctp_getopt(so, sopt->sopt_name, optval, &optsize, p); } else { SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } if ((error == 0) && (optval != NULL)) { error = sooptcopyout(sopt, optval, optsize); SCTP_FREE(optval, SCTP_M_SOCKOPT); } else if (optval != NULL) { SCTP_FREE(optval, SCTP_M_SOCKOPT); } out: return (error); } #ifdef INET static int sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) { int error = 0; int create_lock_on = 0; uint32_t vrf_id; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { /* I made the same as TCP since we are not setup? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } if (addr == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return EINVAL; } switch (addr->sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6p; if (addr->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } sin6p = (struct sockaddr_in6 *)addr; if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6p->sin6_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sinp; if (addr->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } sinp = (struct sockaddr_in *)addr; if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sinp->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } break; } #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT); return (EAFNOSUPPORT); } SCTP_INP_INCR_REF(inp); SCTP_ASOC_CREATE_LOCK(inp); create_lock_on = 1; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { /* Should I really unlock? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT); error = EFAULT; goto out_now; } #ifdef INET6 if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && (addr->sa_family == AF_INET6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_now; } #endif if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == SCTP_PCB_FLAGS_UNBOUND) { /* Bind an ephemeral port */ error = sctp_inpcb_bind(so, NULL, NULL, p); if (error) { goto out_now; } } /* Now do we connect?
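 * That is, verify that nothing is already connected or being set up on
 * this socket before allocating a new association below.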
*/ if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; goto out_now; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); error = EADDRINUSE; goto out_now; } if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); SCTP_INP_RUNLOCK(inp); } else { /* * We increment here since sctp_findassociation_ep_addr() * will do a decrement if it finds the stcb as long as the * locked tcb (last argument) is NOT a TCB.. aka NULL. */ SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } else { SCTP_TCB_UNLOCK(stcb); } } if (stcb != NULL) { /* Already have or am bringing up an association */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; goto out_now; } vrf_id = inp->def_vrf_id; /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, p); if (stcb == NULL) { /* Gak! no memory */ goto out_now; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* Set the connected flag so we can queue data */ soisconnecting(so); } SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); /* initialize authentication parameters for the assoc */ sctp_initialize_auth_params(inp, stcb); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); out_now: if (create_lock_on) { SCTP_ASOC_CREATE_UNLOCK(inp); } SCTP_INP_DECR_REF(inp); return (error); } #endif int sctp_listen(struct socket *so, int backlog, struct thread *p) { /* * Note this module depends on the protocol processing being called * AFTER any socket level flags and backlog are applied to the * socket. The traditional way that the socket flags are applied is * AFTER protocol processing. We have made a change to the * sys/kern/uipc_socket.c module to reverse this but this MUST be in * place if the socket API for SCTP is to work properly. */ int error = 0; struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { /* I made the same as TCP since we are not setup? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* See if we have a listener */ struct sctp_inpcb *tinp; union sctp_sockstore store; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* not bound all */ struct sctp_laddr *laddr; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { memcpy(&store, &laddr->ifa->address, sizeof(store)); switch (store.sa.sa_family) { #ifdef INET case AF_INET: store.sin.sin_port = inp->sctp_lport; break; #endif #ifdef INET6 case AF_INET6: store.sin6.sin6_port = inp->sctp_lport; break; #endif default: break; } tinp = sctp_pcb_findep(&store.sa, 0, 0, inp->def_vrf_id); if (tinp && (tinp != inp) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (tinp->sctp_socket->so_qlimit)) { /* * we have a listener already and * it's not this inp.
*/ SCTP_INP_DECR_REF(tinp); return (EADDRINUSE); } else if (tinp) { SCTP_INP_DECR_REF(tinp); } } } else { /* Set up a local addr bound all */ memset(&store, 0, sizeof(store)); #ifdef INET6 if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { store.sa.sa_family = AF_INET6; store.sa.sa_len = sizeof(struct sockaddr_in6); } #endif #ifdef INET if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { store.sa.sa_family = AF_INET; store.sa.sa_len = sizeof(struct sockaddr_in); } #endif switch (store.sa.sa_family) { #ifdef INET case AF_INET: store.sin.sin_port = inp->sctp_lport; break; #endif #ifdef INET6 case AF_INET6: store.sin6.sin6_port = inp->sctp_lport; break; #endif default: break; } tinp = sctp_pcb_findep(&store.sa, 0, 0, inp->def_vrf_id); if (tinp && (tinp != inp) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && (tinp->sctp_socket->so_qlimit)) { /* * we have a listener already and it's not * this inp. */ SCTP_INP_DECR_REF(tinp); return (EADDRINUSE); } else if (tinp) { SCTP_INP_DECR_REF(tinp); } } } SCTP_INP_RLOCK(inp); #ifdef SCTP_LOCK_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) { sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK); } #endif SOCK_LOCK(so); error = solisten_proto_check(so); SOCK_UNLOCK(so); if (error) { SCTP_INP_RUNLOCK(inp); return (error); } if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* * The unlucky case - We are in the tcp pool with this guy. * - Someone else is in the main inp slot. - We must move * this guy (the listener) to the main slot - We must then * move the guy that was listener to the TCP Pool. */ if (sctp_swap_inpcb_for_listen(inp)) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } SCTP_INP_RUNLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* We must do a bind. */ if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) { /* bind error, probably perm */ return (error); } } SOCK_LOCK(so); /* It appears for 7.0 and on, we must always call this.
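 * solisten_proto() marks the socket as accepting connections; the
 * SCTP-specific handling below then clears SO_ACCEPTCONN again for
 * one-to-many sockets and when the backlog is 0 (listen being turned
 * off).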
*/ solisten_proto(so, backlog); if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { /* remove the ACCEPTCONN flag for one-to-many sockets */ so->so_options &= ~SO_ACCEPTCONN; } if (backlog == 0) { /* turning off listen */ so->so_options &= ~SO_ACCEPTCONN; } SOCK_UNLOCK(so); return (error); } static int sctp_defered_wakeup_cnt = 0; int sctp_accept(struct socket *so, struct sockaddr **addr) { struct sctp_tcb *stcb; struct sctp_inpcb *inp; union sctp_sockstore store; #ifdef INET6 int error; #endif inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } if (so->so_state & SS_ISDISCONNECTED) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ECONNABORTED); return (ECONNABORTED); } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(inp); store = stcb->asoc.primary_destination->ro._l_addr; stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; SCTP_TCB_UNLOCK(stcb); switch (store.sa.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); if (sin == NULL) return (ENOMEM); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = store.sin.sin_port; sin->sin_addr = store.sin.sin_addr; *addr = (struct sockaddr *)sin; break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6); if (sin6 == NULL) return (ENOMEM); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = store.sin6.sin6_port; sin6->sin6_addr = store.sin6.sin6_addr; if ((error = sa6_recoverscope(sin6)) != 0) { SCTP_FREE_SONAME(sin6); return (error); } *addr = (struct sockaddr *)sin6; break; } #endif default: /* TSNH */ break; } /* Wake any delayed sleep action */ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { SCTP_INP_WLOCK(inp); inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE; if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; SCTP_INP_WUNLOCK(inp); SOCKBUF_LOCK(&inp->sctp_socket->so_snd); if (sowriteable(inp->sctp_socket)) { sowwakeup_locked(inp->sctp_socket); } else { SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd); } SCTP_INP_WLOCK(inp); } if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; SCTP_INP_WUNLOCK(inp); SOCKBUF_LOCK(&inp->sctp_socket->so_rcv); if (soreadable(inp->sctp_socket)) { sctp_defered_wakeup_cnt++; sorwakeup_locked(inp->sctp_socket); } else { SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv); } SCTP_INP_WLOCK(inp); } SCTP_INP_WUNLOCK(inp); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_LOCK(stcb); sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19); } return (0); } #ifdef INET int sctp_ingetaddr(struct socket *so, struct sockaddr **addr) { struct sockaddr_in *sin; uint32_t vrf_id; struct sctp_inpcb *inp; struct sctp_ifa *sctp_ifa; /* * Do the malloc first in case it blocks. 
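 * (SCTP_MALLOC_SONAME may sleep, so it is done before the INP lock is
 * taken below; sleeping with that lock held would not be safe.)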
*/ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); if (sin == NULL) return (ENOMEM); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); inp = (struct sctp_inpcb *)so->so_pcb; if (!inp) { SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_INP_RLOCK(inp); sin->sin_port = inp->sctp_lport; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { struct sctp_tcb *stcb; struct sockaddr_in *sin_a; struct sctp_nets *net; int fnd; stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { goto notConn; } fnd = 0; sin_a = NULL; SCTP_TCB_LOCK(stcb); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sin_a = (struct sockaddr_in *)&net->ro._l_addr; if (sin_a == NULL) /* this will make coverity happy */ continue; if (sin_a->sin_family == AF_INET) { fnd = 1; break; } } if ((!fnd) || (sin_a == NULL)) { /* punt */ SCTP_TCB_UNLOCK(stcb); goto notConn; } vrf_id = inp->def_vrf_id; sctp_ifa = sctp_source_address_selection(inp, stcb, (sctp_route_t *) & net->ro, net, 0, vrf_id); if (sctp_ifa) { sin->sin_addr = sctp_ifa->address.sin.sin_addr; sctp_free_ifa(sctp_ifa); } SCTP_TCB_UNLOCK(stcb); } else { /* For the bound all case you get back 0 */ notConn: sin->sin_addr.s_addr = 0; } } else { /* Take the first IPv4 address in the list */ struct sctp_laddr *laddr; int fnd = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->address.sa.sa_family == AF_INET) { struct sockaddr_in *sin_a; sin_a = &laddr->ifa->address.sin; sin->sin_addr = sin_a->sin_addr; fnd = 1; break; } } if (!fnd) { SCTP_FREE_SONAME(sin); SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); return (ENOENT); } } SCTP_INP_RUNLOCK(inp); (*addr) = (struct sockaddr *)sin; return (0); } int sctp_peeraddr(struct socket *so, struct sockaddr **addr) { struct sockaddr_in *sin; int fnd; struct sockaddr_in *sin_a; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; /* Do the malloc first in case it blocks. 
*/ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); if (sin == NULL) return (ENOMEM); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); inp = (struct sctp_inpcb *)so->so_pcb; if ((inp == NULL) || ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { /* UDP type and listeners will drop out here */ SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); return (ENOTCONN); } SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); } SCTP_INP_RUNLOCK(inp); if (stcb == NULL) { SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } fnd = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sin_a = (struct sockaddr_in *)&net->ro._l_addr; if (sin_a->sin_family == AF_INET) { fnd = 1; sin->sin_port = stcb->rport; sin->sin_addr = sin_a->sin_addr; break; } } SCTP_TCB_UNLOCK(stcb); if (!fnd) { /* No IPv4 address */ SCTP_FREE_SONAME(sin); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); return (ENOENT); } (*addr) = (struct sockaddr *)sin; return (0); } struct pr_usrreqs sctp_usrreqs = { .pru_abort = sctp_abort, .pru_accept = sctp_accept, .pru_attach = sctp_attach, .pru_bind = sctp_bind, .pru_connect = sctp_connect, .pru_control = in_control, .pru_close = sctp_close, .pru_detach = sctp_close, .pru_sopoll = sopoll_generic, .pru_flush = sctp_flush, .pru_disconnect = sctp_disconnect, .pru_listen = sctp_listen, .pru_peeraddr = sctp_peeraddr, .pru_send = sctp_sendm, .pru_shutdown = sctp_shutdown, .pru_sockaddr = sctp_ingetaddr, .pru_sosend = sctp_sosend, .pru_soreceive = sctp_soreceive }; #endif Index: projects/vnet/sys/netinet/sctp_var.h =================================================================== --- projects/vnet/sys/netinet/sctp_var.h (revision 294565) +++ projects/vnet/sys/netinet/sctp_var.h (revision 294566) @@ -1,356 +1,355 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_VAR_H_ #define _NETINET_SCTP_VAR_H_ #include #if defined(_KERNEL) || defined(__Userspace__) extern struct pr_usrreqs sctp_usrreqs; #define sctp_feature_on(inp, feature) (inp->sctp_features |= feature) #define sctp_feature_off(inp, feature) (inp->sctp_features &= ~feature) #define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature) #define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0) #define sctp_stcb_feature_on(inp, stcb, feature) {\ if (stcb) { \ stcb->asoc.sctp_features |= feature; \ } else if (inp) { \ inp->sctp_features |= feature; \ } \ } #define sctp_stcb_feature_off(inp, stcb, feature) {\ if (stcb) { \ stcb->asoc.sctp_features &= ~feature; \ } else if (inp) { \ inp->sctp_features &= ~feature; \ } \ } #define sctp_stcb_is_feature_on(inp, stcb, feature) \ (((stcb != NULL) && \ ((stcb->asoc.sctp_features & feature) == feature)) || \ ((stcb == NULL) && (inp != NULL) && \ ((inp->sctp_features & feature) == feature))) #define sctp_stcb_is_feature_off(inp, stcb, feature) \ (((stcb != NULL) && \ ((stcb->asoc.sctp_features & feature) == 0)) || \ ((stcb == NULL) && (inp != NULL) && \ ((inp->sctp_features & feature) == 0)) || \ ((stcb == NULL) && (inp == NULL))) /* managing mobility_feature in inpcb (by micchie) */ #define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature) #define sctp_mobility_feature_off(inp, feature) (inp->sctp_mobility_features &= ~feature) #define sctp_is_mobility_feature_on(inp, feature) (inp->sctp_mobility_features & feature) #define sctp_is_mobility_feature_off(inp, feature) ((inp->sctp_mobility_features & feature) == 0) #define sctp_maxspace(sb) (max((sb)->sb_hiwat,SCTP_MINIMAL_RWND)) #define sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_cc) ? (sctp_maxspace(sb) - (asoc)->sb_cc) : 0)) #define sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_cc) ? (sctp_maxspace(sb) - (sb)->sb_cc) : 0)) #define sctp_sbspace_sub(a,b) ((a > b) ? (a - b) : 0) /* * I tried to cache the readq entries at one point. But the reality * is that it did not add any performance since this meant we had to * lock the STCB on read. And at that point once you have to do an * extra lock, it really does not matter if the lock is in the ZONE * stuff or in our code. 
Note that this same problem would occur with * an mbuf cache as well so it is not really worth doing, at least * right now :-D */ #define sctp_free_a_readq(_stcb, _readq) { \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \ SCTP_DECR_READQ_COUNT(); \ } #define sctp_alloc_a_readq(_stcb, _readq) { \ (_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \ if ((_readq)) { \ SCTP_INCR_READQ_COUNT(); \ } \ } #define sctp_free_a_strmoq(_stcb, _strmoq, _so_locked) { \ if ((_strmoq)->holds_key_ref) { \ sctp_auth_key_release(stcb, sp->auth_keyid, _so_locked); \ (_strmoq)->holds_key_ref = 0; \ } \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), (_strmoq)); \ SCTP_DECR_STRMOQ_COUNT(); \ } #define sctp_alloc_a_strmoq(_stcb, _strmoq) { \ (_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \ if ((_strmoq)) { \ memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \ SCTP_INCR_STRMOQ_COUNT(); \ (_strmoq)->holds_key_ref = 0; \ } \ } #define sctp_free_a_chunk(_stcb, _chk, _so_locked) { \ if ((_chk)->holds_key_ref) {\ sctp_auth_key_release((_stcb), (_chk)->auth_keyid, _so_locked); \ (_chk)->holds_key_ref = 0; \ } \ if (_stcb) { \ SCTP_TCB_LOCK_ASSERT((_stcb)); \ if ((_chk)->whoTo) { \ sctp_free_remote_addr((_chk)->whoTo); \ (_chk)->whoTo = NULL; \ } \ if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \ (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ SCTP_DECR_CHK_COUNT(); \ } else { \ TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ (_stcb)->asoc.free_chunk_cnt++; \ atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ } \ } else { \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ SCTP_DECR_CHK_COUNT(); \ } \ } #define sctp_alloc_a_chunk(_stcb, _chk) { \ if (TAILQ_EMPTY(&(_stcb)->asoc.free_chunks)) { \ (_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \ if ((_chk)) { \ SCTP_INCR_CHK_COUNT(); \ (_chk)->whoTo = NULL; \ (_chk)->holds_key_ref = 0; \ } \ } else { \ (_chk) = TAILQ_FIRST(&(_stcb)->asoc.free_chunks); \ TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ (_chk)->holds_key_ref = 0; \ SCTP_STAT_INCR(sctps_cached_chk); \ (_stcb)->asoc.free_chunk_cnt--; \ } \ } #define sctp_free_remote_addr(__net) { \ if ((__net)) { \ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \ (void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \ (void)SCTP_OS_TIMER_STOP(&(__net)->pmtu_timer.timer); \ (void)SCTP_OS_TIMER_STOP(&(__net)->hb_timer.timer); \ if ((__net)->ro.ro_rt) { \ RTFREE((__net)->ro.ro_rt); \ (__net)->ro.ro_rt = NULL; \ } \ if ((__net)->src_addr_selected) { \ sctp_free_ifa((__net)->ro._s_addr); \ (__net)->ro._s_addr = NULL; \ } \ (__net)->src_addr_selected = 0; \ (__net)->dest_state &= ~SCTP_ADDR_REACHABLE; \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \ SCTP_DECR_RADDR_COUNT(); \ } \ } \ } #define sctp_sbfree(ctl, stcb, sb, m) { \ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_cc, SCTP_BUF_LEN((m))); \ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_mbcnt, MSIZE); \ if (((ctl)->do_not_ref_stcb == 0) && stcb) {\ SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \ SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ } \ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \ SCTP_BUF_TYPE(m) != MT_OOBDATA) \ 
atomic_subtract_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \ } #define sctp_sballoc(stcb, sb, m) { \ atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \ atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \ if (stcb) { \ atomic_add_int(&(stcb)->asoc.sb_cc,SCTP_BUF_LEN((m))); \ atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ } \ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \ SCTP_BUF_TYPE(m) != MT_OOBDATA) \ atomic_add_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \ } #define sctp_ucount_incr(val) { \ val++; \ } #define sctp_ucount_decr(val) { \ if (val > 0) { \ val--; \ } else { \ val = 0; \ } \ } #define sctp_mbuf_crush(data) do { \ struct mbuf *_m; \ _m = (data); \ while (_m && (SCTP_BUF_LEN(_m) == 0)) { \ (data) = SCTP_BUF_NEXT(_m); \ SCTP_BUF_NEXT(_m) = NULL; \ sctp_m_free(_m); \ _m = (data); \ } \ } while (0) #define sctp_flight_size_decrease(tp1) do { \ if (tp1->whoTo->flight_size >= tp1->book_size) \ tp1->whoTo->flight_size -= tp1->book_size; \ else \ tp1->whoTo->flight_size = 0; \ } while (0) #define sctp_flight_size_increase(tp1) do { \ (tp1)->whoTo->flight_size += (tp1)->book_size; \ } while (0) #ifdef SCTP_FS_SPEC_LOG #define sctp_total_flight_decrease(stcb, tp1) do { \ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ stcb->asoc.fs_index = 0;\ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \ stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \ stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \ stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \ stcb->asoc.fs_index++; \ tp1->window_probe = 0; \ if (stcb->asoc.total_flight >= tp1->book_size) { \ stcb->asoc.total_flight -= tp1->book_size; \ if (stcb->asoc.total_flight_count > 0) \ stcb->asoc.total_flight_count--; \ } else { \ stcb->asoc.total_flight = 0; \ stcb->asoc.total_flight_count = 0; \ } \ } while (0) #define sctp_total_flight_increase(stcb, tp1) do { \ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ stcb->asoc.fs_index = 0;\ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \ stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \ stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \ stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \ stcb->asoc.fs_index++; \ (stcb)->asoc.total_flight_count++; \ (stcb)->asoc.total_flight += (tp1)->book_size; \ } while (0) #else #define sctp_total_flight_decrease(stcb, tp1) do { \ tp1->window_probe = 0; \ if (stcb->asoc.total_flight >= tp1->book_size) { \ stcb->asoc.total_flight -= tp1->book_size; \ if (stcb->asoc.total_flight_count > 0) \ stcb->asoc.total_flight_count--; \ } else { \ stcb->asoc.total_flight = 0; \ stcb->asoc.total_flight_count = 0; \ } \ } while (0) #define sctp_total_flight_increase(stcb, tp1) do { \ (stcb)->asoc.total_flight_count++; \ (stcb)->asoc.total_flight += (tp1)->book_size; \ } while (0) #endif #define SCTP_PF_ENABLED(_net) (_net->pf_threshold < _net->failure_threshold) #define SCTP_NET_IS_PF(_net) (_net->pf_threshold < _net->error_count) struct sctp_nets; struct sctp_inpcb; struct sctp_tcb; struct sctphdr; void sctp_close(struct socket *so); int sctp_disconnect(struct socket *so); void sctp_ctlinput(int, struct sockaddr *, void *); int sctp_ctloutput(struct socket *, struct sockopt *); #ifdef INET void sctp_input_with_port(struct mbuf 
*, int, uint16_t); int sctp_input(struct mbuf **, int *, int); #endif void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t); void sctp_drain(void); void sctp_init(void); -void sctp_finish(void); int sctp_flush(struct socket *, int); int sctp_shutdown(struct socket *); void sctp_notify(struct sctp_inpcb *, struct ip *ip, struct sctphdr *, struct sockaddr *, struct sctp_tcb *, struct sctp_nets *); int sctp_bindx(struct socket *, int, struct sockaddr_storage *, int, int, struct proc *); /* can't use sctp_assoc_t here */ int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *); int sctp_ingetaddr(struct socket *, struct sockaddr **); int sctp_peeraddr(struct socket *, struct sockaddr **); int sctp_listen(struct socket *, int, struct thread *); int sctp_accept(struct socket *, struct sockaddr **); #endif /* _KERNEL */ #endif /* !_NETINET_SCTP_VAR_H_ */ Index: projects/vnet/sys/netinet/tcp_subr.c =================================================================== --- projects/vnet/sys/netinet/tcp_subr.c (revision 294565) +++ projects/vnet/sys/netinet/tcp_subr.c (revision 294566) @@ -1,2867 +1,2868 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #include #endif /*IPSEC*/ #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
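* For example, with the MSS forced down to 20 bytes, a single 1448-byte * payload (a typical Ethernet-MTU segment) would go out as 73 separate * packets, each paying the full header and per-packet processing cost.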
*/ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #ifdef TCP_SIGNATURE static int tcp_sig_checksigs = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW, &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic"); #endif VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); static void tcp_timer_discard(struct tcpcb *, uint32_t); static struct tcp_function_block tcp_def_funcblk = { "default", tcp_output, tcp_do_segment, tcp_default_ctloutput, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0 }; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); 
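/* If found, the block is returned with a reference held for the caller. */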
return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { linesz = snprintf(cp, bufsz, "%-32s%c %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; struct osd osd; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. 
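* tcp_zone_change() below reacts to runtime changes of maxsockets by * resizing the inpcb, tcpcb and timewait zones to the new limit.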
*/ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } int register_tcp_functions(struct tcp_function_block *blk, int wait) { struct tcp_function_block *lblk; struct tcp_function *n; struct tcp_function_set fs; if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timers_left || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. */ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timers_left == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { return (EINVAL); } } n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { return (ENOMEM); } n->tf_fb = blk; strcpy(fs.function_set_name, blk->tfb_tcp_block_name); rw_wlock(&tcp_function_lock); lblk = find_tcp_functions_locked(&fs); if (lblk) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); return (EALREADY); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_flags = 0; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); rw_wunlock(&tcp_function_lock); return(0); } int deregister_tcp_functions(struct tcp_function_block *blk) { struct tcp_function_block *lblk; struct tcp_function *f; int error=ENOENT; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (blk->tfb_refcnt) { /* Still tcb attached, mark it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; rw_wunlock(&tcp_function_lock); return (EBUSY); } lblk = find_tcp_fb_locked(blk, &f); if (lblk) { /* Found */ TAILQ_REMOVE(&t_functions, f, tf_next); f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); error = 0; } rw_wunlock(&tcp_function_lock); return (error); } void tcp_init(void) { const char *tcbhash_tuneable; int hashsize; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. 
* A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose && IS_DEFAULT_VNET(curvnet)) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; tcp_reass_global_init(); /* XXX virtualize those below?
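* (These timeout defaults are still plain globals shared by every vnet.)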
*/ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; /* Setup the tcp function block list */ TAILQ_INIT(&t_functions); rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_usrreqs.pru_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); #ifdef TCPPCAP tcp_pcap_init(); #endif #ifdef TCP_RFC7413 tcp_fastopen_init(); #endif } #ifdef VIMAGE -void +static void tcp_destroy(void) { int error; #ifdef TCP_RFC7413 tcp_fastopen_destroy(); #endif tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } } +VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
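* The checksum fields are left zero here; the caller computes them later * (for IPv4 the pseudo-header sum is added afterwards).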
*/ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { /* * reuse the mbuf. 
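* Any chained data is dropped and the reply is overlaid in place.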
* XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { tlen += sizeof (struct tcpiphdr); ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), tp, nth); TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); #endif } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb.
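* The tcpcb, its timers, cc_var and osd state all live in the single * tcpcb_mem allocation made above.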
*/ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } /* * Use the current system default CC algorithm. */ CC_LIST_RLOCK(); KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, 1); callout_init(&tp->t_timers->tt_persist, 1); callout_init(&tp->t_timers->tt_keep, 1); callout_init(&tp->t_timers->tt_2msl, 1); callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() * is called. */ in_pcbref(inp); /* Reference for tcpcb */ tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif return (tp); /* XXX */ } /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. * This ensures the CC framework can allow the unload to proceed without leaving * any dangling pointers which would trigger a panic. * Returning non-zero would inform the CC framework that something went wrong * and it would be unsafe to allow the unload to proceed. However, there is no * way for this to occur with this implementation so we always return zero. */ int tcp_ccalgounload(struct cc_algo *unload_algo) { struct cc_algo *tmpalgo; struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); /* * Check all active control blocks across all network stacks and change * any that are using "unload_algo" back to NewReno. If "unload_algo" * requires cleanup code to be run, call it. 
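* NewReno is the fallback because it needs no cb_init() call and can * therefore be switched to unconditionally.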
VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code * because the INP_INFO_WLOCK is held during initialisation. We * therefore don't enter the loop below until the connection * list has stabilised. */ LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { /* * By holding INP_WLOCK here, we are assured * that the connection is not currently * executing inside the CC module's functions * i.e. it is safe to make the switch back to * NewReno. */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ int released; INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. * * If stopping a timer fails, we schedule a discard function in the same * callout, and the last discard function called will take care of * deleting the tcpcb. */ tcp_timer_stop(tp, TT_REXMT); tcp_timer_stop(tp, TT_PERSIST); tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { /* Call the stop-all function of the methods */ tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ?
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); khelp_destroy_osd(tp->osd); CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on tcpcb, let's free it. */ if ((tp->t_fb->tfb_tcp_timers_left) && (tp->t_fb->tfb_tcp_timers_left(tp))) { /* Some fb timers left running! */ return; } if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp %p should not have been released " "here", __func__, inp)); } } void tcp_timer_2msl_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL); } void tcp_timer_keep_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP); } void tcp_timer_persist_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST); } void tcp_timer_rexmt_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT); } void tcp_timer_delack_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK); } void tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type) { struct inpcb *inp; CURVNET_SET(tp->t_vnet); INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, ("%s: tcpcb has to be stopped here", __func__)); KASSERT((tp->t_timers->tt_flags & timer_type) != 0, ("%s: discard callout should be running", __func__)); tp->t_timers->tt_flags &= ~timer_type; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on this tcpcb, let's free it. */ if ((tp->t_fb->tfb_tcp_timers_left) && (tp->t_fb->tfb_tcp_timers_left(tp))) { /* Some fb timers left running! */ goto leave; } if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } } leave: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif #ifdef TCP_RFC7413 /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. 
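* t_tfo_pending is cleared after the decrement so that the counter can * never be released twice.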
*/ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = V_tcbinfo.ipi_count + syncache_pcbcount(); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. 
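* Snapshot the generation and pcb counts, take a reference on each * visible inpcb under the INFO lock, copy the records out without it, * and finally drop the references again.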
*/ INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { in_pcbref(inp); inp_list[i++] = inp; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_flags & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else { bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (xt.xt_tp.t_timers) tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); } if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
*/ INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. 
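* (The ICMP layer has already applied them to the routing table.)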
*/ else if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip == NULL) { in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); return; } icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = ntohl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. */ if (!mtu) mtu = ip_next_mtu( ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxopd + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_WUNLOCK(inp); } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, th); } INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. 
*/ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); bzero(&inc, sizeof(inc)); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_flags |= INC_ISIPV6; INP_INFO_RLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); INP_INFO_RUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct tcpcb *tp) { MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. 
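* (The reseed interval sysctl is in seconds; it is scaled by hz for the * comparison against the ticks counter.)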
*/ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { tcp_mtudisc(inp, -1); return (inp); } static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); tp->t_fb->tfb_tcp_output(tp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. 
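* The lookup is done in the connection's own FIB (inc_fibnum).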
*/ u_long tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; in_rtalloc_ign(&sro, 0, inc->inc_fibnum); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_mtu, ifp->if_mtu); /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } RTFREE(sro.ro_rt); } return (maxmtu); } #endif /* INET */ #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || (!key_havesp(IPSEC_DIR_OUTBOUND))) return (0); m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. 
This is hardcoded elsewhere to 1179 */ struct secasvar * tcp_get_sav(struct mbuf *m, u_int direction) { union sockaddr_union dst; struct secasvar *sav; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; char ip6buf[INET6_ADDRSTRLEN]; #endif /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { #ifdef INET case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; #endif #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst.sa.sa_len = sizeof(struct sockaddr_in6); dst.sa.sa_family = AF_INET6; dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? ip6->ip6_src : ip6->ip6_dst; break; #endif default: return (NULL); /* NOTREACHED */ break; } /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : #ifdef INET6 (ip->ip_v == (IPV6_VERSION >> 4)) ? ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); } return (sav); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * sav pointer to security association * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * Releases reference to SADB key before return. * * Return 0 if successful, otherwise return -1. * */ int tcp_signature_do_compute(struct mbuf *m, int len, int optlen, u_char *buf, struct secasvar *sav) { #ifdef INET struct ippseudo ippseudo; #endif MD5_CTX ctx; int doff; struct ip *ip; #ifdef INET struct ipovly *ipovly; #endif struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; uint32_t plen; uint16_t nhdr; #endif u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set.
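* The pseudo-header length is therefore rebuilt below from len + * sizeof(struct tcphdr) + optlen.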
     */
    switch (ip->ip_v) {
#ifdef INET
    case IPVERSION:
        ipovly = (struct ipovly *)ip;
        ippseudo.ippseudo_src = ipovly->ih_src;
        ippseudo.ippseudo_dst = ipovly->ih_dst;
        ippseudo.ippseudo_pad = 0;
        ippseudo.ippseudo_p = IPPROTO_TCP;
        ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
            optlen);
        MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));

        th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
        doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
        break;
#endif
#ifdef INET6
    /*
     * RFC 2385, 2.0 Proposal
     * For IPv6, the pseudo-header is as described in RFC 2460, namely the
     * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
     * extended next header value (to form 32 bits), and 32-bit segment
     * length.
     * Note: Upper-Layer Packet Length comes before Next Header.
     */
    case (IPV6_VERSION >> 4):
        ip6 = mtod(m, struct ip6_hdr *);    /* was left NULL above */
        in6 = ip6->ip6_src;
        in6_clearscope(&in6);
        MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
        in6 = ip6->ip6_dst;
        in6_clearscope(&in6);
        MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
        plen = htonl(len + sizeof(struct tcphdr) + optlen);
        MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
        nhdr = 0;
        MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
        MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
        MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
        nhdr = IPPROTO_TCP;
        MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));

        th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
        doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
        break;
#endif
    default:
        KEY_FREESAV(&sav);
        return (-1);
        /* NOTREACHED */
        break;
    }

    /*
     * Step 2: Update MD5 hash with TCP header, excluding options.
     * The TCP checksum must be set to zero.
     */
    savecsum = th->th_sum;
    th->th_sum = 0;
    MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
    th->th_sum = savecsum;

    /*
     * Step 3: Update MD5 hash with TCP segment data.
     * Use m_apply() to avoid an early m_pullup().
     */
    if (len > 0)
        m_apply(m, doff, len, tcp_signature_apply, &ctx);

    /*
     * Step 4: Update MD5 hash with shared secret.
     */
    MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
    MD5Final(buf, &ctx);

    key_sa_recordxfer(sav, m);
    KEY_FREESAV(&sav);
    return (0);
}

/*
 * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
 *
 * Return 0 if successful, otherwise return -1.
 */
int
tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
    u_char *buf, u_int direction)
{
    struct secasvar *sav;

    if ((sav = tcp_get_sav(m, direction)) == NULL)
        return (-1);

    return (tcp_signature_do_compute(m, len, optlen, buf, sav));
}

/*
 * Verify the TCP-MD5 hash of a TCP segment. (RFC2385)
 *
 * Parameters:
 * m        pointer to head of mbuf chain
 * off0     offset of the TCP header (passed through to
 *          tcp_signature_compute(), which ignores it)
 * tlen     length of TCP segment data, excluding options
 * optlen   length of TCP segment options
 * to       parsed TCP options, including the received signature
 * th       pointer to the TCP header
 * tcpbflag t_flags of the connection (TF_SIGNATURE is checked)
 *
 * Return 1 if successful, otherwise return 0.
 */
int
tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen,
    struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
{
    char tmpdigest[TCP_SIGLEN];

    if (tcp_sig_checksigs == 0)
        return (1);
    if ((tcpbflag & TF_SIGNATURE) == 0) {
        if ((to->to_flags & TOF_SIGNATURE) != 0) {
            /*
             * If this socket is not expecting signature but
             * the segment contains signature, just fail.
             */
            TCPSTAT_INC(tcps_sig_err_sigopt);
            TCPSTAT_INC(tcps_sig_rcvbadsig);
            return (0);
        }

        /* Signature is not expected, and not present in segment. */
        return (1);
    }

    /*
     * If this socket is expecting signature but the segment does not
     * contain any, just fail.
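For reference, the digest order implemented above can be reproduced in userland with FreeBSD's libmd (link with -lmd). This is a flat-buffer sketch, not kernel code, and the names are invented; per RFC 2385 the options are counted in the pseudo-header length but excluded from the hash, and the TCP checksum is digested as zero:

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <md5.h>

/* IPv4 pseudo-header, digested in network byte order. */
struct md5_pseudo {
    struct in_addr  src, dst;
    u_char          pad;        /* always 0 */
    u_char          proto;      /* IPPROTO_TCP */
    u_short         len;        /* htons(tcphdr + options + data) */
};

/*
 * digest[] receives the 16-byte TCP-MD5 option value.  "data"/"len"
 * are the payload after the options, which RFC 2385 does not hash.
 */
static void
tcpmd5_digest(const struct md5_pseudo *ph, struct tcphdr *th,
    const u_char *data, unsigned int len, const u_char *key,
    unsigned int keylen, u_char digest[16])
{
    MD5_CTX ctx;
    u_short saved = th->th_sum;

    MD5Init(&ctx);
    MD5Update(&ctx, ph, sizeof(*ph));       /* 1: pseudo-header */
    th->th_sum = 0;                         /* checksum hashed as 0 */
    MD5Update(&ctx, th, sizeof(*th));       /* 2: header, no options */
    th->th_sum = saved;
    MD5Update(&ctx, data, len);             /* 3: segment data */
    MD5Update(&ctx, key, keylen);           /* 4: shared secret */
    MD5Final(digest, &ctx);
}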
*/ if ((to->to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0], IPSEC_DIR_INBOUND) == -1) { TCPSTAT_INC(tcps_sig_err_buildsig); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) { TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } TCPSTAT_INC(tcps_sig_rcvgoodsig); return (1); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. 
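A userland sketch of driving the net.inet.tcp.drop handler above, which is essentially what tcpdrop(8) does; drop_tcp_connection() is an invented helper name and error handling is omitted:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <string.h>

static int
drop_tcp_connection(const struct sockaddr_in *foreign,
    const struct sockaddr_in *local)
{
    struct sockaddr_storage addrs[2];

    /* As above: addrs[0] is the foreign endpoint, addrs[1] local. */
    memset(addrs, 0, sizeof(addrs));
    memcpy(&addrs[0], foreign, sizeof(*foreign));
    memcpy(&addrs[1], local, sizeof(*local));
    return (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
        addrs, sizeof(addrs)));
}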
* * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } Index: projects/vnet/sys/netinet/tcp_var.h =================================================================== --- projects/vnet/sys/netinet/tcp_var.h (revision 294565) +++ projects/vnet/sys/netinet/tcp_var.h (revision 294566) @@ -1,866 +1,863 @@ /*- * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #ifdef _KERNEL #include #include /* * Kernel variables for tcp. */ VNET_DECLARE(int, tcp_do_rfc1323); #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #endif /* _KERNEL */ /* TCP segment queue entry */ struct tseg_qent { LIST_ENTRY(tseg_qent) tqe_q; int tqe_len; /* TCP segment data length */ struct tcphdr *tqe_th; /* a pointer to tcp header */ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int ispare; /* explicit pad for 64bit alignment */ int sacked_bytes; /* * Total sacked bytes reported by the * receiver via sack option */ uint32_t _pad1[1]; /* TBD */ uint64_t _pad[1]; /* TBD */ }; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. 
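As an aside, the sorted scoreboard built from the sackhole entries above is typically walked the way tcp_sack_output() does; a simplified, illustrative sketch (example_next_hole() is a made-up name; snd_holes and scblink are the fields declared in this header):

#include <netinet/tcp_seq.h>    /* SEQ_LT() */

/*
 * Return the first hole that still has bytes we have not
 * retransmitted, or NULL when the scoreboard is exhausted.
 */
static struct sackhole *
example_next_hole(struct tcpcb *tp)
{
    struct sackhole *hole;

    TAILQ_FOREACH(hole, &tp->snd_holes, scblink) {
        /* rxmit tracks how far into the hole we have resent. */
        if (SEQ_LT(hole->rxmit, hole->end))
            return (hole);
    }
    return (NULL);
}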
*/ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ struct tcpcb; struct inpcb; struct sockopt; struct socket; struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ void (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); int (*tfb_tcp_timers_left)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); /* * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. */ struct tcpcb { struct tsegqe_head t_segq; /* segment reassembly queue */ void *t_pspare[2]; /* new reassembly queue */ int t_segqlen; /* segment reassembly queue length */ int t_dupacks; /* consecutive dup acks recd */ struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ u_int t_flags; struct vnet *t_vnet; /* back pointer to parent vnet */ tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ u_long rcv_wnd; /* receive window */ tcp_seq rcv_up; /* receive urgent pointer */ u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ u_long snd_spare1; /* unused */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ u_long snd_spare2; /* unused */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ u_int t_rcvtime; /* inactivity time */ u_int t_starttime; /* time connection was established */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_bw_spare1; /* unused */ tcp_seq t_bw_spare2; /* unused */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ int t_rxtshift; /* log(2) of rexmt exp. 
        backoff */
    u_int   t_rttmin;       /* minimum rtt allowed */
    u_int   t_rttbest;      /* best rtt we've seen */
    u_long  t_rttupdated;   /* number of times rtt sampled */
    u_long  max_sndwnd;     /* largest window peer has offered */

    int     t_softerror;    /* possible error not yet reported */
/* out-of-band data */
    char    t_oobflags;     /* have some */
    char    t_iobc;         /* input character */
/* RFC 1323 variables */
    u_char  snd_scale;      /* window scaling for send window */
    u_char  rcv_scale;      /* window scaling for recv window */
    u_char  request_r_scale; /* pending window scaling */
    u_int32_t ts_recent;    /* timestamp echo data */
    u_int   ts_recent_age;  /* when last updated */
    u_int32_t ts_offset;    /* our timestamp offset */

    tcp_seq last_ack_sent;
/* experimental */
    u_long  snd_cwnd_prev;  /* cwnd prior to retransmit */
    u_long  snd_ssthresh_prev; /* ssthresh prior to retransmit */
    tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
    int     t_sndzerowin;   /* zero-window updates sent */
    u_int   t_badrxtwin;    /* window for retransmit recovery */
    u_char  snd_limited;    /* segments limited transmitted */
/* SACK related state */
    int     snd_numholes;   /* number of holes seen by sender */
    TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
                            /* SACK scoreboard (sorted) */
    tcp_seq snd_fack;       /* last seq number(+1) sack'd by rcv'r*/
    int     rcv_numsacks;   /* # distinct sack blks present */
    struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
    tcp_seq sack_newdata;   /* New data xmitted in this recovery
                               episode starts at this seq number */
    struct sackhint sackhint; /* SACK scoreboard hint */
    int     t_rttlow;       /* smallest observed RTT */
    u_int32_t rfbuf_ts;     /* recv buffer autoscaling timestamp */
    int     rfbuf_cnt;      /* recv buffer autoscaling byte count */
    struct toedev *tod;     /* toedev handling this connection */
    int     t_sndrexmitpack; /* retransmit packets sent */
    int     t_rcvoopack;    /* out-of-order packets received */
    void    *t_toe;         /* TOE pcb pointer */
    int     t_bytes_acked;  /* # bytes acked during current RTT */
    struct cc_algo *cc_algo; /* congestion control algorithm */
    struct cc_var *ccv;     /* congestion control specific vars */
    struct osd *osd;        /* storage for Khelp module data */

    u_int   t_keepinit;     /* time to establish connection */
    u_int   t_keepidle;     /* time before keepalive probes begin */
    u_int   t_keepintvl;    /* interval between keepalives */
    u_int   t_keepcnt;      /* number of keepalives before close */

    u_int   t_tsomax;       /* TSO total burst length limit in bytes */
    u_int   t_tsomaxsegcount; /* TSO maximum segment count */
    u_int   t_tsomaxsegsize; /* TSO maximum segment size in bytes */
    u_int   t_pmtud_saved_maxopd; /* pre-blackhole MSS */
    u_int   t_flags2;       /* More tcpcb flags storage */

#if defined(_KERNEL) && defined(TCP_RFC7413)
    uint32_t t_ispare[6];   /* 5 UTO, 1 TBD */
    uint64_t t_tfo_cookie;  /* TCP Fast Open cookie */
#else
    uint32_t t_ispare[8];   /* 5 UTO, 3 TBD */
#endif
    struct tcp_function_block *t_fb; /* TCP function call block */
    void    *t_fb_ptr;      /* Pointer to t_fb specific data */
#if defined(_KERNEL) && defined(TCP_RFC7413)
    unsigned int *t_tfo_pending; /* TCP Fast Open pending counter */
    void    *t_pspare2[1];  /* 1 TCP_SIGNATURE */
#else
    void    *t_pspare2[2];  /* 1 TCP_SIGNATURE, 1 TBD */
#endif
#if defined(_KERNEL) && defined(TCPPCAP)
    struct mbufq t_inpkts;  /* List of saved input packets. */
    struct mbufq t_outpkts; /* List of saved output packets. */
#ifdef _LP64
    uint64_t _pad[0];       /* all used!
*/ #else uint64_t _pad[2]; /* 2 are available */ #endif /* _LP64 */ #else uint64_t _pad[6]; #endif /* defined(_KERNEL) && defined(TCPPCAP) */ }; /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x000001 /* ack peer immediately */ #define TF_DELACK 0x000002 /* ack, but try to delay it */ #define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x000008 /* don't use tcp options */ #define TF_SENTFIN 0x000010 /* have sent FIN */ #define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ #define TF_TSO 0x1000000 /* TSO enabled on this connection */ #define TF_TOE 0x2000000 /* this connection is offloaded */ #define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */ #define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */ #define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 #ifdef TCP_SIGNATURE /* * Defines which are needed by the xform_tcp module and tcp_[in|out]put * for SADB verification and lookup. */ #define TCP_SIGLEN 16 /* length of computed digest in bytes */ #define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */ #define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */ /* * Only a single SA per host may be specified at this time. An SPI is * needed in order for the KEY_ALLOCSA() lookup to work. */ #define TCP_SIG_SPI 0x1000 #endif /* TCP_SIGNATURE */ /* * Flags for PLPMTU handling, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. 
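A minimal sketch of how the recovery macros above are meant to be used, in the style of the NewReno code (illustrative only; the real trigger threshold is tcprexmtthresh, canonically 3, and the cwnd/ssthresh arithmetic lives in the cc(4) modules):

static void
example_on_dupack(struct tcpcb *tp)
{
    if (tp->t_dupacks == 3 && !IN_FASTRECOVERY(tp->t_flags)) {
        tp->snd_recover = tp->snd_max;  /* mark recovery point */
        ENTER_FASTRECOVERY(tp->t_flags);
        tp->snd_cwnd = tp->snd_ssthresh;
    }
}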
*/ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int64_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_char *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_cwnd; /* congestion window */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; tcp_seq irs; u_short last_win; /* cached window value */ u_short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; u_int32_t ts_offset; /* our timestamp offset */ u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; void *tw_pspare; /* TCP_SIGNATURE */ u_int *tw_spare; /* TCP_SIGNATURE */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. 
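A worked, userland example of this fixed-point scheme, using the scales defined here and a simplified copy of the TCP_REXMTVAL() macro defined just below (the real macro also clamps the result to t_rttmin): an srtt of 6 ticks is stored as 6 << 5 = 192, an rttvar of 1 tick as 1 << 4 = 16, and the macro recovers srtt + 4 * rttvar = 10 ticks.

#include <stdio.h>

#define TCP_RTT_SHIFT       5   /* srtt stored scaled by 32 */
#define TCP_RTTVAR_SHIFT    4   /* rttvar stored scaled by 16 */
#define TCP_DELTA_SHIFT     2

#define EXAMPLE_REXMTVAL(srtt, rttvar) \
    ((((srtt) >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) + (rttvar)) >> \
        TCP_DELTA_SHIFT)

int
main(void)
{
    int t_srtt = 6 << TCP_RTT_SHIFT;        /* 6 ticks -> 192 */
    int t_rttvar = 1 << TCP_RTTVAR_SHIFT;   /* 1 tick  -> 16 */

    /* (192 >> 3 = 24) + 16 = 40; 40 >> 2 = 10 = srtt + 4 * rttvar. */
    printf("RTO = %d ticks\n", EXAMPLE_REXMTVAL(t_srtt, t_rttvar));
    return (0);
}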
 */
#define TCP_RTTVAR_SCALE    16  /* multiplier for rttvar; 2 bits */
#define TCP_RTTVAR_SHIFT    4   /* shift for rttvar; 2 bits */
#define TCP_DELTA_SHIFT     2   /* see tcp_input.c */

/*
 * The initial retransmission should happen at rtt + 4 * rttvar.
 * Because of the way we do the smoothing, srtt and rttvar
 * will each average +1/2 tick of bias. When we compute
 * the retransmit timer, we want 1/2 tick of rounding and
 * 1 extra tick because of +-1/2 tick uncertainty in the
 * firing of the timer. The bias will give us exactly the
 * 1.5 tick we need. But, because the bias is
 * statistical, we have to test that we don't drop below
 * the minimum feasible timer (which is 2 ticks).
 * This version of the macro adapted from a paper by Lawrence
 * Brakmo and Larry Peterson which outlines a problem caused
 * by insufficient precision in the original implementation,
 * which results in inappropriately large RTO values for very
 * fast networks.
 */
#define TCP_REXMTVAL(tp) \
    max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \
      + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)

/*
 * TCP statistics.
 * Many of these should be kept per connection,
 * but that's inconvenient at the moment.
 */
struct tcpstat {
    uint64_t tcps_connattempt;  /* connections initiated */
    uint64_t tcps_accepts;      /* connections accepted */
    uint64_t tcps_connects;     /* connections established */
    uint64_t tcps_drops;        /* connections dropped */
    uint64_t tcps_conndrops;    /* embryonic connections dropped */
    uint64_t tcps_minmssdrops;  /* average minmss too low drops */
    uint64_t tcps_closed;       /* conn. closed (includes drops) */
    uint64_t tcps_segstimed;    /* segs where we tried to get rtt */
    uint64_t tcps_rttupdated;   /* times we succeeded */
    uint64_t tcps_delack;       /* delayed acks sent */
    uint64_t tcps_timeoutdrop;  /* conn. dropped in rxmt timeout */
    uint64_t tcps_rexmttimeo;   /* retransmit timeouts */
    uint64_t tcps_persisttimeo; /* persist timeouts */
    uint64_t tcps_keeptimeo;    /* keepalive timeouts */
    uint64_t tcps_keepprobe;    /* keepalive probes sent */
    uint64_t tcps_keepdrops;    /* connections dropped in keepalive */

    uint64_t tcps_sndtotal;     /* total packets sent */
    uint64_t tcps_sndpack;      /* data packets sent */
    uint64_t tcps_sndbyte;      /* data bytes sent */
    uint64_t tcps_sndrexmitpack; /* data packets retransmitted */
    uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */
    uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */
    uint64_t tcps_sndacks;      /* ack-only packets sent */
    uint64_t tcps_sndprobe;     /* window probes sent */
    uint64_t tcps_sndurg;       /* packets sent with URG only */
    uint64_t tcps_sndwinup;     /* window update-only packets sent */
    uint64_t tcps_sndctrl;      /* control (SYN|FIN|RST) packets sent */

    uint64_t tcps_rcvtotal;     /* total packets received */
    uint64_t tcps_rcvpack;      /* packets received in sequence */
    uint64_t tcps_rcvbyte;      /* bytes received in sequence */
    uint64_t tcps_rcvbadsum;    /* packets received with checksum errs */
    uint64_t tcps_rcvbadoff;    /* packets received with bad offset */
    uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */
    uint64_t tcps_rcvshort;     /* packets received too short */
    uint64_t tcps_rcvduppack;   /* duplicate-only packets received */
    uint64_t tcps_rcvdupbyte;   /* duplicate-only bytes received */
    uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */
    uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup.
packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Mismatching signature received 
*/ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ uint64_t _pad[12]; /* 6 UTO, 6 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; long len; int tso; tcp_seq curack; }; #endif /* * TCB structure exported to user-land via sysctl(3). * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcp_timer { int tt_rexmt; /* retransmit timer */ int tt_persist; /* retransmit persistence */ int tt_keep; /* keepalive */ int tt_2msl; /* 2*msl TIME_WAIT timer */ int tt_delack; /* delayed ACK timer */ int t_rcvtime; /* Time since last packet received */ }; struct xtcpcb { size_t xt_len; struct inpcb xt_inp; struct tcpcb xt_tp; struct xsocket xt_socket; struct xtcp_timer xt_timer; u_quad_t xt_alignment_hack; }; #endif /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics (read-only) */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(struct inpcbhead, tcb); /* queue of active tcpcb's */ VNET_DECLARE(struct inpcbinfo, tcbinfo); extern int tcp_log_in_vain; VNET_DECLARE(int, tcp_mssdflt); /* XXX */ VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_abc_l_var); #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_mssdflt VNET(tcp_mssdflt) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_recvspace 
VNET(tcp_recvspace) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */ #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */ VNET_DECLARE(int, tcp_ecn_maxretries); #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) VNET_DECLARE(int, tcp_do_rfc6675_pipe); #define V_tcp_do_rfc6675_pipe VNET(tcp_do_rfc6675_pipe) int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); -#ifdef VIMAGE -void tcp_destroy(void); -#endif void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); int tcp_input(struct mbuf **, int *, int); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); int register_tcp_functions(struct tcp_function_block *blk, int wait); int deregister_tcp_functions(struct tcp_function_block *blk); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); 
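To illustrate the function-block API declared above (a sketch only, not from this commit): an alternative TCP stack module fills in a struct tcp_function_block, here simply reusing the stock handlers declared in this header, and registers it:

/*
 * "example" is a made-up stack name.  Optional hooks may stay NULL.
 */
static struct tcp_function_block example_tcp_fb = {
    .tfb_tcp_block_name = "example",
    .tfb_tcp_output = tcp_output,
    .tfb_tcp_do_segment = tcp_do_segment,
    .tfb_tcp_ctloutput = tcp_default_ctloutput,
};

static int
example_register(void)
{
    /* M_WAITOK: registration may sleep allocating its list entry. */
    return (register_tcp_functions(&example_tcp_fb, M_WAITOK));
}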
void tcp_setpersist(struct tcpcb *); #ifdef TCP_SIGNATURE struct secasvar; struct secasvar *tcp_get_sav(struct mbuf *, u_int); int tcp_signature_do_compute(struct mbuf *, int, int, u_char *, struct secasvar *); int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int); int tcp_signature_verify(struct mbuf *, int, int, int, struct tcpopt *, struct tcphdr *, u_int); int tcp_signature_check(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag); #endif void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); u_long tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, u_long); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; tcp_seq tcp_new_isn(struct tcpcb *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long ); int tcp_compute_pipe(struct tcpcb *); static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } #ifdef TCP_SIGNATURE static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: projects/vnet/sys/netinet/udp_usrreq.c =================================================================== --- projects/vnet/sys/netinet/udp_usrreq.c (revision 294565) +++ projects/vnet/sys/netinet/udp_usrreq.c (revision 294566) @@ -1,1915 +1,1918 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef IPSEC #include #include #endif #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). 
*/ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); #endif #ifdef IPSEC #ifdef IPSEC_NAT_T #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) #ifdef INET static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int); #endif #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_udpcb_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } static int udplite_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpliteinp"); return (0); } void udp_init(void) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void udplite_init(void) { in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); } /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. 
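For completeness, the blackhole knob defined above can be flipped from userland like any other sysctl; a minimal C equivalent of "sysctl net.inet.udp.blackhole=1" (udp_set_blackhole() is an invented helper name):

#include <sys/types.h>
#include <sys/sysctl.h>

static int
udp_set_blackhole(int on)
{
    return (sysctlbyname("net.inet.udp.blackhole", NULL, NULL,
        &on, sizeof(on)));
}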
While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } int udp_newudpcb(struct inpcb *inp) { struct udpcb *up; up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); if (up == NULL) return (ENOBUFS); inp->inp_ppcb = up; return (0); } void udp_discardcb(struct udpcb *up) { uma_zfree(V_udpcb_zone, up); } #ifdef VIMAGE -void +static void udp_destroy(void) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } +VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); -void +static void udplite_destroy(void) { in_pcbinfo_destroy(&V_ulitecbinfo); } +VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy, + NULL); #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. * * In the normal case udp_append() will return 0, indicating that you * must unlock the inp. However if a tunneling protocol is in place we increment * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we * then decrement the reference count. If the inp_rele returns 1, indicating the * inp is gone, we return that to the caller to tell them *not* to unlock * the inp. In the case of multi-cast this will cause the distribution * to stop (though most tunneling protocols known currently do *not* use * multicast). */ static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); return (0); } #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed. 
*/ return (0); } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. 
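The check that follows verifies the RFC 768 checksum: a one's-complement sum over an IPv4 pseudo-header (addresses, protocol, UDP length) plus the UDP header and payload. A flat-buffer userland illustration, under the stated assumptions; the kernel instead uses in_pseudo()/in_cksum() over the mbuf chain, and udp_cksum_flat() is a made-up name:

#include <sys/types.h>
#include <netinet/in.h>

/*
 * "buf"/"len" cover the UDP header plus payload.  A computed value of
 * 0 is transmitted as 0xffff, since an on-the-wire 0 means
 * "no checksum" -- the case handled in the else branch below.
 */
static uint16_t
udp_cksum_flat(const void *buf, size_t len, struct in_addr src,
    struct in_addr dst)
{
    const uint16_t *p = buf;
    uint32_t sum = 0;
    size_t n;

    /* Pseudo-header: addresses, zero/protocol byte pair, length. */
    sum += (src.s_addr >> 16) + (src.s_addr & 0xffff);
    sum += (dst.s_addr >> 16) + (dst.s_addr & 0xffff);
    sum += htons(IPPROTO_UDP) + htons((uint16_t)len);

    /* UDP header and data, padded with a zero byte if odd. */
    for (n = len; n > 1; n -= 2)
        sum += *p++;
    if (n == 1)
        sum += *(const uint8_t *)p;

    while (sum >> 16)           /* fold the carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
    return (~sum & 0xffff);
}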
 */
    if (uh->uh_sum) {
        u_short uh_sum;

        if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
            !cscov_partial) {
            if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
                uh_sum = m->m_pkthdr.csum_data;
            else
                uh_sum = in_pseudo(ip->ip_src.s_addr,
                    ip->ip_dst.s_addr, htonl((u_short)len +
                    m->m_pkthdr.csum_data + proto));
            uh_sum ^= 0xffff;
        } else {
            char b[9];

            bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
            bzero(((struct ipovly *)ip)->ih_x1, 9);
            ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
                uh->uh_ulen : htons(ip_len);
            uh_sum = in_cksum(m, len + sizeof (struct ip));
            bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
        }
        if (uh_sum) {
            UDPSTAT_INC(udps_badsum);
            m_freem(m);
            return (IPPROTO_DONE);
        }
    } else {
        if (proto == IPPROTO_UDP) {
            UDPSTAT_INC(udps_nosum);
        } else {
            /* UDPLite requires a checksum */
            /* XXX: What is the right UDPLite MIB counter here? */
            m_freem(m);
            return (IPPROTO_DONE);
        }
    }

    pcbinfo = udp_get_inpcbinfo(proto);
    if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
        in_broadcast(ip->ip_dst, ifp)) {
        struct inpcb *last;
        struct inpcbhead *pcblist;
        struct ip_moptions *imo;

        INP_INFO_RLOCK(pcbinfo);
        pcblist = udp_get_pcblist(proto);
        last = NULL;
        LIST_FOREACH(inp, pcblist, inp_list) {
            if (inp->inp_lport != uh->uh_dport)
                continue;
#ifdef INET6
            if ((inp->inp_vflag & INP_IPV4) == 0)
                continue;
#endif
            if (inp->inp_laddr.s_addr != INADDR_ANY &&
                inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
                continue;
            if (inp->inp_faddr.s_addr != INADDR_ANY &&
                inp->inp_faddr.s_addr != ip->ip_src.s_addr)
                continue;
            if (inp->inp_fport != 0 &&
                inp->inp_fport != uh->uh_sport)
                continue;

            INP_RLOCK(inp);

            /*
             * XXXRW: Because we weren't holding either the inpcb
             * or the hash lock when we checked for a match
             * before, we should probably recheck now that the
             * inpcb lock is held.
             */

            /*
             * Handle socket delivery policy for any-source
             * and source-specific multicast. [RFC3678]
             */
            imo = inp->inp_moptions;
            if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
                struct sockaddr_in group;
                int blocked;

                if (imo == NULL) {
                    INP_RUNLOCK(inp);
                    continue;
                }
                bzero(&group, sizeof(struct sockaddr_in));
                group.sin_len = sizeof(struct sockaddr_in);
                group.sin_family = AF_INET;
                group.sin_addr = ip->ip_dst;

                blocked = imo_multi_filter(imo, ifp,
                    (struct sockaddr *)&group,
                    (struct sockaddr *)&udp_in);
                if (blocked != MCAST_PASS) {
                    if (blocked == MCAST_NOTGMEMBER)
                        IPSTAT_INC(ips_notmember);
                    if (blocked == MCAST_NOTSMEMBER ||
                        blocked == MCAST_MUTED)
                        UDPSTAT_INC(udps_filtermcast);
                    INP_RUNLOCK(inp);
                    continue;
                }
            }
            if (last != NULL) {
                struct mbuf *n;

                if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
                    UDP_PROBE(receive, NULL, last, ip,
                        last, uh);
                    if (udp_append(last, ip, n, iphlen,
                        &udp_in)) {
                        goto inp_lost;
                    }
                }
                INP_RUNLOCK(last);
            }
            last = inp;
            /*
             * Don't look for additional matches if this one does
             * not have either the SO_REUSEPORT or SO_REUSEADDR
             * socket options set.  This heuristic avoids
             * searching through all pcbs in the common case of a
             * non-shared port.  It assumes that an application
             * will never clear these options after setting them.
             */
            if ((last->inp_socket->so_options &
                (SO_REUSEPORT|SO_REUSEADDR)) == 0)
                break;
        }

        if (last == NULL) {
            /*
             * No matching pcb found; discard datagram.  (No need
             * to send an ICMP Port Unreachable for a broadcast
             * or multicast datagram.)
             */
            UDPSTAT_INC(udps_noportbcast);
            if (inp)
                INP_RUNLOCK(inp);
            INP_INFO_RUNLOCK(pcbinfo);
            goto badunlocked;
        }
        UDP_PROBE(receive, NULL, last, ip, last, uh);
        if (udp_append(last, ip, m, iphlen, &udp_in) == 0)
            INP_RUNLOCK(last);
    inp_lost:
        INP_INFO_RUNLOCK(pcbinfo);
        return (IPPROTO_DONE);
    }

    /*
     * Locate pcb for datagram.
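From the application side, the SO_REUSEPORT heuristic above means a socket only shares broadcast/multicast datagrams on a port with other sockets that set the option before bind(2). A minimal userland sketch (bind_shared_udp() is an invented helper; error checks omitted):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static int
bind_shared_udp(uint16_t port)
{
    struct sockaddr_in sin;
    int s, on = 1;

    s = socket(AF_INET, SOCK_DGRAM, 0);
    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
    memset(&sin, 0, sizeof(sin));
    sin.sin_len = sizeof(sin);
    sin.sin_family = AF_INET;
    sin.sin_port = htons(port);
    sin.sin_addr.s_addr = htonl(INADDR_ANY);
    bind(s, (struct sockaddr *)&sin, sizeof(sin));
    return (s);     /* each such socket gets its own copy */
}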
*/ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, &udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. 
*/ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); } void udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. 
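/*
 * Hypothetical userland consumer of the udp_pcblist handler above:
 * probe for the kernel's size estimate, pad it the same way the
 * handler does, then fetch the list.  A caller wanting a stable
 * snapshot compares the xig_gen fields of the leading and trailing
 * struct xinpgen and retries on mismatch, as the comment above
 * suggests.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdlib.h>

static void *
fetch_udp_pcblist(size_t *lenp)
{
	void *buf;
	size_t len = 0;

	/* First call with a NULL buffer returns the required size. */
	if (sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0) == -1)
		return (NULL);
	len += len / 8;		/* headroom for sockets created meanwhile */
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	if (sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0) == -1) {
		free(buf);	/* ENOMEM here means "grow and retry" */
		return (NULL);
	}
	*lenp = len;
	return (buf);
}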
*/ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case UDP_ENCAP: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); #endif switch (optval) { case 0: /* Clear all UDP encap. 
*/ #ifdef IPSEC_NAT_T up->u_flags &= ~UF_ESPINUDP_ALL; #endif break; #ifdef IPSEC_NAT_T case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->u_flags &= ~UF_ESPINUDP_ALL; if (optval == UDP_ENCAP_ESPINUDP) up->u_flags |= UF_ESPINUDP; else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) up->u_flags |= UF_ESPINUDP_NON_IKE; break; #endif default: error = EINVAL; break; } INP_WUNLOCK(inp); break; case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef IPSEC_NAT_T case UDP_ENCAP: up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); optval = up->u_flags & UF_ESPINUDP_ALL; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #define UH_WLOCKED 2 #define UH_RLOCKED 1 #define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; INP_RLOCK(inp); tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. 
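/*
 * Hypothetical application-side counterpart of the UDPLITE_*_CSCOV
 * handling above; assumes <netinet/udplite.h> provides the option
 * names on this system.  Per the validation in udp_ctloutput(),
 * coverage must be 0 (checksum the whole datagram) or between 8 and
 * 65535, and the options exist only on IPPROTO_UDPLITE sockets.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udplite.h>

static int
set_udplite_coverage(int s, int txcov, int rxcov)
{
	if (setsockopt(s, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
	    &txcov, sizeof(txcov)) == -1)
		return (-1);
	return (setsockopt(s, IPPROTO_UDPLITE, UDPLITE_RECV_CSCOV,
	    &rxcov, sizeof(rxcov)));
}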
*/ if (control->m_next) { INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. * * XXXRW: Check that hash locking update here is correct. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. 
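/*
 * Hypothetical sender-side sketch of the IP_SENDSRCADDR control
 * message parsed above: on a bound, unconnected UDP socket, choose the
 * source address per datagram instead of per socket.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

static ssize_t
send_from(int s, void *buf, size_t len, struct sockaddr_in *dst,
    struct in_addr src)
{
	struct msghdr msg;
	struct iovec iov;
	struct cmsghdr *cm;
	char cbuf[CMSG_SPACE(sizeof(struct in_addr))];

	memset(&msg, 0, sizeof(msg));
	memset(cbuf, 0, sizeof(cbuf));
	iov.iov_base = buf;
	iov.iov_len = len;
	msg.msg_name = dst;
	msg.msg_namelen = sizeof(*dst);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	/* One cmsg carrying the desired source address. */
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_SENDSRCADDR;
	cm->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
	memcpy(CMSG_DATA(cm), &src, sizeof(src));

	return (sendmsg(s, &msg, 0));
}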
* * If a UDP socket has not been connected, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting.  Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers.  Immediately slide the data pointer forward * again, since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in the mbuf with the extended UDP header, with addresses and * length in network byte order. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header.
*/ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Set up flowid / RSS information for the outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); #ifdef RSS } else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later on, connected socket values should be * cached in the inpcb and reused, rather than constantly * recalculated. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } #endif } #ifdef RSS /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) INP_HASH_RUNLOCK(pcbinfo); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); if (unlock_udbinfo == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == UH_WLOCKED) { INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); m_freem(m); return (error); } #if defined(IPSEC) && defined(IPSEC_NAT_T) /* * Potentially decap ESP in UDP frame.  Check for an ESP header * and optional marker; if present, strip the UDP header and * push the result through IPSec. * * Returns mbuf to be processed (potentially re-allocated) or * NULL if consumed and/or processed.
*/ static struct mbuf * udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) { size_t minlen, payload, skip, iphlen; caddr_t data; struct udpcb *up; struct m_tag *tag; struct udphdr *udphdr; struct ip *ip; INP_RLOCK_ASSERT(inp); /* * Pull up data so the longest case is contiguous: * IP/UDP hdr + non ESP marker + ESP hdr. */ minlen = off + sizeof(uint64_t) + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ payload = m->m_len - off; /* Size of payload. */ if (payload == 1 && data[off] == '\xff') return (m); /* NB: keepalive packet, no decap. */ up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0, ("u_flags 0x%x", up->u_flags)); /* * Check that the payload is large enough to hold an * ESP header and compute the amount of data to remove. * * NB: the caller has already done a pullup for us. * XXX can we assume alignment and eliminate bcopys? */ if (up->u_flags & UF_ESPINUDP_NON_IKE) { /* * draft-ietf-ipsec-nat-t-ike-0[01].txt and * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring * possible AH mode non-IKE marker+non-ESP marker * from draft-ietf-ipsec-udp-encaps-00.txt. */ uint64_t marker; if (payload <= sizeof(uint64_t) + sizeof(struct esp)) return (m); /* NB: no decap. */ bcopy(data + off, &marker, sizeof(uint64_t)); if (marker != 0) /* Non-IKE marker. */ return (m); /* NB: no decap. */ skip = sizeof(uint64_t) + sizeof(struct udphdr); } else { uint32_t spi; if (payload <= sizeof(struct esp)) { IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } bcopy(data + off, &spi, sizeof(uint32_t)); if (spi == 0) /* Non-ESP marker. */ return (m); /* NB: no decap. */ skip = sizeof(struct udphdr); } /* * Set up a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember * the UDP ports.  This is required if we want to select * the right SPD for multiple hosts behind the same NAT. * * NB: ports are maintained in network byte order everywhere * in the NAT-T code. */ tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } iphlen = off - sizeof(struct udphdr); udphdr = (struct udphdr *)(data + iphlen); ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport; ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport; m_tag_prepend(m, tag); /* * Remove the UDP header (and possibly the non-ESP marker); * the IP header length is iphlen. * Before: * <--- off ---> * +----+------+-----+ * | IP | UDP | ESP | * +----+------+-----+ * <-skip-> * After: * +----+-----+ * | IP | ESP | * +----+-----+ * <-skip-> */ ovbcopy(data, data + skip, iphlen); m_adj(m, skip); ip = mtod(m, struct ip *); ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* * We cannot yet update the cksums so clear any * h/w cksum flags as they are no longer valid. */ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); (void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p), AF_INET, ip->ip_p); return (NULL); /* NB: consumed, bypass processing.
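/*
 * The demultiplexing rules applied by udp4_espdecap() above, restated
 * as a standalone sketch (names are illustrative): a one-byte 0xff
 * payload is a NAT keepalive; in non-IKE-marker mode an 8-byte zero
 * marker precedes ESP, otherwise a 4-byte zero SPI is the non-ESP
 * (IKE) marker; anything else is ESP to be decapsulated.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum natt_kind { NATT_KEEPALIVE, NATT_IKE, NATT_ESP, NATT_SHORT };

static enum natt_kind
natt_classify(const uint8_t *payload, size_t len, int non_ike_mode)
{
	uint64_t marker;
	uint32_t spi;

	if (len == 1 && payload[0] == 0xff)
		return (NATT_KEEPALIVE);
	if (non_ike_mode) {
		if (len <= sizeof(marker))
			return (NATT_SHORT);
		memcpy(&marker, payload, sizeof(marker));
		/* A zero marker introduces ESP; anything else passes. */
		return (marker == 0 ? NATT_ESP : NATT_IKE);
	}
	if (len < sizeof(spi))
		return (NATT_SHORT);
	memcpy(&spi, payload, sizeof(spi));
	/* A zero SPI is the non-ESP marker, i.e. IKE traffic. */
	return (spi == 0 ? NATT_IKE : NATT_ESP);
}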
*/ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ static void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if (up->u_tun_func != NULL) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); 
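/*
 * Sketch of a kernel consumer of udp_set_kernel_tunneling() above.
 * The module, softc, and callback names are invented; only the
 * registration call and the EBUSY behaviour come from the code above.
 */
struct mytun_softc;

static void
mytun_input(struct mbuf *m, int off, struct inpcb *inp,
    const struct sockaddr *sa, void *ctx)
{
	struct mytun_softc *sc = ctx;

	/* The hook owns the mbuf; consume or free it here. */
	(void)sc; (void)off; (void)sa; (void)inp;
	m_freem(m);
}

/*
 * After creating and binding a SOCK_DGRAM socket "so":
 *	error = udp_set_kernel_tunneling(so, mytun_input, sc);
 * A second registration on the same socket fails with EBUSY because
 * u_tun_func is already set.
 */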
INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; #endif /* INET */ Index: projects/vnet/sys/netinet/udp_var.h =================================================================== --- projects/vnet/sys/netinet/udp_var.h (revision 294565) +++ projects/vnet/sys/netinet/udp_var.h (revision 294566) @@ -1,186 +1,182 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_UDP_VAR_H_ #define _NETINET_UDP_VAR_H_ /* * UDP kernel structures and variables. */ struct udpiphdr { struct ipovly ui_i; /* overlaid ip structure */ struct udphdr ui_u; /* udp header */ }; #define ui_x1 ui_i.ih_x1 #define ui_v ui_i.ih_x1[0] #define ui_pr ui_i.ih_pr #define ui_len ui_i.ih_len #define ui_src ui_i.ih_src #define ui_dst ui_i.ih_dst #define ui_sport ui_u.uh_sport #define ui_dport ui_u.uh_dport #define ui_ulen ui_u.uh_ulen #define ui_sum ui_u.uh_sum struct inpcb; struct mbuf; typedef void(*udp_tun_func_t)(struct mbuf *, int off, struct inpcb *, const struct sockaddr *, void *); /* * UDP control block; one per udp. */ struct udpcb { udp_tun_func_t u_tun_func; /* UDP kernel tunneling callback. */ u_int u_flags; /* Generic UDP flags. */ uint16_t u_rxcslen; /* Coverage for incoming datagrams. */ uint16_t u_txcslen; /* Coverage for outgoing datagrams. */ void *u_tun_ctx; /* Tunneling callback context. */ }; #define intoudpcb(ip) ((struct udpcb *)(ip)->inp_ppcb) #define sotoudpcb(so) (intoudpcb(sotoinpcb(so))) /* IPsec: ESP in UDP tunneling: */ #define UF_ESPINUDP_NON_IKE 0x00000001 /* w/ non-IKE marker .. */ /* .. per draft-ietf-ipsec-nat-t-ike-0[01], * and draft-ietf-ipsec-udp-encaps-(00/)01.txt */ #define UF_ESPINUDP 0x00000002 /* w/ non-ESP marker. */ struct udpstat { /* input statistics: */ uint64_t udps_ipackets; /* total input packets */ uint64_t udps_hdrops; /* packet shorter than header */ uint64_t udps_badsum; /* checksum error */ uint64_t udps_nosum; /* no checksum */ uint64_t udps_badlen; /* data length larger than packet */ uint64_t udps_noport; /* no socket on port */ uint64_t udps_noportbcast; /* of above, arrived as broadcast */ uint64_t udps_fullsock; /* not delivered, input socket full */ uint64_t udpps_pcbcachemiss; /* input packets missing pcb cache */ uint64_t udpps_pcbhashmiss; /* input packets not for hashed pcb */ /* output statistics: */ uint64_t udps_opackets; /* total output packets */ uint64_t udps_fastout; /* output packets on fast path */ /* of no socket on port, arrived as multicast */ uint64_t udps_noportmcast; uint64_t udps_filtermcast; /* blocked by multicast filter */ }; #ifdef _KERNEL #include VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define UDPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val)) #define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_udpstat_inc(int statnum); #define KMOD_UDPSTAT_INC(name) \ kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t)) #endif /* * Identifiers for UDP sysctl nodes. 
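/*
 * The KMOD_UDPSTAT_INC() accessor above converts a field name into an
 * array index by dividing the field's byte offset by the width of one
 * counter.  A standalone restatement of the trick (struct and names
 * are illustrative):
 */
#include <stddef.h>
#include <stdint.h>

struct demo_stats {
	uint64_t in_pkts;	/* index 0 */
	uint64_t out_pkts;	/* index 1 */
	uint64_t drops;		/* index 2 */
};

#define DEMO_STAT_INDEX(field) \
	(offsetof(struct demo_stats, field) / sizeof(uint64_t))

/* counters[DEMO_STAT_INDEX(drops)]++ bumps the third slot. */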
*/ #define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ #define UDPCTL_STATS 2 /* statistics (read-only) */ #define UDPCTL_MAXDGRAM 3 /* max datagram size */ #define UDPCTL_RECVSPACE 4 /* default receive buffer space */ #define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ #ifdef _KERNEL #include SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; VNET_DECLARE(struct inpcbhead, udb); VNET_DECLARE(struct inpcbinfo, udbinfo); VNET_DECLARE(struct inpcbhead, ulitecb); VNET_DECLARE(struct inpcbinfo, ulitecbinfo); #define V_udb VNET(udb) #define V_udbinfo VNET(udbinfo) #define V_ulitecb VNET(ulitecb) #define V_ulitecbinfo VNET(ulitecbinfo) extern u_long udp_sendspace; extern u_long udp_recvspace; VNET_DECLARE(int, udp_cksum); VNET_DECLARE(int, udp_blackhole); #define V_udp_cksum VNET(udp_cksum) #define V_udp_blackhole VNET(udp_blackhole) extern int udp_log_in_vain; static __inline struct inpcbinfo * udp_get_inpcbinfo(int protocol) { return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo; } static __inline struct inpcbhead * udp_get_pcblist(int protocol) { return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb; } int udp_newudpcb(struct inpcb *); void udp_discardcb(struct udpcb *); void udp_ctlinput(int, struct sockaddr *, void *); void udplite_ctlinput(int, struct sockaddr *, void *); int udp_ctloutput(struct socket *, struct sockopt *); void udp_init(void); void udplite_init(void); -#ifdef VIMAGE -void udp_destroy(void); -void udplite_destroy(void); -#endif int udp_input(struct mbuf **, int *, int); void udplite_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); int udp_shutdown(struct socket *so); int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx); #endif /* _KERNEL */ #endif /* _NETINET_UDP_VAR_H_ */ Index: projects/vnet/sys/netinet6/in6.c =================================================================== --- projects/vnet/sys/netinet6/in6.c (revision 294565) +++ projects/vnet/sys/netinet6/in6.c (revision 294566) @@ -1,2524 +1,2524 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California.  All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some constant IP6 addresses.
*/ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; /* Announce arrival of local address to all FIBs. */ rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. 
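/*
 * Illustrative inverse of in6_mask2len() above: build the contiguous
 * netmask for a given prefix length.  The helper is hypothetical (the
 * kernel has a similar in6_prefixlen2mask()), but the bit layout
 * matches what in6_mask2len() accepts.
 */
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

static int
plen2mask(struct in6_addr *mask, int plen)
{
	int i;

	if (plen < 0 || plen > 128)
		return (-1);
	memset(mask, 0, sizeof(*mask));
	/* Whole 0xff bytes first, then the partial leading-ones byte. */
	for (i = 0; i < plen / 8; i++)
		mask->s6_addr[i] = 0xff;
	if (plen % 8)
		mask->s6_addr[i] = (uint8_t)(0xff << (8 - plen % 8));
	return (0);
}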
* We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCSIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. 
*/ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; case SIOCSIFALIFETIME_IN6: { struct in6_addrlifetime *lt; if (td != NULL) { error = priv_check(td, PRIV_NETINET_ALIFETIME6); if (error) goto out; } if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* sanity for overflow - beware unsigned */ lt = &ifr->ifr_ifru.ifru_lifetime; if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME && lt->ia6t_vltime + time_uptime < time_uptime) { error = EINVAL; goto out; } if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME && lt->ia6t_pltime + time_uptime < time_uptime) { error = EINVAL; goto out; } break; } } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? 
*/ ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCSIFALIFETIME_IN6: ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime; /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) ifa_free(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specifies a 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa()). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses.
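/*
 * The maxexpire expression in the SIOCGIFALIFETIME_IN6 handling above
 * computes the largest positive time_t without overflowing: all bits
 * set except the sign bit.  A standalone restatement, assuming the
 * usual signed two's-complement time_t, mirroring the kernel's
 * expression:
 */
#include <time.h>

static time_t
max_time(void)
{
	/* (-1) is all ones; clearing the top bit leaves 0x7fff...ffff. */
	return ((time_t)-1 & ~((time_t)1 << (sizeof(time_t) * 8 - 1)));
}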
* The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); goto out; } if (pr == NULL) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); log(LOG_ERR, "nd6_prelist_add succeeded but " "no prefix\n"); error = EINVAL; goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_refcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: if (error != 0 || ia == NULL) break; /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_refcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr && pr->ndpr_refcnt == 0) prelist_remove(pr); EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. 
*/ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). 
*/ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this may be redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non-zero) prefix length (mask) for a new address. * We also require that the prefix mask (when specified) is valid, and * thus reject a non-contiguous mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* the user omitted the zone ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface.
If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called while processing a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric and flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes.
We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for a new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do the link-level ifa job: * 1) Add an lle entry for the added address * 2) Notify routing socket users about the new address * 3) Join the appropriate multicast group * 4) Start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed.
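 * (IFA_RTSELF alone records whether the loopback route was installed,
 * so it is sufficient to decide whether there is one to remove.)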
*/ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | ((ia->ia_dstaddr.sin6_family == AF_INET6) ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_refcnt--; /* Do not delete lles within prefix if refcnt != 0 */ if (ia->ia6_ndpr->ndpr_refcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notify other subsystems about address change/arrival: * 1) Notify the device handler on the first IPv6 address assignment * 2) Handle routing table changes for p2p link routes * 3) Handle routing table changes for the address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address. */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) return (error); } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_notify_ifa: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway...
*/ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) return (error); ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if one does not exist */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the internet address corresponding to a given address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); - if_addr_rlock(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } - if_addr_runlock(ifp); + IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long.
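 *
 * A typical call pattern, with the buffer supplied by the caller (a
 * sketch only; the surrounding names are illustrative and the block
 * is not built):
 */
#if 0	/* illustrative sketch only */
	char ip6buf[INET6_ADDRSTRLEN];

	log(LOG_INFO, "address %s\n",
	    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr));
#endif
/*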
*/ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... 
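 * (that is, return the length in bits of the longest common prefix of
 * the two addresses; the loop below is hard coded for the 16-byte IPv6
 * address size.  A sketch of the expected result, not built:)
 */
#if 0	/* illustrative sketch only */
	struct in6_addr a;	/* 2001:db8::1 */
	struct in6_addr b;	/* 2001:db8:0:8000::1 */

	/*
	 * in6_matchlen(&a, &b) == 48: the addresses agree on the first
	 * three 16-bit groups and differ in the first bit of the fourth.
	 */
#endif
/*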
*/ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = 0; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? 
*/ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that is, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionally, users may try to assign addresses * before the interface becomes up (or running). * This function returns EAGAIN in that case. * The caller should mark "tentative" on the address instead of * performing DAD immediately. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (EAGAIN); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in the "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC.
treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_IEEE80211: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_FDDI: /* RFC2467 */ return (64); case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_ARCNET: /* RFC2497 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. * Called by LLE_FREE_LOCKED when the number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched.
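 * For example, when the prefix route for 2001:db8::/64 is deleted,
 * addr is the local address within that prefix, and only the
 * LLE_IFADDR entry for that exact address qualifies below.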
*/ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; int error; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* Our local addresses are always only installed on the default FIB. */ sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); error = fib6_lookup_nh_basic(RT_DEFAULT_FIB, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { ifa_free(ifa); return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. 
*/ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { lltable_set_entry_addr(ifp, lle, IF_LLADDR(ifp)); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_alen = ifp->if_addrlen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; 
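/* Publish the initialized table on the global lltable list. */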
lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* These interface types are not IPv6-capable. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: projects/vnet/sys/netinet6/in6_ifattach.c =================================================================== --- projects/vnet/sys/netinet6/in6_ifattach.c (revision 294565) +++ projects/vnet/sys/netinet6/in6_ifattach.c (revision 294566) @@ -1,892 +1,918 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. 
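 * The u/g bits of the digest are then forced to "local" and
 * "individual", so the result cannot be mistaken for an identifier
 * derived from a globally unique EUI64.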
* * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the calculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); bcopy(&val32, ret, sizeof(val32)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface.
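 *
 * (For a 6-byte Ethernet MAC, the EUI64 expansion below inserts ff:fe in
 * the middle and EUI64_TO_IFID() then inverts the u bit, so e.g.
 * 00:11:22:33:44:55 yields the identifier 0211:22ff:fe33:4455.  The same
 * mapping in isolation, as a minimal sketch; the helper name is
 * illustrative only and the block is not built:)
 */
#if 0	/* illustrative sketch only */
static void
mac_to_ifid_sketch(const u_int8_t mac[6], struct in6_addr *in6)
{

	in6->s6_addr[8] = mac[0] ^ EUI64_UBIT;	/* invert universal/local */
	in6->s6_addr[9] = mac[1];
	in6->s6_addr[10] = mac[2];
	in6->s6_addr[11] = 0xff;
	in6->s6_addr[12] = 0xfe;
	in6->s6_addr[13] = mac[3];
	in6->s6_addr[14] = mac[4];
	in6->s6_addr[15] = mac[5];
}
#endif
/*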
* XXX assumes single sockaddr_dl (AF_LINK address) per interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_FDDI: case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: case IFT_IEEE80211: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses a 16-byte address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_ARCNET: if (addrlen != 1) { IF_ADDR_RUNLOCK(ifp); return -1; } if (!addr[0]) { IF_ADDR_RUNLOCK(ifp); return -1; } bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; /* * due to insufficient bitwidth, we mark it local. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source.
this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; int error; /* * configure link-local address. */ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety.
(nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there are no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid, referring to the old link-local address. */ if (nd6_prefix_lookup(&pr0) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address.
*/ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_addr in6; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability of the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { struct ifaddr *ifa; in6 = in6addr_loopback; ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &in6); if (ifa == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } else ifa_free(ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { int error; ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { error = in6_ifattach_linklocal(ifp, altifp); #if 0 if (error) log(LOG_NOTICE, "in6_ifattach_linklocal: " "failed to add a link-local addr to %s\n", if_name(ifp)); #endif } else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback interfaces at this moment. * We don't need this function in bsdi, because interfaces are never removed * from the ifnet list in bsdi. */ void in6_ifdetach(struct ifnet *ifp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* remove neighbor management table */ nd6_purge(ifp); /* * nuke any IPv6 addresses we have * XXX: all addresses should be already removed */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); /* leave all multicast groups joined */ in6_purgemaddrs(ifp); /* * remove neighbor management table. we call it twice just to make * sure we nuke everything. maybe we need just one call. * XXX: since the first call did not release addresses, some prefixes * might remain. We should call nd6_purge() again to release the * prefixes after removing all addresses above. * (Or can we just delay calling nd6_purge until this point?)
*/ nd6_purge(ifp); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in6_multi) purgeinms; struct in6_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN6_MULTI_LOCK(); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, in6m_entry); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, in6m_entry, tinm) { LIST_REMOVE(inm, in6m_entry); in6m_release_locked(inm); } mld_ifdetach(ifp); IN6_MULTI_UNLOCK(); } + +void +in6_ifattach_destroy(void) +{ + + callout_drain(&V_in6_tmpaddrtimer_ch); +} + +static void +in6_ifattach_init(void *dummy) +{ + + /* Timer for regeneration of temporary addresses' randomized ID. */ + callout_init(&V_in6_tmpaddrtimer_ch, 0); + callout_reset(&V_in6_tmpaddrtimer_ch, + (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - + V_ip6_temp_regen_advance) * hz, + in6_tmpaddrtimer, curvnet); +} + +/* + * Cheat. + * This must be after route_init(), which is now SI_ORDER_THIRD. + */ +SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, + in6_ifattach_init, NULL); Index: projects/vnet/sys/netinet6/in6_ifattach.h =================================================================== --- projects/vnet/sys/netinet6/in6_ifattach.h (revision 294565) +++ projects/vnet/sys/netinet6/in6_ifattach.h (revision 294566) @@ -1,46 +1,47 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.h,v 1.14 2001/02/08 12:48:39 jinmei Exp $ * $FreeBSD$ */ #ifndef _NETINET6_IN6_IFATTACH_H_ #define _NETINET6_IN6_IFATTACH_H_ #ifdef _KERNEL void in6_ifattach(struct ifnet *, struct ifnet *); +void in6_ifattach_destroy(void); void in6_ifdetach(struct ifnet *); int in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); void in6_tmpaddrtimer(void *); int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); int in6_nigroup_oldmcprefix(struct ifnet *, const char *, int, struct in6_addr *); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_IFATTACH_H_ */ Index: projects/vnet/sys/netinet6/in6_proto.c =================================================================== --- projects/vnet/sys/netinet6/in6_proto.c (revision 294565) +++ projects/vnet/sys/netinet6/in6_proto.c (revision 294566) @@ -1,627 +1,624 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_sctp.h" #include "opt_mpath.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ FEATURE(inet6, "Internet Protocol version 6"); extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 /* Spacer for loadable protocols. */ #define IP6PROTOSPACER \ { \ .pr_domain = &inet6domain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, -#ifdef VIMAGE - .pr_destroy = ip6_destroy, -#endif .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, #ifndef INET /* Do not call initialization twice. 
*/ .pr_init = udp_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization and timeout routines twice */ .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, #endif .pr_drain = tcp_drain, .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, #ifndef INET /* Do not call initialization twice. */ .pr_init = sctp_init, #endif .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udplite6_ctlinput, .pr_ctloutput = udp_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udplite_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = rip_init, #endif .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_fasttimo = icmp6_fasttimo, .pr_slowtimo = icmp6_slowtimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = esp6_ctlinput, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC */ #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, 
.pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, /* Spacer n-times for loadable protocols. */ IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; extern int in6_inithead(void **, int); #ifdef VIMAGE extern int in6_detachhead(void **, int); #endif struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *) &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn6_mpath_inithead, #else .dom_rtattach = in6_inithead, #endif #ifdef VIMAGE .dom_rtdetach = in6_detachhead, #endif .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_ifmtu = in6_domifmtu }; VNET_DOMAIN_SET(inet6); /* * Internet configuration info */ #ifndef IPV6FORWARDING #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif /* GATEWAY6 */ #endif /* !IPV6FORWARDING */ #ifndef IPV6_SENDREDIRECTS #define IPV6_SENDREDIRECTS 1 #endif VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING; /* act as router? */ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ VNET_DEFINE(int, ip6_dad_count) = 1; /* DupAddrDetectionTransmits */ VNET_DEFINE(int, ip6_auto_flowlabel) = 1; VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr * (RFC2462 5.5.4) */ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix * walk list every 5 sec. */ VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */ VNET_DEFINE(int, ip6_v6only) = 1; VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L; #ifdef IPSTEALTH VNET_DEFINE(int, ip6stealth) = 0; #endif VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS * (RFC 4861) */ /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? 
(jinmei@kame.net 19990310) */ VNET_DEFINE(int, pmtu_expire) = 60*10; VNET_DEFINE(int, pmtu_probe) = 60*2; /* ICMPV6 parameters */ VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */ VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ VNET_DEFINE(int, icmp6errppslim) = 100; /* 100pps */ /* control how to respond to NI queries */ VNET_DEFINE(int, icmp6_nodeinfo) = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1; /* * sysctl related items. */ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW, 0, "TCP6"); #ifdef SCTP SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW, 0, "SCTP6"); #endif #ifdef IPSEC SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_preferred_lifetime < V_ip6_desync_factor + V_ip6_temp_regen_advance) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) { V_ip6_temp_valid_lifetime = old; return (EINVAL); } return (error); } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 Router " "Advertisement messages"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, "Default value of per-interface flag to control whether routers " "sending ICMPv6 RA messages on that interface are added into the " "default router list."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0, "Always set 0 to R flag in ICMPv6 NA messages when accepting RA" " on the interface."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, "Accept the default router list from ICMPv6 RA messages even " "when packet forwarding enabled."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, "");
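/*
 * The knobs above all follow one per-VNET pattern: VNET_DEFINE() creates
 * the variable and SYSCTL_INT() with CTLFLAG_VNET plus VNET_NAME() exposes
 * the per-instance copy.  A hedged sketch of adding one more knob (the
 * name ip6_example is hypothetical, not part of this commit):
 */
VNET_DEFINE(int, ip6_example) = 0;
#define	V_ip6_example	VNET(ip6_example)

SYSCTL_INT(_net_inet6_ip6, OID_AUTO, example,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_example), 0,
    "Hypothetical example knob");
/* Kernel code would then read V_ip6_example, which resolves via curvnet. */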
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, ""); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ip6_temp_preferred_lifetime), 0, sysctl_ip6_temppltime, "I", ""); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ip6_temp_valid_lifetime), 0, sysctl_ip6_tempvltime, "I", ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, "Default value of per-interface flag for automatically adding an IPv6" " link-local address to interfaces when attached"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, struct rip6stat, rip6stat, "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, ""); #ifdef IPSTEALTH SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, ""); #endif /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_VNET | CTLFLAG_RW, 
&VNET_NAME(nd6_useloopback), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX, nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, "Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup" " for compatibility with KAME implementation."); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' nd6 NS in compliance with RFC 4861."); Index: projects/vnet/sys/netinet6/ip6_input.c =================================================================== --- projects/vnet/sys/netinet6/ip6_input.c (revision 294565) +++ projects/vnet/sys/netinet6/ip6_input.c (revision 294566) @@ -1,1736 +1,1703 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif -VNET_DECLARE(struct callout, in6_tmpaddrtimer_ch); -#define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) - VNET_DEFINE(struct pfil_head, inet6_pfil_hook); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); -static void ip6_init2(void *); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); TAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. 
*/ V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. 
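Both ip6proto_register() and ip6proto_unregister() key off the same sentinel: ip6_protox[] holds indices into inet6sw[], and a slot equal to the index of the IPPROTO_RAW protosw means "unclaimed", since every unimplemented protocol falls through to raw input. That is why registration fails with EEXIST when the slot differs from the raw index, and unregistration fails with ENOENT when it still equals it.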
*/ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE void ip6_destroy() { int error; if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); nd6_destroy(); - callout_drain(&V_in6_tmpaddrtimer_ch); + in6_ifattach_destroy(); } #endif - -static int -ip6_init2_vnet(const void *unused __unused) -{ - - /* nd6_timer_init */ - callout_init(&V_nd6_timer_ch, 0); - callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); - - /* timer for regeneranation of temporary addresses randomize ID */ - callout_init(&V_in6_tmpaddrtimer_ch, 0); - callout_reset(&V_in6_tmpaddrtimer_ch, - (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - - V_ip6_temp_regen_advance) * hz, - in6_tmpaddrtimer, curvnet); - - return (0); -} - -static void -ip6_init2(void *dummy) -{ - - ip6_init2_vnet(NULL); -} - -/* cheat */ -/* This must be after route_init(), which is now SI_ORDER_THIRD */ -SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); static int ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, &m, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. 
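Stepping back to the teardown change above: what the commit establishes is a strict callout lifecycle, init and arm on attach, drain on destroy. A minimal sketch of that pattern, with hypothetical names:

static struct callout example_ch;

static void
example_tick(void *arg)
{

	/* Periodic work, then re-arm, as in6_tmpaddrtimer() does. */
	callout_reset(&example_ch, hz, example_tick, arg);
}

static void
example_init(void *dummy)
{

	callout_init(&example_ch, 0);
	callout_reset(&example_ch, hz, example_tick, NULL);
}

static void
example_destroy(void)
{

	/*
	 * callout_drain(), not callout_stop(): it additionally waits
	 * for a handler that is already running to finish.
	 */
	callout_drain(&example_ch);
}

callout_drain() is the teardown half that in6_ifattach_destroy() now provides for V_in6_tmpaddrtimer_ch, so ip6_destroy() no longer needs to know about the timer directly.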
*/ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ m->m_flags &= ~M_FASTFWD_OURS; ours = 1; ip6 = mtod(m, struct ip6_hdr *); goto hbhcheck; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC( ip6s_m2m[m->m_pkthdr.rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } /* drop the packet if IPv6 operation is disabled on the IF */ if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; /* ENOBUFS */ } m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6); /* * Check against address spoofing/corruption. 
*/ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip6_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ odst = ip6->ip6_dst; /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passin; if (pfil_run_hooks(&V_inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } if ((m->m_flags & M_IP6_NEXTHOP) && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. 
Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is at least as much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward().
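(An aside on the length fix-up above, as a worked example: with plen = 1000 and a 1048-byte packet, the excess is 1048 - (40 + 1000) = 8 bytes of link-layer padding, and m_adj(m, 40 + 1000 - 1048), i.e. m_adj(m, -8), trims it; a negative count tells m_adj() to trim from the tail of the chain.)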
*/ if (ip6_mforward && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. 
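For orientation on the length arithmetic used here: ip6h_len counts 8-octet units beyond the first eight bytes, so the full header size is (ip6h_len + 1) << 3; for example, ip6h_len = 1 means a 16-byte Hop-by-Hop header, two bytes of struct ip6_hbh plus 14 bytes of options. AH is the odd one out, counted in 4-octet units as (len + 2) << 2 elsewhere in this file.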
*/ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have a non-zero payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options do not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary to return an ICMPv6 error when the IPv6 header and * the option header are not contiguous.
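The dispatch below is driven by the two high-order bits of the option type (RFC 2460 section 4.2), which IP6OPT_TYPE() extracts: 00 means skip over the option, 01 discard the packet silently, 10 discard and always send an ICMPv6 Parameter Problem, and 11 discard and send the ICMPv6 error only if the destination was not multicast.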
*/ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? 
m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to a normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contained in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, although that option * is to be removed before returning, according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and store each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain.
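All of the sbcreatecontrol() chains in this function surface to userland as recvmsg() control messages. A self-contained sketch of the receiving side, using only the standard RFC 3542 API (an illustration, not tied to this file; the helper name is hypothetical):

#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

/* Receive one datagram and pick out the IPV6_PKTINFO control message. */
static int
recv_pktinfo(int s, void *buf, size_t len, struct in6_pktinfo *pi)
{
	struct sockaddr_in6 from;
	char cbuf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg;
	struct cmsghdr *cm;
	int on = 1;

	/* Ask the kernel to attach IPV6_PKTINFO to each datagram. */
	if (setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on,
	    sizeof(on)) == -1)
		return (-1);

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &from;
	msg.msg_namelen = sizeof(from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	if (recvmsg(s, &msg, 0) == -1)
		return (-1);

	/* Walk the control messages the kernel queued above. */
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IPV6 &&
		    cm->cmsg_type == IPV6_PKTINFO) {
			memcpy(pi, CMSG_DATA(cm), sizeof(*pi));
			return (0);
		}
	}
	return (-1);
}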
*/ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(in6p->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } if (in6p->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending sockets disconnected. * NOTE: our implementation doesn't notify connected sockets that has * foreign address that is different than given destination addresses * (this is permitted by RFC 3542). 
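 *
 * (Editor's sketch, compiled out: on the application side the
 * notification built below arrives as IPV6_PATHMTU ancillary data
 * carrying a struct ip6_mtuinfo, per RFC 3542.  Illustrative names
 * only; error handling elided.)
 */
#if 0	/* userland sketch, not kernel code */
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

static uint32_t
wait_for_pmtu_update(int s)
{
	char cbuf[CMSG_SPACE(sizeof(struct ip6_mtuinfo))];
	struct msghdr msg;
	struct cmsghdr *cm;
	struct ip6_mtuinfo mtuinfo;
	int on = 1;

	setsockopt(s, IPPROTO_IPV6, IPV6_RECVPATHMTU, &on, sizeof(on));
	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	/* A path MTU notification is delivered as a zero-length datagram. */
	if (recvmsg(s, &msg, 0) < 0)
		return (0);
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IPV6 &&
		    cm->cmsg_type == IPV6_PATHMTU) {
			memcpy(&mtuinfo, CMSG_DATA(cm), sizeof(mtuinfo));
			return (mtuinfo.ip6m_mtu);
		}
	}
	return (0);
}
#endif
/*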
*/ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (elen > MLEN) n = m_getcl(M_NOWAIT, MT_DATA, 0); else n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) return NULL; m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. * XXX: This function supposes that * M includes all headers, * the next header field and the header length field of each header * are valid, and * the sum of each header length equals to OFF. * Because of these assumptions, this function must be called very * carefully. Moreover, it will not be used in the near future when * we develop `neater' mechanism to process extension headers. */ char * ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return (&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return (&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. 
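 *
 * (Editor's note: callers usually do not step through headers with
 * ip6_nexthdr() directly but use the ip6_lasthdr() wrapper below,
 * along the lines of
 *
 *	int nxt, off;
 *
 *	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 *	if (off < 0)
 *		goto bad;
 *	// nxt now holds the upper-layer protocol, off its offset
 *
 * so a non-initial fragment simply terminates the walk at the
 * fragment header rather than being parsed through.)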
*/ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; Index: projects/vnet/sys/netinet6/ip6_mroute.c =================================================================== --- projects/vnet/sys/netinet6/ip6_mroute.c (revision 294565) +++ projects/vnet/sys/netinet6/ip6_mroute.c (revision 294566) @@ -1,1969 +1,1969 @@ /*- * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 * BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry"); static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *); static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int set_pim6(int *); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *); extern int in6_mcast_loop; extern struct domain inet6domain; static const struct encaptab *pim6_encap_cookie; static const struct protosw in6_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; static int pim6_encapcheck(const struct mbuf *, int, int, void *); static VNET_DEFINE(int, ip6_mrouter_ver) = 0; #define V_ip6_mrouter_ver VNET(ip6_mrouter_ver) SYSCTL_DECL(_net_inet6); SYSCTL_DECL(_net_inet6_ip6); static SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); static struct mrt6stat mrt6stat; SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW, &mrt6stat, mrt6stat, "Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)"); #define MRT6STAT_INC(name) mrt6stat.name += 1 #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 static struct mtx mrouter6_mtx; #define MROUTER6_LOCK() mtx_lock(&mrouter6_mtx) #define MROUTER6_UNLOCK() mtx_unlock(&mrouter6_mtx) #define MROUTER6_LOCK_ASSERT() do { \ mtx_assert(&mrouter6_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MROUTER6_LOCK_INIT() \ mtx_init(&mrouter6_mtx, "IPv6 multicast forwarding", NULL, MTX_DEF) #define MROUTER6_LOCK_DESTROY() mtx_destroy(&mrouter6_mtx) static struct mf6c *mf6ctable[MF6CTBLSIZ]; SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD, &mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]", "IPv6 Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], " "netinet6/ip6_mroute.h)"); static struct mtx mfc6_mtx; #define MFC6_LOCK() mtx_lock(&mfc6_mtx) #define MFC6_UNLOCK() mtx_unlock(&mfc6_mtx) #define MFC6_LOCK_ASSERT() do { \ mtx_assert(&mfc6_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MFC6_LOCK_INIT() \ mtx_init(&mfc6_mtx, "IPv6 multicast forwarding cache", NULL, MTX_DEF) #define MFC6_LOCK_DESTROY() mtx_destroy(&mfc6_mtx) static u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; static int sysctl_mif6table(SYSCTL_HANDLER_ARGS) { struct mif6_sctl *out; int error; out = malloc(sizeof(struct mif6_sctl) * MAXMIFS, M_TEMP, M_WAITOK); for (int i = 0; i < MAXMIFS; i++) { out[i].m6_flags = mif6table[i].m6_flags; out[i].m6_rate_limit = mif6table[i].m6_rate_limit; out[i].m6_lcl_addr = mif6table[i].m6_lcl_addr; if (mif6table[i].m6_ifp != NULL) out[i].m6_ifp = mif6table[i].m6_ifp->if_index; else out[i].m6_ifp = 0; out[i].m6_pkt_in = mif6table[i].m6_pkt_in; 
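		/*
		 * The remaining counters are copied the same way: each
		 * struct mif6 is exported as a fixed-layout struct
		 * mif6_sctl, with the ifnet pointer already translated
		 * to an interface index above so userland never sees a
		 * raw kernel pointer.
		 */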
out[i].m6_pkt_out = mif6table[i].m6_pkt_out; out[i].m6_bytes_in = mif6table[i].m6_bytes_in; out[i].m6_bytes_out = mif6table[i].m6_bytes_out; } error = SYSCTL_OUT(req, out, sizeof(struct mif6_sctl) * MAXMIFS); free(out, M_TEMP); return (error); } SYSCTL_PROC(_net_inet6_ip6, OID_AUTO, mif6table, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_mif6table, "S,mif6_sctl[MAXMIFS]", "IPv6 Multicast Interfaces (struct mif6_sctl[MAXMIFS], " "netinet6/ip6_mroute.h)"); static struct mtx mif6_mtx; #define MIF6_LOCK() mtx_lock(&mif6_mtx) #define MIF6_UNLOCK() mtx_unlock(&mif6_mtx) #define MIF6_LOCK_ASSERT() mtx_assert(&mif6_mtx, MA_OWNED) #define MIF6_LOCK_INIT() \ mtx_init(&mif6_mtx, "IPv6 multicast interfaces", NULL, MTX_DEF) #define MIF6_LOCK_DESTROY() mtx_destroy(&mif6_mtx) #ifdef MRT6DEBUG static VNET_DEFINE(u_int, mrt6debug) = 0; /* debug level */ #define V_mrt6debug VNET(mrt6debug) #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #define DEBUG_ERR 0x80 #define DEBUG_ANY 0x7f #define MRT6_DLOG(m, fmt, ...) \ if (V_mrt6debug & (m)) \ log(((m) & DEBUG_ERR) ? LOG_ERR: LOG_DEBUG, \ "%s: " fmt "\n", __func__, ##__VA_ARGS__) #else #define MRT6_DLOG(m, fmt, ...) #endif static void expire_upcalls(void *); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * XXX TODO: maintain a count to if_allmulti() calls in struct ifnet. */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). Different from IPv4 register_if, * these interfaces are linked into the system ifnet list, * because per-interface IPv6 statistics are maintained in * ifp->if_afdata. But it does not have any routes point * to them. I.e., packets can't be sent this way. They * only exist as a placeholder for multicast source * verification. */ static struct ifnet *multicast_register_if6; #define ENCAP_HOPS 64 /* * Private variables. */ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RW, &pim6stat, pim6stat, "PIM Statistics (struct pim6stat, netinet6/pim6_var.h)"); #define PIM6STAT_INC(name) pim6stat.name += 1 static VNET_DEFINE(int, pim6); #define V_pim6 VNET(pim6) /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. */ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ MRT6STAT_INC(mrt6s_mfc_misses); \ } \ } while (/*CONSTCOND*/ 0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code * XXX: replace with timersub() ? 
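 *
 * (For reference, timersub(9) from <sys/time.h> would express the
 * same computation as
 *
 *	struct timeval res;
 *
 *	timersub(&a, &b, &res);
 *	delta = res.tv_sec * 1000000 + res.tv_usec;
 *
 * TV_DELTA below merely special-cases differences of one or two
 * seconds to avoid the multiplication on the common path.)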
*/ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (/*CONSTCOND*/ 0) /* XXX: replace with timercmp(a, b, <) ? */ #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 static u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int ip6_mrouter_init(struct socket *, int, int); static int add_m6fc(struct mf6cctl *); static int add_m6if(struct mif6ctl *); static int del_m6fc(struct mf6cctl *); static int del_m6if(mifi_t *); static int del_m6if_locked(mifi_t *); static int get_mif6_cnt(struct sioc_mif_req6 *); static int get_sg_cnt(struct sioc_sg_req6 *); static struct callout expire_upcalls_ch; int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); int X_ip6_mrouter_done(void); int X_ip6_mrouter_set(struct socket *, struct sockopt *); int X_ip6_mrouter_get(struct socket *, struct sockopt *); int X_mrt6_ioctl(u_long, caddr_t); /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ int X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt) { int error = 0; int optval; struct mif6ctl mifc; struct mf6cctl mfcc; mifi_t mifi; if (so != V_ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EPERM); switch (sopt->sopt_name) { case MRT6_INIT: #ifdef MRT6_OINIT case MRT6_OINIT: #endif error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = ip6_mrouter_init(so, optval, sopt->sopt_name); break; case MRT6_DONE: error = X_ip6_mrouter_done(); break; case MRT6_ADD_MIF: error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc)); if (error) break; error = add_m6if(&mifc); break; case MRT6_ADD_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = add_m6fc(&mfcc); break; case MRT6_DEL_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = del_m6fc(&mfcc); break; case MRT6_DEL_MIF: error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi)); if (error) break; error = del_m6if(&mifi); break; case MRT6_PIM: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = set_pim6(&optval); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Handle MRT getsockopt commands */ int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt) { int error = 0; if (so != V_ip6_mrouter) return (EACCES); switch (sopt->sopt_name) { case MRT6_PIM: error = sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)); break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int X_mrt6_ioctl(u_long cmd, caddr_t data) { int ret; ret = EINVAL; switch (cmd) { case SIOCGETSGCNT_IN6: ret = get_sg_cnt((struct sioc_sg_req6 *)data); break; case SIOCGETMIFCNT_IN6: ret = get_mif6_cnt((struct sioc_mif_req6 *)data); break; default: break; } return (ret); } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req6 *req) { struct mf6c *rt; int ret; ret = 0; MFC6_LOCK(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); if (rt == NULL) { ret = ESRCH; } else { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; req->wrong_if = rt->mf6c_wrong_if; } MFC6_UNLOCK(); return 
(ret); } /* * returns the input and output packet and byte counts on the mif provided */ static int get_mif6_cnt(struct sioc_mif_req6 *req) { mifi_t mifi; int ret; ret = 0; mifi = req->mifi; MIF6_LOCK(); if (mifi >= nummifs) { ret = EINVAL; } else { req->icount = mif6table[mifi].m6_pkt_in; req->ocount = mif6table[mifi].m6_pkt_out; req->ibytes = mif6table[mifi].m6_bytes_in; req->obytes = mif6table[mifi].m6_bytes_out; } MIF6_UNLOCK(); return (ret); } static int set_pim6(int *i) { if ((*i != 1) && (*i != 0)) return (EINVAL); V_pim6 = *i; return (0); } /* * Enable multicast routing */ static int ip6_mrouter_init(struct socket *so, int v, int cmd) { MRT6_DLOG(DEBUG_ANY, "so_type = %d, pr_protocol = %d", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_ICMPV6) return (EOPNOTSUPP); if (v != 1) return (ENOPROTOOPT); MROUTER6_LOCK(); if (V_ip6_mrouter != NULL) { MROUTER6_UNLOCK(); return (EADDRINUSE); } V_ip6_mrouter = so; V_ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); bzero((caddr_t)n6expire, sizeof(n6expire)); V_pim6 = 0;/* used for stubbing out/in pim stuff */ callout_init(&expire_upcalls_ch, 0); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); MROUTER6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "finished"); return (0); } /* * Disable IPv6 multicast forwarding. */ int X_ip6_mrouter_done(void) { mifi_t mifi; u_long i; struct mf6c *rt; struct rtdetq *rte; MROUTER6_LOCK(); if (V_ip6_mrouter == NULL) { MROUTER6_UNLOCK(); return (EINVAL); } /* * For each phyint in use, disable promiscuous reception of all IPv6 * multicasts. */ for (mifi = 0; mifi < nummifs; mifi++) { if (mif6table[mifi].m6_ifp && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if_allmulti(mif6table[mifi].m6_ifp, 0); } } bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; V_pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); /* * Free all multicast forwarding cache entries. */ MFC6_LOCK(); for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; while (rt) { struct mf6c *frt; for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } frt = rt; rt = rt->mf6c_next; free(frt, M_MRTABLE6); } } bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); MFC6_UNLOCK(); /* * Reset register interface */ if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } V_ip6_mrouter = NULL; V_ip6_mrouter_ver = 0; MROUTER6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "finished"); return (0); } static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; /* * Add a mif to the mif table */ static int add_m6if(struct mif6ctl *mifcp) { struct mif6 *mifp; struct ifnet *ifp; int error; MIF6_LOCK(); if (mifcp->mif6c_mifi >= MAXMIFS) { MIF6_UNLOCK(); return (EINVAL); } mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp != NULL) { MIF6_UNLOCK(); return (EADDRINUSE); /* XXX: is it appropriate? */ } if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index) { MIF6_UNLOCK(); return (ENXIO); } ifp = ifnet_byindex(mifcp->mif6c_pifi); if (mifcp->mif6c_flags & MIFF_REGISTER) { if (reg_mif_num == (mifi_t)-1) { ifp = if_alloc(IFT_OTHER); if_initname(ifp, "register_mif", 0); ifp->if_flags |= IFF_LOOPBACK; if_attach(ifp); multicast_register_if6 = ifp; reg_mif_num = mifcp->mif6c_mifi; /* * it is impossible to guess the ifindex of the * register interface. 
So mif6c_pifi is automatically * calculated. */ mifcp->mif6c_pifi = ifp->if_index; } else { ifp = multicast_register_if6; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { MIF6_UNLOCK(); return (EOPNOTSUPP); } error = if_allmulti(ifp, 1); if (error) { MIF6_UNLOCK(); return (error); } } mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; MIF6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "mif #%d, phyint %s", mifcp->mif6c_mifi, if_name(ifp)); return (0); } /* * Delete a mif from the mif table */ static int del_m6if_locked(mifi_t *mifip) { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; MIF6_LOCK_ASSERT(); if (*mifip >= nummifs) return (EINVAL); if (mifp->m6_ifp == NULL) return (EINVAL); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* XXX: TODO: Maintain an ALLMULTI refcount in struct ifnet. */ ifp = mifp->m6_ifp; if_allmulti(ifp, 0); } else { if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } } bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; MRT6_DLOG(DEBUG_ANY, "mif %d, nummifs %d", *mifip, nummifs); return (0); } static int del_m6if(mifi_t *mifip) { int cc; MIF6_LOCK(); cc = del_m6if_locked(mifip); MIF6_UNLOCK(); return (cc); } /* * Add an mfc entry */ static int add_m6fc(struct mf6cctl *mfccp) { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; MFC6_LOCK(); MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { MRT6_DLOG(DEBUG_MFC, "no upcall o %s g %s p %x", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; MFC6_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); MRT6_DLOG(DEBUG_MFC, "o %s g %s p %x dbg %p", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed 
at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip6_mdq(rte->m, rte->ifp, rt); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE6); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { MRT6_DLOG(DEBUG_MFC, "no upcall h %lu o %s g %s p %x", hash, ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { MFC6_UNLOCK(); return (ENOBUFS); } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } MFC6_UNLOCK(); return (0); } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(struct timeval *t) { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(struct mf6cctl *mfccp) { #ifdef MRT6DEBUG char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; #endif struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); MRT6_DLOG(DEBUG_MFC, "orig %s mcastgrp %s", ip6_sprintf(ip6bufo, &origin.sin6_addr), ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr)); MFC6_LOCK(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { MFC6_UNLOCK(); return (EADDRNOTAVAIL); } *nptr = rt->mf6c_next; free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (0); } static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src) { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return (0); } } m_freem(mm); return (-1); } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. 
* * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. * * NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff * this function is called in the originating context (i.e., not when * forwarding a packet from other node). ip6_output(), which is currently the * only function that calls this function is called in the originating context, * explicitly ensures this condition. It is caller's responsibility to ensure * that if this function is called from somewhere else in the originating * context in the future. */ int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m) { struct rtdetq *rte; struct mbuf *mb0; struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; u_long hash; mifi_t mifi; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif /* UPCALL_TIMING */ MRT6_DLOG(DEBUG_FORWARD, "src %s, dst %s, ifindex %d", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp->if_index); /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return (0); ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. * This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { IP6STAT_INC(ip6s_cantforward); if (V_ip6_log_time + V_ip6_log_interval < time_uptime) { V_ip6_log_time = time_uptime; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } return (0); } MFC6_LOCK(); /* * Determine forwarding mifs from the forwarding cache table */ MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); MRT6STAT_INC(mrt6s_mfc_lookups); /* Entry exists, so forward if necessary */ if (rt) { MFC6_UNLOCK(); return (ip6_mdq(m, ifp, rt)); } /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon. */ MRT6STAT_INC(mrt6s_no_route); MRT6_DLOG(DEBUG_FORWARD | DEBUG_MFC, "no rte s %s g %s", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. */ rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6, M_NOWAIT); if (rte == NULL) { MFC6_UNLOCK(); return (ENOBUFS); } mb0 = m_copy(m, 0, M_COPYALL); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } /* is there an upcall waiting for this packet? 
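 *
 * (Editor's sketch, compiled out: the MRT6MSG_NOCACHE upcall built a
 * few lines below is read by the multicast routing daemon from the
 * same raw ICMPv6 socket it used for MRT6_INIT, and is answered with
 * MRT6_ADD_MFC, completing the stalled entry.  Names, error handling
 * and the outgoing-mif policy are illustrative only.)
 */
#if 0	/* userland sketch, not kernel code */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet6/ip6_mroute.h>
#include <string.h>
#include <unistd.h>

static void
mrouting_daemon_loop(void)
{
	char buf[2048];
	struct mrt6msg *im;
	struct mf6cctl mfcc;
	int s, v = 1;
	ssize_t n;

	s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	/* Become the (single) IPv6 multicast router for this kernel. */
	setsockopt(s, IPPROTO_IPV6, MRT6_INIT, &v, sizeof(v));
	for (;;) {
		n = read(s, buf, sizeof(buf));
		if (n < (ssize_t)sizeof(struct mrt6msg))
			continue;
		im = (struct mrt6msg *)buf;
		if (im->im6_msgtype != MRT6MSG_NOCACHE)
			continue;
		/* Install a forwarding cache entry for this (S,G). */
		memset(&mfcc, 0, sizeof(mfcc));
		mfcc.mf6cc_origin.sin6_family = AF_INET6;
		mfcc.mf6cc_origin.sin6_len = sizeof(struct sockaddr_in6);
		mfcc.mf6cc_origin.sin6_addr = im->im6_src;
		mfcc.mf6cc_mcastgrp.sin6_family = AF_INET6;
		mfcc.mf6cc_mcastgrp.sin6_len = sizeof(struct sockaddr_in6);
		mfcc.mf6cc_mcastgrp.sin6_addr = im->im6_dst;
		mfcc.mf6cc_parent = im->im6_mif;
		/* ... IF_SET() the chosen outgoing mifs in mf6cc_ifset ... */
		setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MFC,
		    &mfcc, sizeof(mfcc));
	}
}
#endif
/*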
*/ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); MFC6_UNLOCK(); return (ENOBUFS); } /* * Make a copy of the header to send to the user * level process */ mm = m_copy(mb0, 0, sizeof(struct ip6_hdr)); if (mm == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im = NULL; #ifdef MRT6_OINIT oim = NULL; #endif switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (EINVAL); } MRT6_DLOG(DEBUG_FORWARD, "getting the iif info in the kernel"); for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; break; #endif case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); MRT6STAT_INC(mrt6s_upq_sockfull); free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } MRT6STAT_INC(mrt6s_upcalls); /* insert new entry at head of hash chain */ bzero(rt, sizeof(*rt)); rt->mf6c_origin.sin6_family = AF_INET6; rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_origin.sin6_addr = ip6->ip6_src; rt->mf6c_mcastgrp.sin6_family = AF_INET6; rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) if (++npkts > MAX_UPQ6) { MRT6STAT_INC(mrt6s_upq_ovflw); free(rte, M_MRTABLE6); m_freem(mb0); MFC6_UNLOCK(); return (0); } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif /* UPCALL_TIMING */ MFC6_UNLOCK(); return (0); } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. 
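 *
 * (More precisely: the callout is rearmed below with EXPIRE_TIMEOUT,
 * defined above as hz / 4, so the scan actually runs four times a
 * second, and a pending upcall survives UPCALL_EXPIRE such scans.)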
*/ static void expire_upcalls(void *unused) { #ifdef MRT6DEBUG char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; #endif struct rtdetq *rte; struct mf6c *mfc, **nptr; u_long i; MFC6_LOCK(); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { MRT6_DLOG(DEBUG_EXPIRE, "expiring (%s %s)", ip6_sprintf(ip6bufo, &mfc->mf6c_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfc->mf6c_mcastgrp.sin6_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } while (rte != NULL); MRT6STAT_INC(mrt6s_cache_cleanups); n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE6); } else { nptr = &mfc->mf6c_next; } } } MFC6_UNLOCK(); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; struct in6_addr src0, dst0; /* copies for local work */ u_int32_t iszone, idzone, oszone, odzone; int error = 0; /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (/*CONSTCOND*/ 0) /* * Don't forward if it didn't arrive from the parent mif * for its origin. */ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ MRT6_DLOG(DEBUG_FORWARD, "wrong if: ifid %d mifi %d mififid %x", ifp->if_index, mifi, mif6table[mifi].m6_ifp->if_index); MRT6STAT_INC(mrt6s_wrong_if); rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) if (V_pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... 
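 *
 * (The MRT6MSG_WRONGMIF upcall built here tells the routing daemon
 * that traffic for a known (S,G) arrived on a mif other than its RPF
 * parent; a PIM-SM daemon uses that to drive its Assert election,
 * which is why looped-back copies must not trigger it.)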
*/ static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mbuf *mm; struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif mm = m_copy(m, 0, sizeof(struct ip6_hdr)); if (mm && (!M_WRITABLE(mm) || mm->m_len < sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return (ENOBUFS); #ifdef MRT6_OINIT oim = NULL; #endif im = NULL; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return (EINVAL); } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; #endif case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } MRT6STAT_INC(mrt6s_upcalls); if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full"); MRT6STAT_INC(mrt6s_upq_sockfull); return (ENOBUFS); } /* if socket Q full */ } /* if PIM */ return (0); } /* if wrong iif */ /* If I sourced this packet, it counts as output, else it was input. */ if (m->m_pkthdr.rcvif == NULL) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ src0 = ip6->ip6_src; dst0 = ip6->ip6_dst; if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { IP6STAT_INC(ip6s_badscope); return (error); } for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break * a scope boundary. * XXX For packets through PIM register tunnel * interface, we believe a routing daemon. */ if (!(mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if (in6_setscope(&src0, mif6table[mifi].m6_ifp, &oszone) || in6_setscope(&dst0, mif6table[mifi].m6_ifp, &odzone) || iszone != oszone || idzone != odzone) { IP6STAT_INC(ip6s_badscope); continue; } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } } return (0); } static void phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m) { #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; u_long linkmtu; /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. */ mb_copy = m_copy(m, 0, M_COPYALL); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { return; } /* set MCAST flag to the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may devide * the packet into fragments when the packet is too big for the * outgoing interface. * Otherwise, we can simply send the packet to the interface * sending queue. 
*/ if (m->m_pkthdr.rcvif == NULL) { struct ip6_moptions im6o; im6o.im6o_multicast_ifp = ifp; /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, NULL, IPV6_FORWARDING, &im6o, NULL, NULL); MRT6_DLOG(DEBUG_XMIT, "mif %u err %d", (uint16_t)(mifp - mif6table), error); return; } /* * If configured to loop back multicasts by default, * loop back a copy now. */ if (in6_mcast_loop) ip6_mloopback(ifp, m); /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ linkmtu = IN6_LINKMTU(ifp); if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) { struct sockaddr_in6 dst6; bzero(&dst6, sizeof(dst6)); dst6.sin6_len = sizeof(struct sockaddr_in6); dst6.sin6_family = AF_INET6; dst6.sin6_addr = ip6->ip6_dst; IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6); /* * We just call if_output instead of nd6_output here, since * we need no ND for a multicast forwarded packet...right? */ m_clrprotoflags(m); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&dst6, NULL); MRT6_DLOG(DEBUG_XMIT, "mif %u err %d", (uint16_t)(mifp - mif6table), error); } else { /* * pMTU discovery is intentionally disabled by default, since * various router may notify pMTU in multicast, which can be * a DDoS to a router */ if (V_ip6_mcast_pmtu) icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu); else { MRT6_DLOG(DEBUG_XMIT, " packet too big on %s o %s " "g %s size %d (discarded)", if_name(ifp), ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), mb_copy->m_pkthdr.len); m_freem(mb_copy); /* simply discard the packet */ } } } static int register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m) { #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif struct mbuf *mm; int i, len = m->m_pkthdr.len; static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mrt6msg *im6; MRT6_DLOG(DEBUG_ANY, "src %s dst %s", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); PIM6STAT_INC(pim6s_snd_registers); /* Make a copy of the packet to send to the user level process. */ mm = m_gethdr(M_NOWAIT, MT_DATA); if (mm == NULL) return (ENOBUFS); mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); if ((mm->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { m_freem(mm); return (ENOBUFS); } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return (ENOBUFS); /* TODO: check it! */ mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. encap.n */ MRT6STAT_INC(mrt6s_upcalls); if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full"); MRT6STAT_INC(mrt6s_upq_sockfull); return (ENOBUFS); } return (0); } /* * pim6_encapcheck() is called by the encap6_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int pim6_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. 
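 *
 * (encap_attach_func() treats this return value as a match score:
 * zero declines the datagram, and among the registered checkers the
 * one returning the largest positive value receives the packet, so
 * 64 here marks a full protocol match for in6_pim_protosw.)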
*/ } /* * PIM sparse mode hook * Receives the pim control messages, and passes them up to the listening * socket, using rip6_input. * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. */ int pim6_input(struct mbuf **mp, int *offp, int proto) { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; struct mbuf *m = *mp; int minlen; int off = *offp; PIM6STAT_INC(pim6s_rcv_total); ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); MRT6_DLOG(DEBUG_PIM, "PIM packet too short"); m_freem(m); return (IPPROTO_DONE); } /* * if the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers in contiguous memory, and * possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { PIM6STAT_INC(pim6s_rcv_tooshort); return (IPPROTO_DONE); } #endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { PIM6STAT_INC(pim6s_rcv_badsum); MRT6_DLOG(DEBUG_PIM, "invalid checksum"); m_freem(m); return (IPPROTO_DONE); } } #endif /* PIM_CHECKSUM */ /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { PIM6STAT_INC(pim6s_rcv_badversion); MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "incorrect version %d, expecting %d", pim->pim_ver, PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon. 
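 *
 * On the wire the REGISTER is laid out (outer to inner) as
 *
 *	+---------------+---------+----------------+---------------+
 *	| outer ip6_hdr | pim hdr | reghdr (32bit) | inner ip6_hdr |
 *	+---------------+---------+----------------+---------------+
 *
 * which is where PIM6_REG_MINLEN (pimhdr + reghdr + eip6hdr) above
 * comes from; the copy made below covers exactly these headers.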
*/ static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; int rc; #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif PIM6STAT_INC(pim6s_rcv_registers); if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { MRT6_DLOG(DEBUG_PIM, "register mif not set: %d", reg_mif_num); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "register packet " "size too small %d from %s", pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src)); m_freem(m); return (IPPROTO_DONE); } eip6 = (struct ip6_hdr *) (reghdr + 1); MRT6_DLOG(DEBUG_PIM, "eip6: %s -> %s, eip6 plen %d", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), ntohs(eip6->ip6_plen)); /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_ANY, "invalid IP version (%d) " "of the inner packet", (eip6->ip6_vfc & IPV6_VERSION)); m_freem(m); return (IPPROTO_NONE); } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_PIM, "inner packet of register " "is not multicast %s", ip6_sprintf(ip6bufd, &eip6->ip6_dst)); m_freem(m); return (IPPROTO_DONE); } /* * make a copy of the whole header to pass to the daemon later. */ mcp = m_copy(m, 0, off + PIM6_REG_MINLEN); if (mcp == NULL) { MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "pim register: " "could not copy register head"); m_freem(m); return (IPPROTO_DONE); } /* * forward the inner ip6 packet; point m_data at the inner ip6. */ m_adj(m, off + PIM_MINLEN); MRT6_DLOG(DEBUG_PIM, "forwarding decapsulated register: " "src %s, dst %s, mif %d", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), reg_mif_num); rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m, dst.sin6_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. 
*/ pim6_input_to_daemon: rip6_input(&m, offp, proto); return (IPPROTO_DONE); } static int ip6_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER6_LOCK_INIT(); MFC6_LOCK_INIT(); MIF6_LOCK_INIT(); pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM, pim6_encapcheck, (const struct protosw *)&in6_pim_protosw, NULL); if (pim6_encap_cookie == NULL) { printf("ip6_mroute: unable to attach pim6 encap\n"); MIF6_LOCK_DESTROY(); MFC6_LOCK_DESTROY(); MROUTER6_LOCK_DESTROY(); return (EINVAL); } ip6_mforward = X_ip6_mforward; ip6_mrouter_done = X_ip6_mrouter_done; ip6_mrouter_get = X_ip6_mrouter_get; ip6_mrouter_set = X_ip6_mrouter_set; mrt6_ioctl = X_mrt6_ioctl; break; case MOD_UNLOAD: if (V_ip6_mrouter != NULL) return EINVAL; if (pim6_encap_cookie) { encap_detach(pim6_encap_cookie); pim6_encap_cookie = NULL; } X_ip6_mrouter_done(); ip6_mforward = NULL; ip6_mrouter_done = NULL; ip6_mrouter_get = NULL; ip6_mrouter_set = NULL; mrt6_ioctl = NULL; MIF6_LOCK_DESTROY(); MFC6_LOCK_DESTROY(); MROUTER6_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t ip6_mroutemod = { "ip6_mroute", ip6_mroute_modevent, 0 }; -DECLARE_MODULE(ip6_mroute, ip6_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); +DECLARE_MODULE(ip6_mroute, ip6_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: projects/vnet/sys/netinet6/mld6.c =================================================================== --- projects/vnet/sys/netinet6/mld6.c (revision 294565) +++ projects/vnet/sys/netinet6/mld6.c (revision 294566) @@ -1,3320 +1,3320 @@ /*- * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static struct mld_ifsoftc * mli_alloc_locked(struct ifnet *); static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); static void mld_fasttimo_vnet(void); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifsoftc *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static void mld_v1_process_group_timer(struct mld_ifsoftc *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifsoftc *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifsoftc *); static void mld_v2_dispatch_general_query(struct mld_ifsoftc *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct mbufq *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); static void mld_v2_process_group_timers(struct mld_ifsoftc *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int 
mld_v2_process_group_query(struct in6_multi *, struct mld_ifsoftc *mli, int, struct mbuf *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0}; static VNET_DEFINE(LIST_HEAD(, mld_ifsoftc), mli_head); static VNET_DEFINE(int, interface_timers_running6); static VNET_DEFINE(int, state_change_timers_running6); static VNET_DEFINE(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. */ SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. 
*/ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. */ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, ("%s: called when curvnet was not restored", __func__)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
*/ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifsoftc *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); MLD_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { struct mld_ifinfo info; info.mli_version = mli->mli_version; info.mli_v1_timer = mli->mli_v1_timer; info.mli_v2_timer = mli->mli_v2_timer; info.mli_flags = mli->mli_flags; info.mli_rv = mli->mli_rv; info.mli_qi = mli->mli_qi; info.mli_qri = mli->mli_qri; info.mli_uri = mli->mli_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct mbufq *mq, int limit) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ struct mld_ifsoftc * mld_domifattach(struct ifnet *ifp) { struct mld_ifsoftc *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli = mli_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_UNLOCK(); return (mli); } /* * VIMAGE: assume curvnet set by caller. */ static struct mld_ifsoftc * mli_alloc_locked(/*const*/ struct ifnet *ifp) { struct mld_ifsoftc *mli; MLD_LOCK_ASSERT(); mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO); if (mli == NULL) goto out; mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; SLIST_INIT(&mli->mli_relinmhead); mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)", ifp, if_name(ifp)); out: return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. 
* Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp) { struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); if (mli->mli_version == MLD_VERSION_2) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (inm->in6m_state == MLD_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } in6m_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } } MLD_UNLOCK(); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(const struct ifnet *ifp) { struct mld_ifsoftc *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&mli->mli_gq); LIST_REMOVE(mli, mli_link); KASSERT(SLIST_EMPTY(&mli->mli_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(mli, M_MLD); return; } } #ifdef INVARIANTS panic("%s: mld_ifsoftc not found for ifp %p\n", __func__, ifp); #endif } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. 
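 * The query's Maximum Response Delay is converted to fast-timeout
 * ticks below. As a worked example (assuming PR_FASTHZ == 5, i.e. a
 * ~200ms fast tick, and MLD_TIMER_SCALE == 1000 for a value carried
 * in milliseconds): a 10000ms maximum delay becomes
 * (10000 * 5) / 1000 == 50 ticks; a value that truncates to 0 is
 * clamped to 1 tick so that a response is always scheduled.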
*/ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; IF_ADDR_RLOCK(ifp); if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. */ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); IN6_MULTI_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that the query header has been pulled up to sizeof(mldv2_query). * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, const int off, const int icmp6len) { struct mld_ifsoftc *mli; struct mldv2_query *mld; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. 
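 *
 * The v2 Maximum Response Code is carried in milliseconds (RFC 3810
 * section 5.1.3) and, for values >= 32768, uses a floating-point
 * encoding with a 3-bit exponent and 12-bit mantissa, which the code
 * below expands as (mant | 0x1000) << (exp + 3). Worked example: a
 * code of 0x8000 has exp == 0 and mant == 0 and so decodes to
 * 0x1000 << 3 == 32768ms, continuous with the linear range below it.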
*/ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. * The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) { IF_ADDR_RUNLOCK(ifp); goto out_locked; } if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); IF_ADDR_RUNLOCK(ifp); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. 
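 * Worked example: if a general query response is already pending in
 * 2 fast ticks and this query's computed delay is 5 ticks, the
 * general response (which reports every group) arrives first and
 * subsumes the group-specific answer, so no per-group timer is
 * kicked; only when no sooner general response is pending does the
 * group query get its own schedule.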
*/ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); IF_ADDR_RUNLOCK(ifp); } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (0); } /* * Process a received MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, int timer, struct mbuf *m0, const int off) { struct mldv2_query *mld; int retval; uint16_t nsrc; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need to schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct mbuf *m; uint8_t *sp; int i, nrecorded; int soff; m = m0; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { sp = mtod(m, uint8_t *) + soff; retval = in6m_record_source(inm, (const struct in6_addr *)sp); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); if (soff >= m->m_len) { soff = soff - m->m_len; m = m->m_next; if (m == NULL) break; } } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain.
*/ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::. * * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); IN6_MULTI_LOCK(); MLD_LOCK(); IF_ADDR_RLOCK(ifp); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifsoftc *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. 
* Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mld_hdr *mld; int mldlen; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); /* Pullup to appropriate size. */ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); if (mld == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. */ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; IN6_MULTI_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. 
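 * Removal is deferred: while if_multiaddrs is being walked under
 * IF_ADDR_LOCK the timer handlers only enqueue the in6_multi on
 * mli_relinmhead; the report transmission and the final reference
 * drop happen after IF_ADDR_RUNLOCK() below, as both may re-enter
 * IF_ADDR_LOCK (e.g. via in6ifa_ifpforlinklocal()).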
*/ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_FASTHZ); mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS); mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (mli->mli_version) { case MLD_VERSION_1: mld_v1_process_group_timer(mli, inm); break; case MLD_VERSION_2: mld_v2_process_group_timers(mli, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); /* * Free the in_multi reference(s) for * this lifecycle. */ SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void mld_v1_process_group_timer(struct mld_ifsoftc *mli, struct in6_multi *inm) { int report_timer_expired; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void mld_v2_process_group_timers(struct mld_ifsoftc *mli, struct mbufq *qrq, struct mbufq *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. 
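 * (All timers here count fast ticks. Assuming PR_FASTHZ == 5, this
 * handler runs every ~200ms, so e.g. an Unsolicited Report Interval
 * (mli_uri) of 3 seconds gives uri_fasthz a jittered value of at
 * most 15 ticks for re-arming state-change retransmissions.)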
*/ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifsoftc *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= PR_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm, *tinm; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. 
*/ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. */ SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ mbufq_drain(&inm->in6m_scq); break; } } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifsoftc *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifsoftc *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ M_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in6_multi in its own ip6_moptions. * The socket layer will have made the necessary updates to the group * state; it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLD module * however makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifsoftc *mli; struct ifnet *ifp; int error; IN6_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of PR_FASTHZ).
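 * As a worked example (assuming PR_FASTHZ == 5): a delay of 10 ticks
 * postpones the first report by roughly 2 seconds. For MLDv1 the
 * code below takes max(delay, MLD_RANDOM_DELAY(MLD_V1_MAX_RI *
 * PR_FASTHZ)), so a caller-supplied delay never undercuts the
 * standard randomized report interval; for MLDv2 a caller-supplied
 * delay is only used to shorten a pending state-change timer, or to
 * start one if none is running.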
*/ static int mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, const int delay) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link. * A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) in6m_release_locked(inm); inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->in6m_scq; mbufq_drain(mq); retval = mld_v2_enqueue_group_record(mq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. 
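 * (in6m_commit() copies the group's T1 filter state over T0 on every
 * source node, after which delta computation sees no pending change.
 * For the MLDv2 path above this commit is deferred until the first
 * copy of the state-change report has actually been transmitted.)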
*/ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. */ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } mbufq_drain(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli) { int syncstates; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif syncstates = 1; CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif mld_v1_transmit_report(inm, MLD_LISTENER_DONE); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
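 * The TO_IN {} report is retransmitted in6m_scrv times, seeded from
 * the link's Robustness Variable (mli_rv, the RFC 3810 default being
 * 2); the first copy goes out on the next fast tick and later copies
 * are re-armed with the jittered Unsolicited Report Interval. Only
 * once the retransmit count reaches zero is MLD's reference to the
 * group dropped and the state moved to MLD_NOT_MEMBER.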
*/ mbufq_drain(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; syncstates = 0; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. 
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(mq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. 
*/ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
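 * (Worked example, assuming MLD_MTUSPACE, the room reserved for the
 * IPv6 + Router Alert + ICMPv6 report headers, works out to 56 bytes:
 * on a 1500-byte MTU link, (1500 - 56 - 20) / 16 = 89 source
 * addresses fit after each 20-byte group record header.)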
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
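 * (Illustrative example, not in the original comment: for an
 * INCLUDE-mode group whose source list changes from {A,B} to {B,C},
 * the deltas are encoded as ALLOW_NEW{C} and BLOCK_OLD{A}; the
 * unchanged source B is not reported.)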
*/ static int mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. 
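 * (Descriptive note on the passes below: crt latches the first delta
 * type seen on a pass and nodes of the other type are skipped; once
 * a pass completes, drt accumulates the finished type and crt is
 * flipped via ~crt & REC_FULL so the skipped type is collected on
 * the next pass.)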
*/ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. 
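 * (Descriptive note: a merge is attempted only when both limits
 * below hold, i.e. the combined record count stays within
 * MLD_V2_REPORT_MAXRECS and the combined length stays within
 * if_mtu - MLD_MTUSPACE; otherwise the pending chain is enqueued as
 * a packet of its own.)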
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&mli->mli_gq) != 0) goto send; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&mli->mli_gq) != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. 
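 * (Descriptive note: the ifindex was stashed in the mbuf packet
 * header by mld_save_context() when the report was queued;
 * mld_restore_context() below recovers it, and mld_scrub_context()
 * clears it again before the packet is handed to ip6_output().)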
*/ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } ICMP6STAT_INC(icp6s_outhist[type]); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs); m->m_pkthdr.PH_vt.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; } -SYSINIT(mld_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_init, NULL); +SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); MLD_LOCK_DESTROY(); } -SYSUNINIT(mld_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_uninit, NULL); +SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } -VNET_SYSINIT(vnet_mld_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_init, +VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); KASSERT(LIST_EMPTY(&V_mli_head), ("%s: mli list not empty; ifnets not detached?", __func__)); } -VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_uninit, +VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; -DECLARE_MODULE(mld, mld_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: projects/vnet/sys/netinet6/nd6.c =================================================================== --- projects/vnet/sys/netinet6/nd6.c (revision 294565) +++ projects/vnet/sys/netinet6/nd6.c (revision 294566) @@ -1,2569 +1,2573 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh; /* for debugging? 
*/ #if 0 static int nd6_inuse, nd6_allocated; #endif VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry *, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *); static int nd6_need_cache(struct ifnet *); static VNET_DEFINE(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE(struct callout, nd6_timer_ch); +#define V_nd6_timer_ch VNET(nd6_timer_ch) static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(&lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, RT_DEFAULT_FIB); } void nd6_init(void) { LIST_INIT(&V_nd_prefix); /* initialization of the default router list */ TAILQ_INIT(&V_nd_defrouter); - /* start timer */ + /* Start timers. 
*/ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); + + callout_init(&V_nd6_timer_ch, 0); + callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = (struct nd_ifinfo *)malloc(sizeof(*nd), M_IP6NDP, M_WAITOK|M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct nd_ifinfo *nd) { free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; switch (ifp->if_type) { case IFT_ARCNET: ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */ break; case IFT_FDDI: ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */ break; case IFT_ISO88025: ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu); break; default: ndi->maxmtu = ifp->if_mtu; break; } /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. 
*/ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accomodate future extension to the protocol. 
*/ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Checks if we need to switch from STALE state. * * RFC 4861 requires switching from STALE to DELAY state * on first packet matching entry, waiting V_nd6_delay and * transition to PROBE state (if upper layer confirmation was * not received). * * This code performs a bit differently: * On packet hit we don't change state (but desired state * can be guessed by control plane). However, after V_nd6_delay * seconds code will transition to PROBE state (so DELAY state * is kinda skipped in most situations). * * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so * we perform the following upon entering STALE state: * * 1) Arm timer to run each V_nd6_delay seconds to make sure that * if packet was transmitted at the start of given interval, we * would be able to switch to PROBE state in V_nd6_delay seconds * as user expects. * * 2) Reschedule timer until original V_nd6_gctimer expires keeping * lle in STALE state (remaining timer value stored in lle_remtime). * * 3) Reschedule timer if packet was transmitted less that V_nd6_delay * seconds ago. * * Returns non-zero value if the entry is still STALE (storing * the next timer interval in @pdelay). * * Returns zero value if original timer expired or we need to switch to * PROBE (store that in @do_switch variable). */ static int nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch) { int nd_delay, nd_gctimer, r_skip_req; time_t lle_hittime; long delay; *do_switch = 0; nd_gctimer = V_nd6_gctimer; nd_delay = V_nd6_delay; LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); if (r_skip_req > 0) { /* * Nonzero r_skip_req value was set upon entering * STALE state. Since value was not changed, no * packets were passed using this lle. Ask for * timer reschedule and keep STALE state. */ delay = (long)(MIN(nd_gctimer, nd_delay)); delay *= hz; if (lle->lle_remtime > delay) lle->lle_remtime -= delay; else { delay = lle->lle_remtime; lle->lle_remtime = 0; } if (delay == 0) { /* * The original ng6_gctime timeout ended, * no more rescheduling. 
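 * (Worked example, using the default nd6_delay of 5 seconds and
 * nd6_gctimer of 24 hours: an idle STALE entry is re-armed every
 * 5 seconds, each pass draining lle_remtime, so expiry still lands
 * at the original 24-hour mark while a transmit hit is noticed
 * within at most 5 seconds.)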
*/ return (0); } *pdelay = delay; return (1); } /* * Packet received; verify the timestamp. */ delay = (long)(time_uptime - lle_hittime); if (delay < nd_delay) { /* * V_nd6_delay has still not passed since the first * hit in STALE state. * Reschedule the timer and return. */ *pdelay = (long)(nd_delay - delay) * hz; return (1); } /* Request switching to probe */ *do_switch = 1; return (0); } /* * Switch @lle state to the new state, optionally arming timers. * * Set noinline to be dtrace-friendly */ __noinline void nd6_llinfo_setstate(struct llentry *lle, int newstate) { struct ifnet *ifp; int nd_gctimer, nd_delay; long delay, remtime; delay = 0; remtime = 0; switch (newstate) { case ND6_LLINFO_INCOMPLETE: ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(lle)) { ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->reachable * hz; } break; case ND6_LLINFO_STALE: /* * Notify the fast path that we want to know if any packet * is transmitted by setting r_skip_req. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); nd_delay = V_nd6_delay; nd_gctimer = V_nd6_gctimer; delay = (long)(MIN(nd_gctimer, nd_delay)) * hz; remtime = (long)nd_gctimer * hz - delay; break; case ND6_LLINFO_DELAY: lle->la_asked = 0; delay = (long)V_nd6_delay * hz; break; } if (delay > 0) nd6_llinfo_settimer_locked(lle, delay); lle->lle_remtime = remtime; lle->ln_state = newstate; } /* * Timer-dependent part of the nd state machine. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_llinfo_timer(void *arg) { struct llentry *ln; struct in6_addr *dst, *pdst, *psrc, src; struct ifnet *ifp; struct nd_ifinfo *ndi = NULL; int do_switch, send_ns; long delay; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; LLE_WLOCK(ln); if (callout_pending(&ln->lle_timer)) { /* * We are a bit odd in the treatment of active/pending here. * If the pending bit is set, the callout was rescheduled * before we ran. The active bit we ignore: if the callout * was stopped in ll_tablefree() while it was running, * callout_stop() would have returned 0 and the entry would * not have been deleted there, so we want to go through * with the delete here now. If the callout was restarted, * the pending bit will be back on and we just want to bail, * since callout_reset() would have returned 1 and our * reference would already have been removed by * nd6_llinfo_settimer_locked() above (canceled would have * been 1). */ LLE_WUNLOCK(ln); return; } ifp = ln->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); ndi = ND_IFINFO(ifp); send_ns = 0; dst = &ln->r_l3addr.addr6; pdst = dst; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { nd6_free(ln, 0); ln = NULL; goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; send_ns = 1; /* Send NS to the multicast address. */ pdst = NULL; } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * Assuming every packet in la_hold has the * same IP header. Send the error after unlock.
*/ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_TIMEDOUT); nd6_free(ln, 0); ln = NULL; if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); break; case ND6_LLINFO_STALE: if (nd6_is_stale(ln, &delay, &do_switch) != 0) { /* * No packet has used this entry and the GC timeout * has not yet passed. Reschedule the timer and * return. */ nd6_llinfo_settimer_locked(ln, delay); break; } if (do_switch == 0) { /* * The GC timer has ended and the entry hasn't been used. * Run the garbage collector (RFC 4861, 5.3). */ if (!ND6_LLINFO_PERMANENT(ln)) { EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); nd6_free(ln, 1); ln = NULL; } break; } /* Entry has been used AND the delay timer has ended. */ /* FALLTHROUGH */ case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); nd6_free(ln, 0); ln = NULL; } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); CURVNET_RESTORE(); } /* * ND6 timer routine to expire the default router list and prefix list. */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct in6_ifaddr *ia6, *nia6; callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); /* Expire the default router list. */ TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->expire && dr->expire < time_uptime) defrtrlist_del(dr); } /* * Expire interface addresses. * In the past the loop was inside prefix expiry processing. * However, from a stricter spec-conformance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* Check the address lifetime. */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated.
* XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 || (ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ia6->ia_ifp)->flags & ND6_IFF_IFDISABLED) != 0) { ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } /* expire prefix list */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * check prefix lifetime. * since pltime is just for autoconf, pltime processing for * prefix is not necessary. */ if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME && time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) { /* * address expiration and prefix expiration are * separate. NEVER perform in6_purgeaddr here. */ prelist_remove(pr); } } CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); IF_ADDR_RUNLOCK(ifp); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp) { struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that is installed * in the routing table, in order to keep additional side effects as * small as possible. 
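 * (Descriptive note: hence the two sweeps below; entries not
 * installed in the routing table are deleted on the first pass,
 * the installed ones only on the second.)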
*/ TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (!dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } /* Nuke prefix list entries toward ifp */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) { /* * Because if_detach() does *not* release prefixes * while purging addresses the reference count will * still be above zero. We therefore reset it to * make sure that the prefix really gets purged. */ pr->ndpr_refcnt = 0; /* * Previously, pr->ndpr_addr is removed as well, * but I strongly believe we don't have to do it. * nd6_purge() is only called from in6_ifdetach(), * which removes all the associated interface addresses * by itself. * (jinmei@kame.net 20010129) */ prelist_remove(pr); } } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select(); } /* XXXXX * We do not nuke the neighbor cache entries here any more * because the neighbor cache is kept in if_afdata[AF_INET6]. * nd6_purge() is invoked by in6_ifdetach() which is called * from if_detach() where everything gets purged. So let * in6_domifdetach() do the actual L2 table purging work. */ } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *dstaddr; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { struct rtentry *rt; /* Always use the default FIB here. 
*/ rt = in6_rtalloc1((struct sockaddr *)&pr->ndpr_prefix, 0, 0, RT_DEFAULT_FIB); if (rt == NULL) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr)) { RTFREE_LOCKED(rt); continue; } RTFREE_LOCKED(rt); } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) return (1); } /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ dstaddr = ifa_ifwithdstaddr((const struct sockaddr *)addr, RT_ALL_FIBS); if (dstaddr != NULL) { if (dstaddr->ifa_ifp == ifp) { ifa_free(dstaddr); return (1); } ifa_free(dstaddr); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && TAILQ_EMPTY(&V_nd_defrouter) && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ IF_AFDATA_RLOCK(ifp); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } IF_AFDATA_RUNLOCK(ifp); return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry *ln, int gc) { struct nd_defrouter *dr; struct ifnet *ifp; LLE_WLOCK_ASSERT(ln); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); ifp = ln->lle_tbl->llt_ifp; if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { dr = defrouter_lookup(&ln->r_l3addr.addr6, ifp); if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. 
* Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select(); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); } static int nd6_isdynrte(const struct rtentry *rt, void *xap) { if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) return (1); return (0); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); } /* * Rejuvenate this function for routing operations related * processing. */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; gateway = (struct sockaddr_in6 *)rt->rt_gateway; ifp = rt->rt_ifp; switch (req) { case RTM_ADD: break; case RTM_DELETE: if (!ifp) return; /* * Only indirect routes are interesting. */ if ((rt->rt_flags & RTF_GATEWAY) == 0) return; /* * check for default route */ if (IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr)) { dr = defrouter_lookup(&gateway->sin6_addr, ifp); if (dr != NULL) dr->installed = 0; } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. 
int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * Used to change host variables from userland. * Intended for use on a router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transition */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has a link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transition */ /* Mark all IPv6 addresses as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } IF_ADDR_RUNLOCK(ifp); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transition */ /* If no link-local address on ifp, configure one */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && (ifp->if_flags & IFF_UP)) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa == NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } } ND_IFINFO(ifp)->flags = ND.flags; break; } #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync the kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefixes advertised by routers */ struct nd_prefix *pr, *next; LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { struct in6_ifaddr *ia, *ia_next; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ /* do we really have to remove addresses as well? */ /* XXXRW: in6_ifaddrhead locking. */ TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } prelist_remove(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_defrouter *dr, *next; defrouter_reset(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, next) { defrtrlist_del(dr); } defrouter_select(); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&nb_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + ln->lle_remtime / hz + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); }
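/*
 * Illustrative sketch (hypothetical, not from this file): the
 * SIOCSIFINFO_FLAGS handler above acts only on the edges of
 * ND6_IFF_IFDISABLED, comparing the stored flag word with the requested
 * one. The pattern, reduced to its core with a stand-in flag value:
 */
#define XIF_DISABLED 0x8    /* stand-in for ND6_IFF_IFDISABLED */

static void
apply_flag_edges(unsigned cur, unsigned req)
{
    if ((cur & XIF_DISABLED) && !(req & XIF_DISABLED)) {
        /* 1 -> 0 edge: re-enable (e.g. bring IPv6 up). */
    } else if (!(cur & XIF_DISABLED) && (req & XIF_DISABLED)) {
        /* 0 -> 1 edge: disable (e.g. mark addresses tentative). */
    }
    /* No edge: nothing to tear down or set up. */
}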
/* * Calculates the new isRouter value based on the provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exists, set IsRouter. This means (1-5). * - If it is an old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in cases (1-5) and (7) we must set IsRouter. * A question arises for case (1): it has no lladdr in the * neighbor cache, which makes it similar to (6). * This case is rare, but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * A new entry must have the is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the ICMP message is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * The is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); }
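/*
 * Illustrative sketch (hypothetical, not from this file): the decision
 * table above condenses to the following, using the ICMPv6 type
 * constants from <netinet/icmp6.h>; "cur" is the entry's current flag:
 */
#include <netinet/icmp6.h>

static int
is_router_table(int type, int code, int is_new, int old_addr, int new_addr,
    int cur)
{
    switch (type & 0xff) {
    case ND_ROUTER_SOLICIT:
        return (0);                     /* always cleared */
    case ND_NEIGHBOR_SOLICIT:
        return (is_new ? 0 : cur);      /* rows (6)-(7) clear */
    case ND_REDIRECT:
        if (code == ND_REDIRECT_ROUTER)
            return (1);                 /* better-router redirect */
        return (is_new ? 0 : cur);
    case ND_ROUTER_ADVERT:
        if ((!is_new && (old_addr || new_addr)) ||  /* rows (2)-(5) */
            (is_new && new_addr))                   /* row (7) */
            return (1);
        return (cur);   /* rows (1) and (6): leave untouched */
    }
    return (cur);
}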
/* * Create a neighbor cache entry and cache the link-layer address, * on reception of inbound ND6 packets (RS/RA/NS/redirect). * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for the unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation of ifp->if_addrlen and lladdrlen must be done by * the caller. * * XXX If the link does not have a link-layer address, what should * we do? (ifp->if_addrlen == 0) * The spec says nothing in the sections for RA, RS and NA. There is a * small description of it in the NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(from, flags, ifp); IF_AFDATA_RUNLOCK(ifp); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it in before insertion. */ if (lladdr != NULL) lltable_set_entry_addr(ifp, ln, lladdr); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over the newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); if (lladdr != NULL) /* (7) */ EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if a static ndp entry is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, &ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record the source link-layer address. * XXX is it dependent on ifp->if_type? */ if (lltable_try_set_entry_addr(ifp, ln, lladdr) == 0) { /* Entry was deleted */ return; } nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculate the new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with a special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select() is called twice, since * defrtrlist_update() calls the function as well. However, we believe * the overhead is tolerable, since it only happens the first * time. * XXX: although defrouter_select() should not have a bad effect * on hosts that are not autoconfigured, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select(); } }
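/*
 * Illustrative sketch (hypothetical, not from this file): rows (3) and
 * (5) of the table above are the only ones that record a new link-layer
 * address; both reduce to "existing entry whose address changed":
 */
static int
lladdr_needs_update(int is_new, int old_valid, int have_lladdr, int changed)
{
    if (is_new || !have_lladdr)
        return (0);     /* rows (1)-(2), (6)-(7): nothing to record */
    if (old_valid && !changed)
        return (0);     /* row (4): same address, no-op */
    return (1);         /* rows (3) and (5): record it and go STALE */
}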
static void nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since the reachable time is rarely changed by router * advertisements, we SHOULD ensure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and set * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds, so that neighbor unreachability * detection is performed on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route *ro) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); return (error); }
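/*
 * Illustrative userland sketch (hypothetical, not from this file):
 * ND_COMPUTE_RTIME(base), used by nd6_slowtimo() above, picks a value
 * in roughly [0.5 * base, 1.5 * base] using the 1/1024 fixed-point
 * factors MIN_RANDOM_FACTOR = 512 and MAX_RANDOM_FACTOR = 1536 (RFC
 * 2461, 6.3.4). An equivalent computation; rand() stands in for the
 * kernel's arc4random(), and note the bitwise AND only bounds the
 * random term, it does not make it uniform:
 */
#include <stdlib.h>

static unsigned int
compute_rtime(unsigned int base)
{
    unsigned int x = base >> 10;    /* base / 1024 */

    return ((512 * x + ((unsigned int)rand() & (1024 * x))) / 1000);
}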
/* * Do L2 address resolution for the @sa_dst address. Stores the found * address in the @desten buffer. A copy of the lle ln_flags can also be * saved in @pflags if @pflags is non-NULL. * * If the destination LLE does not exist, or an lle state modification * is required, call the "slow" version. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but the address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags) { struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_L2VLAN: case IFT_IEEE80211: case IFT_BRIDGE: case IFT_ISO88025: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&dst6->sin6_addr, LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy the lle info */ bcopy(&ln->ll_addr, desten, ifp->if_addrlen); if (pflags != NULL) *pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR); /* Check if we have a feedback request from the nd6 timer */ if (ln->r_skip_req != 0) { LLE_REQ_LOCK(ln); ln->r_skip_req = 0; /* Notify that the entry was used */ ln->lle_hittime = time_uptime; LLE_REQ_UNLOCK(ln); } IF_AFDATA_RUNLOCK(ifp); return (0); } IF_AFDATA_RUNLOCK(ifp); return (nd6_resolve_slow(ifp, m, dst6, desten, pflags)); }
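/*
 * Illustrative sketch (hypothetical, not from this file): nd6_resolve()
 * above is a fast/slow split: a read-locked lookup copies a valid cached
 * address out, and everything else falls through to the exclusive slow
 * path, which may queue the packet and return EWOULDBLOCK. The shape,
 * with assumed helper declarations:
 */
#include <string.h>

struct xcache_entry { int valid; unsigned char addr[6]; };

extern struct xcache_entry *xcache_rlookup(const void *key);   /* assumed */
extern int xresolve_slow(const void *key, unsigned char *out); /* assumed */

static int
xresolve(const void *key, unsigned char *out)
{
    struct xcache_entry *e;

    if ((e = xcache_rlookup(key)) != NULL && e->valid) {
        memcpy(out, e->addr, sizeof(e->addr)); /* fast path */
        return (0);
    }
    return (xresolve_slow(key, out));          /* slow path */
}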
/* * Do L2 address resolution for the @sa_dst address. Stores the found * address in the @desten buffer. A copy of the lle ln_flags can also be * saved in @pflags if @pflags is non-NULL. * * Heavy version. * The function assumes that the destination LLE does not exist, * is invalid or stale, so the LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns; /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address (i.e., not a multicast address). */ if (lle == NULL) { IF_AFDATA_RLOCK(ifp); lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); IF_AFDATA_RUNLOCK(ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "%s: can't allocate llinfo for %s " "(ln=%p)\n", __func__, ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over the newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { m_freem(m); return (ENOBUFS); } if (m != NULL) m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and set a timer to * expire in DELAY_FIRST_PROBE_TIME seconds, so that neighbor * unreachability detection is performed on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { bcopy(&lle->ll_addr, desten, ifp->if_addrlen); if (pflags != NULL) *pflags = lle->la_flags; LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * hold queue, as long as the number of queued packets does not * exceed nd6_maxqueuelen; when it does, the oldest packets in * the queue are dropped. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for a newly-created lle la_asked will be 0, * so we will transition from the ND6_LLINFO_NOSTATE to the * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } int nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; struct ifnet *outifp; int error = 0; m_head = chain; if ((ifp->if_flags & IFF_LOOPBACK) != 0) outifp = origifp; else outifp = ifp; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = nd6_output_ifp(ifp, origifp, m, dst, NULL); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); }
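/*
 * Illustrative userland sketch (hypothetical, not from this file): the
 * la_hold handling in nd6_resolve_slow() above appends the new packet at
 * the tail and then drops from the head until the queue fits the limit.
 * The same policy over a plain singly-linked list:
 */
#include <stddef.h>

struct xpkt { struct xpkt *next; /* payload elided */ };

static void
hold_enqueue(struct xpkt **headp, struct xpkt *m, int maxqlen,
    void (*drop)(struct xpkt *))
{
    struct xpkt *p;
    int n;

    m->next = NULL;
    if (*headp == NULL) {
        *headp = m;
        return;
    }
    n = 1;
    for (p = *headp; p->next != NULL; p = p->next) /* count + find tail */
        n++;
    p->next = m;
    n++;
    while (n > maxqlen) {   /* oldest packets are dropped first */
        p = *headp;
        *headp = p->next;
        drop(p);
        n--;
    }
}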
static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not maintain a neighbor cache on any interface * other than ARCnet, Ethernet, FDDI and GIF. * * RFC 2893 says: * - unidirectional tunnels need no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_IEEE80211: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add a permanent ND6 link-layer record for the given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in a different place * 2) It is called by the IPv6 protocol stack, in contrast to * arp_ifinit(), which is typically called in a SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any existing entry */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for the given @ia, or the lle * corresponding to the @ia address. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; return; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_icmp6); #endif SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLFLAG_RD, nd6_sysctl_drlist, ""); SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLFLAG_RD, nd6_sysctl_prlist, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { struct in6_defrouter d; struct nd_defrouter *dr; int error; if (req->newptr) return (EPERM); bzero(&d, sizeof(d)); d.rtaddr.sin6_family = AF_INET6; d.rtaddr.sin6_len = sizeof(d.rtaddr); /* * XXX locking */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { d.rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d.rtaddr); if (error != 0) return (error); d.flags = dr->flags; d.rtlifetime = dr->rtlifetime; d.expire = dr->expire + (time_second - time_uptime); d.if_index = dr->ifp->if_index; error = SYSCTL_OUT(req, &d, sizeof(d)); if (error != 0) return (error); } return (0); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct
nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); /* * XXX locking */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_refcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) return (error); LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) return (error); } } return (0); } Index: projects/vnet/sys/netinet6/nd6.h =================================================================== --- projects/vnet/sys/netinet6/nd6.h (revision 294565) +++ projects/vnet/sys/netinet6/nd6.h (revision 294566) @@ -1,458 +1,455 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: nd6.h,v 1.76 2001/12/18 02:10:31 itojun Exp $ * $FreeBSD$ */ #ifndef _NETINET6_ND6_H_ #define _NETINET6_ND6_H_ /* see net/route.h, or net/if_inarp.h */ #ifndef RTF_ANNOUNCE #define RTF_ANNOUNCE RTF_PROTO2 #endif #include #include struct llentry; #define ND6_LLINFO_NOSTATE -2 /* * We don't need the WAITDELETE state any more, but we keep the definition * in a comment line instead of removing it. This is necessary to avoid * unintentionally reusing the value for another purpose, which might * affect backward compatibility with old applications. * (20000711 jinmei@kame.net) */ /* #define ND6_LLINFO_WAITDELETE -1 */ #define ND6_LLINFO_INCOMPLETE 0 #define ND6_LLINFO_REACHABLE 1 #define ND6_LLINFO_STALE 2 #define ND6_LLINFO_DELAY 3 #define ND6_LLINFO_PROBE 4 #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) #define ND6_LLINFO_PERMANENT(n) (((n)->la_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE)) struct nd_ifinfo { u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ u_int32_t reachable; /* Reachable Time */ u_int32_t retrans; /* Retrans Timer */ u_int32_t flags; /* Flags */ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ u_int8_t initialized; /* Flag to see the entry is initialized */ /* the following 3 members are for privacy extension for addrconf */ u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current random ID */ }; #define ND6_IFF_PERFORMNUD 0x1 #define ND6_IFF_ACCEPT_RTADV 0x2 #define ND6_IFF_PREFER_SOURCE 0x4 /* Not used in FreeBSD. */ #define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to * DAD failure. (XXX: not ND-specific) */ #define ND6_IFF_DONT_SET_IFROUTE 0x10 #define ND6_IFF_AUTO_LINKLOCAL 0x20 #define ND6_IFF_NO_RADR 0x40 #define ND6_IFF_NO_PREFER_IFACE 0x80 /* XXX: not related to ND. */ #define ND6_IFF_NO_DAD 0x100 #ifdef _KERNEL #define ND_IFINFO(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->nd_ifinfo) #define IN6_LINKMTU(ifp) \ ((ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) \ ? ND_IFINFO(ifp)->linkmtu \ : ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) \ ? ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu)) #endif struct in6_nbrinfo { char ifname[IFNAMSIZ]; /* if name, e.g. 
"en0" */ struct in6_addr addr; /* IPv6 address of the neighbor */ long asked; /* number of queries already sent for this addr */ int isrouter; /* if it acts as a router */ int state; /* reachability state */ int expire; /* lifetime for NDP state transition */ }; #define DRLSTSIZ 10 #define PRLSTSIZ 10 struct in6_drlist { char ifname[IFNAMSIZ]; struct { struct in6_addr rtaddr; u_char flags; u_short rtlifetime; u_long expire; u_short if_index; } defrouter[DRLSTSIZ]; }; struct in6_defrouter { struct sockaddr_in6 rtaddr; u_char flags; u_short rtlifetime; u_long expire; u_short if_index; }; #ifdef _KERNEL struct in6_oprlist { char ifname[IFNAMSIZ]; struct { struct in6_addr prefix; struct prf_ra raflags; u_char prefixlen; u_char origin; u_long vltime; u_long pltime; u_long expire; u_short if_index; u_short advrtrs; /* number of advertisement routers */ struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */ } prefix[PRLSTSIZ]; }; #endif struct in6_prlist { char ifname[IFNAMSIZ]; struct { struct in6_addr prefix; struct prf_ra raflags; u_char prefixlen; u_char origin; u_int32_t vltime; u_int32_t pltime; time_t expire; u_short if_index; u_short advrtrs; /* number of advertisement routers */ struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */ } prefix[PRLSTSIZ]; }; struct in6_prefix { struct sockaddr_in6 prefix; struct prf_ra raflags; u_char prefixlen; u_char origin; u_int32_t vltime; u_int32_t pltime; time_t expire; u_int32_t flags; int refcnt; u_short if_index; u_short advrtrs; /* number of advertisement routers */ /* struct sockaddr_in6 advrtr[] */ }; #ifdef _KERNEL struct in6_ondireq { char ifname[IFNAMSIZ]; struct { u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ u_int32_t reachable; /* Reachable Time */ u_int32_t retrans; /* Retrans Timer */ u_int32_t flags; /* Flags */ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ u_int8_t receivedra; } ndi; }; #endif struct in6_ndireq { char ifname[IFNAMSIZ]; struct nd_ifinfo ndi; }; struct in6_ndifreq { char ifname[IFNAMSIZ]; u_long ifindex; }; /* Prefix status */ #define NDPRF_ONLINK 0x1 #define NDPRF_DETACHED 0x2 /* protocol constants */ #define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */ #define RTR_SOLICITATION_INTERVAL 4 /* 4sec */ #define MAX_RTR_SOLICITATIONS 3 #define ND6_INFINITE_LIFETIME 0xffffffff #ifdef _KERNEL /* node constants */ #define MAX_REACHABLE_TIME 3600000 /* msec */ #define REACHABLE_TIME 30000 /* msec */ #define RETRANS_TIMER 1000 /* msec */ #define MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */ #define MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */ #define DEF_TEMP_VALID_LIFETIME 604800 /* 1 week */ #define DEF_TEMP_PREFERRED_LIFETIME 86400 /* 1 day */ #define TEMPADDR_REGEN_ADVANCE 5 /* sec */ #define MAX_TEMP_DESYNC_FACTOR 600 /* 10 min */ #define ND_COMPUTE_RTIME(x) \ (((MIN_RANDOM_FACTOR * (x >> 10)) + (arc4random() & \ ((MAX_RANDOM_FACTOR - MIN_RANDOM_FACTOR) * (x >> 10)))) /1000) TAILQ_HEAD(nd_drhead, nd_defrouter); struct nd_defrouter { TAILQ_ENTRY(nd_defrouter) dr_entry; struct in6_addr rtaddr; u_char flags; /* flags on RA message */ u_short rtlifetime; u_long expire; struct ifnet *ifp; int installed; /* is installed into kernel routing table */ }; struct nd_prefixctl { struct ifnet *ndpr_ifp; /* prefix */ struct sockaddr_in6 ndpr_prefix; u_char ndpr_plen; u_int32_t ndpr_vltime; /* advertised valid lifetime */ u_int32_t ndpr_pltime; /* advertised preferred lifetime */ struct prf_ra ndpr_flags; }; struct 
nd_prefix { struct ifnet *ndpr_ifp; LIST_ENTRY(nd_prefix) ndpr_entry; struct sockaddr_in6 ndpr_prefix; /* prefix */ struct in6_addr ndpr_mask; /* netmask derived from the prefix */ u_int32_t ndpr_vltime; /* advertised valid lifetime */ u_int32_t ndpr_pltime; /* advertised preferred lifetime */ time_t ndpr_expire; /* expiration time of the prefix */ time_t ndpr_preferred; /* preferred time of the prefix */ time_t ndpr_lastupdate; /* reception time of last advertisement */ struct prf_ra ndpr_flags; u_int32_t ndpr_stateflags; /* actual state flags */ /* list of routers that advertise the prefix: */ LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs; u_char ndpr_plen; int ndpr_refcnt; /* reference couter from addresses */ }; #define ndpr_raf ndpr_flags #define ndpr_raf_onlink ndpr_flags.onlink #define ndpr_raf_auto ndpr_flags.autonomous #define ndpr_raf_router ndpr_flags.router /* * Message format for use in obtaining information about prefixes * from inet6 sysctl function */ struct inet6_ndpr_msghdr { u_short inpm_msglen; /* to skip over non-understood messages */ u_char inpm_version; /* future binary compatibility */ u_char inpm_type; /* message type */ struct in6_addr inpm_prefix; u_long prm_vltim; u_long prm_pltime; u_long prm_expire; u_long prm_preferred; struct in6_prflags prm_flags; u_short prm_index; /* index for associated ifp */ u_char prm_plen; /* length of prefix in bits */ }; #define prm_raf_onlink prm_flags.prf_ra.onlink #define prm_raf_auto prm_flags.prf_ra.autonomous #define prm_statef_onlink prm_flags.prf_state.onlink #define prm_rrf_decrvalid prm_flags.prf_rr.decrvalid #define prm_rrf_decrprefd prm_flags.prf_rr.decrprefd struct nd_pfxrouter { LIST_ENTRY(nd_pfxrouter) pfr_entry; struct nd_defrouter *router; }; LIST_HEAD(nd_prhead, nd_prefix); /* nd6.c */ VNET_DECLARE(int, nd6_prune); VNET_DECLARE(int, nd6_delay); VNET_DECLARE(int, nd6_umaxtries); VNET_DECLARE(int, nd6_mmaxtries); VNET_DECLARE(int, nd6_useloopback); VNET_DECLARE(int, nd6_maxnudhint); VNET_DECLARE(int, nd6_gctimer); VNET_DECLARE(struct nd_drhead, nd_defrouter); VNET_DECLARE(struct nd_prhead, nd_prefix); VNET_DECLARE(int, nd6_debug); VNET_DECLARE(int, nd6_onlink_ns_rfc4861); #define V_nd6_prune VNET(nd6_prune) #define V_nd6_delay VNET(nd6_delay) #define V_nd6_umaxtries VNET(nd6_umaxtries) #define V_nd6_mmaxtries VNET(nd6_mmaxtries) #define V_nd6_useloopback VNET(nd6_useloopback) #define V_nd6_maxnudhint VNET(nd6_maxnudhint) #define V_nd6_gctimer VNET(nd6_gctimer) #define V_nd_defrouter VNET(nd_defrouter) #define V_nd_prefix VNET(nd_prefix) #define V_nd6_debug VNET(nd6_debug) #define V_nd6_onlink_ns_rfc4861 VNET(nd6_onlink_ns_rfc4861) #define nd6log(x) do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0) -VNET_DECLARE(struct callout, nd6_timer_ch); -#define V_nd6_timer_ch VNET(nd6_timer_ch) - /* nd6_rtr.c */ VNET_DECLARE(int, nd6_defifindex); VNET_DECLARE(int, ip6_desync_factor); /* seconds */ VNET_DECLARE(u_int32_t, ip6_temp_preferred_lifetime); /* seconds */ VNET_DECLARE(u_int32_t, ip6_temp_valid_lifetime); /* seconds */ VNET_DECLARE(int, ip6_temp_regen_advance); /* seconds */ #define V_nd6_defifindex VNET(nd6_defifindex) #define V_ip6_desync_factor VNET(ip6_desync_factor) #define V_ip6_temp_preferred_lifetime VNET(ip6_temp_preferred_lifetime) #define V_ip6_temp_valid_lifetime VNET(ip6_temp_valid_lifetime) #define V_ip6_temp_regen_advance VNET(ip6_temp_regen_advance) union nd_opts { struct nd_opt_hdr *nd_opt_array[16]; /* max = ND_OPT_NONCE */ struct { struct nd_opt_hdr *zero; struct nd_opt_hdr *src_lladdr; 
struct nd_opt_hdr *tgt_lladdr; struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */ struct nd_opt_rd_hdr *rh; struct nd_opt_mtu *mtu; struct nd_opt_hdr *__res6; struct nd_opt_hdr *__res7; struct nd_opt_hdr *__res8; struct nd_opt_hdr *__res9; struct nd_opt_hdr *__res10; struct nd_opt_hdr *__res11; struct nd_opt_hdr *__res12; struct nd_opt_hdr *__res13; struct nd_opt_nonce *nonce; struct nd_opt_hdr *__res15; struct nd_opt_hdr *search; /* multiple opts */ struct nd_opt_hdr *last; /* multiple opts */ int done; struct nd_opt_prefix_info *pi_end;/* multiple opts, end */ } nd_opt_each; }; #define nd_opts_src_lladdr nd_opt_each.src_lladdr #define nd_opts_tgt_lladdr nd_opt_each.tgt_lladdr #define nd_opts_pi nd_opt_each.pi_beg #define nd_opts_pi_end nd_opt_each.pi_end #define nd_opts_rh nd_opt_each.rh #define nd_opts_mtu nd_opt_each.mtu #define nd_opts_nonce nd_opt_each.nonce #define nd_opts_search nd_opt_each.search #define nd_opts_last nd_opt_each.last #define nd_opts_done nd_opt_each.done /* XXX: need nd6_var.h?? */ /* nd6.c */ void nd6_init(void); #ifdef VIMAGE void nd6_destroy(void); #endif struct nd_ifinfo *nd6_ifattach(struct ifnet *); void nd6_ifdetach(struct nd_ifinfo *); int nd6_is_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); void nd6_option_init(void *, int, union nd_opts *); struct nd_opt_hdr *nd6_option(union nd_opts *); int nd6_options(union nd_opts *); struct llentry *nd6_lookup(const struct in6_addr *, int, struct ifnet *); struct llentry *nd6_alloc(const struct in6_addr *, int, struct ifnet *); void nd6_setmtu(struct ifnet *); void nd6_llinfo_setstate(struct llentry *lle, int newstate); void nd6_timer(void *); void nd6_purge(struct ifnet *); int nd6_resolve(struct ifnet *, int, struct mbuf *, const struct sockaddr *, u_char *, uint32_t *); int nd6_ioctl(u_long, caddr_t, struct ifnet *); void nd6_cache_lladdr(struct ifnet *, struct in6_addr *, char *, int, int, int); void nd6_grab_holdchain(struct llentry *, struct mbuf **, struct sockaddr_in6 *); int nd6_flush_holdchain(struct ifnet *, struct ifnet *, struct mbuf *, struct sockaddr_in6 *); int nd6_add_ifa_lle(struct in6_ifaddr *); void nd6_rem_ifa_lle(struct in6_ifaddr *, int); int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *, struct sockaddr_in6 *, struct route *); /* nd6_nbr.c */ void nd6_na_input(struct mbuf *, int, int); void nd6_na_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_long, int, struct sockaddr *); void nd6_ns_input(struct mbuf *, int, int); void nd6_ns_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, const struct in6_addr *, uint8_t *); caddr_t nd6_ifptomac(struct ifnet *); void nd6_dad_init(void); void nd6_dad_start(struct ifaddr *, int); void nd6_dad_stop(struct ifaddr *); /* nd6_rtr.c */ void nd6_rs_input(struct mbuf *, int, int); void nd6_ra_input(struct mbuf *, int, int); void prelist_del(struct nd_prefix *); void defrouter_reset(void); void defrouter_select(void); void defrtrlist_del(struct nd_defrouter *); void prelist_remove(struct nd_prefix *); int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *, struct nd_prefix **); void pfxlist_onlink_check(void); struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *); struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *); void rt6_flush(struct in6_addr *, struct ifnet *); int nd6_setdefaultiface(int); int in6_tmpifadd(const struct in6_ifaddr *, int, int); #endif /* _KERNEL */ #endif /* _NETINET6_ND6_H_ */ Index: 
projects/vnet/sys/netipsec/ipsec.c =================================================================== --- projects/vnet/sys/netipsec/ipsec.c (revision 294565) +++ projects/vnet/sys/netipsec/ipsec.c (revision 294566) @@ -1,1745 +1,1745 @@ /* $FreeBSD$ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include /*XXX*/ #include #include #include #include #include #include #include #ifdef IPSEC_DEBUG VNET_DEFINE(int, ipsec_debug) = 1; #else VNET_DEFINE(int, ipsec_debug) = 0; #endif /* NB: name changed so netstat doesn't use it. */ VNET_PCPUSTAT_DEFINE(struct ipsecstat, ipsec4stat); VNET_PCPUSTAT_SYSINIT(ipsec4stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipsec4stat); #endif /* VIMAGE */ VNET_DEFINE(int, ip4_ah_offsetmask) = 0; /* maybe IP_DF? */ /* DF bit on encap. 
0: clear 1: set 2: copy */ VNET_DEFINE(int, ip4_ipsec_dfbit) = 0; VNET_DEFINE(int, ip4_esp_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip4_esp_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip4_ah_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip4_ah_net_deflev) = IPSEC_LEVEL_USE; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ VNET_DEFINE(int, ip4_ipsec_ecn) = 0; VNET_DEFINE(int, ip4_esp_randpad) = -1; static VNET_DEFINE(struct secpolicy, def_policy); #define V_def_policy VNET(def_policy) /* * Crypto support requirements: * * 1 require hardware support * -1 require software support * 0 take anything */ VNET_DEFINE(int, crypto_support) = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; FEATURE(ipsec, "Internet Protocol Security (IPsec)"); #ifdef IPSEC_NAT_T FEATURE(ipsec_natt, "UDP Encapsulation of IPsec ESP Packets ('NAT-T')"); #endif SYSCTL_DECL(_net_inet_ipsec); /* net.inet.ipsec */ SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(def_policy).policy, 0, "IPsec default policy."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_esp_trans_deflev), 0, "Default ESP transport mode level"); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_esp_net_deflev), 0, "Default ESP tunnel mode level."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_trans_deflev), 0, "AH transfer mode default level."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_net_deflev), 0, "AH tunnel mode default level."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, "If set clear type-of-service field when doing AH computation."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_offsetmask), 0, "If not set clear offset field mask when doing AH computation."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_dfbit), 0, "Do not fragment bit on encap."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_ecn), 0, "Explicit Congestion Notification handling."); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0, "Enable IPsec debugging output when set."); SYSCTL_INT(_net_inet_ipsec, OID_AUTO, crypto_support, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(crypto_support), 0, "Crypto driver selection."); SYSCTL_VNET_PCPUSTAT(_net_inet_ipsec, OID_AUTO, ipsecstats, struct ipsecstat, ipsec4stat, "IPsec IPv4 statistics."); #ifdef REGRESSION /* * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ VNET_DEFINE(int, ipsec_replay) = 0; SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_replay, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_replay), 0, "Emulate replay attack"); /* * When set 1, IPsec will send packets with corrupted HMAC. * This allows to verify if the other side properly detects modified packets. 
*/ VNET_DEFINE(int, ipsec_integrity) = 0; SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_integrity, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_integrity), 0, "Emulate man-in-the-middle attack"); #endif #ifdef INET6 VNET_PCPUSTAT_DEFINE(struct ipsecstat, ipsec6stat); VNET_PCPUSTAT_SYSINIT(ipsec6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipsec6stat); #endif /* VIMAGE */ VNET_DEFINE(int, ip6_esp_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_esp_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_ah_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_ah_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_ipsec_ecn) = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ SYSCTL_DECL(_net_inet6_ipsec6); /* net.inet6.ipsec6 */ SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(def_policy).policy, 0, "IPsec default policy."); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_esp_trans_deflev), 0, "Default ESP transport mode level."); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_esp_net_deflev), 0, "Default ESP tunnel mode level."); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ah_trans_deflev), 0, "AH transfer mode default level."); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ah_net_deflev), 0, "AH tunnel mode default level."); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, ecn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ipsec_ecn), 0, "Explicit Congestion Notification handling."); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0, "Enable IPsec debugging output when set."); SYSCTL_VNET_PCPUSTAT(_net_inet6_ipsec6, IPSECCTL_STATS, ipsecstats, struct ipsecstat, ipsec6stat, "IPsec IPv6 statistics."); #endif /* INET6 */ static int ipsec_in_reject(struct secpolicy *, struct mbuf *); static int ipsec_setspidx_inpcb(struct mbuf *, struct inpcb *); static int ipsec_setspidx(struct mbuf *, struct secpolicyindex *, int); static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *, int); static int ipsec4_setspidx_ipaddr(struct mbuf *, struct secpolicyindex *); #ifdef INET6 static void ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *, int); static int ipsec6_setspidx_ipaddr(struct mbuf *, struct secpolicyindex *); #endif static void ipsec_delpcbpolicy(struct inpcbpolicy *); static struct secpolicy *ipsec_deepcopy_policy(struct secpolicy *src); static void vshiftl(unsigned char *, int, int); MALLOC_DEFINE(M_IPSEC_INPCB, "inpcbpolicy", "inpcb-resident ipsec policy"); /* * Return a held reference to the default SP. */ static struct secpolicy * key_allocsp_default(const char* where, int tag) { struct secpolicy *sp; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default from %s:%u\n", where, tag)); sp = &V_def_policy; if (sp->policy != IPSEC_POLICY_DISCARD && sp->policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", sp->policy, IPSEC_POLICY_NONE)); sp->policy = IPSEC_POLICY_NONE; } key_addref(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default returns SP:%p (%u)\n", sp, sp->refcnt)); return (sp); } #define KEY_ALLOCSP_DEFAULT() \ key_allocsp_default(__FILE__, __LINE__) /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. 
* OUT: NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. * others: a pointer to SP * * NOTE: the IPv6 mapped address concern is implemented here. */ struct secpolicy * ipsec_getpolicy(struct tdb_ident *tdbi, u_int dir) { struct secpolicy *sp; IPSEC_ASSERT(tdbi != NULL, ("null tdbi")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = KEY_ALLOCSP2(tdbi->spi, &tdbi->dst, tdbi->proto, dir); if (sp == NULL) /*XXX????*/ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return (sp); } /* * For an OUTBOUND packet with a socket: search the SPD for the packet, * and return a pointer to an SP. * OUT: NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. * others: a pointer to SP * * NOTE: the IPv6 mapped address concern is implemented here. */ static struct secpolicy * ipsec_getpolicybysock(struct mbuf *m, u_int dir, struct inpcb *inp, int *error) { struct inpcbpolicy *pcbsp; struct secpolicy *currsp = NULL; /* Policy on socket. */ struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(inp != NULL, ("null inpcb")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); if (!key_havesp(dir)) { /* No SP found, use the system default. */ sp = KEY_ALLOCSP_DEFAULT(); return (sp); } /* Set spidx in pcb. */ *error = ipsec_setspidx_inpcb(m, inp); if (*error) return (NULL); pcbsp = inp->inp_sp; IPSEC_ASSERT(pcbsp != NULL, ("null pcbsp")); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; } IPSEC_ASSERT(currsp != NULL, ("null currsp")); if (pcbsp->priv) { /* It is a privileged socket. */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; case IPSEC_POLICY_ENTRUST: /* Look for a policy in the SPD. */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) /* No SP found. */ sp = KEY_ALLOCSP_DEFAULT(); break; default: ipseclog((LOG_ERR, "%s: Invalid policy for PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return (NULL); } } else { /* Unprivileged socket; the SPD holds the policy. */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) { /* No SP found. */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "%s: Illegal policy %d defined for a " "non-privileged socket\n", __func__, currsp->policy)); *error = EINVAL; return (NULL); case IPSEC_POLICY_ENTRUST: sp = KEY_ALLOCSP_DEFAULT(); break; case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; default: ipseclog((LOG_ERR, "%s: Invalid policy for " "PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return (NULL); } } } IPSEC_ASSERT(sp != NULL, ("null SP (priv %u policy %u)", pcbsp->priv, currsp->policy)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s (priv %u policy %u) allocate SP:%p (refcnt %u)\n", __func__, pcbsp->priv, currsp->policy, sp, sp->refcnt)); return (sp); }
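/*
 * Illustrative sketch (hypothetical, not from this file): the lookup
 * paths above share one contract: the caller always gets back a
 * referenced policy, with the system default standing in when the SPD
 * has no match. The pattern, with assumed helper declarations:
 */
struct xpolicy { int refcnt; int action; };

extern struct xpolicy *xspd_match(const void *pkt, int dir); /* assumed */
extern struct xpolicy xdefault_policy;                       /* assumed */

static struct xpolicy *
xpolicy_lookup(const void *pkt, int dir)
{
    struct xpolicy *sp;

    if ((sp = xspd_match(pkt, dir)) == NULL)
        sp = &xdefault_policy;  /* never NULL on return */
    sp->refcnt++;               /* caller must release its reference */
    return (sp);
}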
/* * For a FORWARDING packet, or an OUTBOUND packet without a socket: * search the SPD for the packet, and return a pointer to an SP. * OUT: positive: a pointer to the matched security policy leaf. * NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. */ struct secpolicy * ipsec_getpolicybyaddr(struct mbuf *m, u_int dir, int *error) { struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = NULL; *error = 0; if (key_havesp(dir)) { /* Make an index to look for a policy. */ *error = ipsec_setspidx(m, &spidx, 0); if (*error != 0) { DPRINTF(("%s: setspidx failed, dir %u\n", __func__, dir)); return (NULL); } spidx.dir = dir; sp = KEY_ALLOCSP(&spidx, dir); } if (sp == NULL) /* No SP found, use the system default. */ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return (sp); } struct secpolicy * ipsec4_checkpolicy(struct mbuf *m, u_int dir, int *error, struct inpcb *inp) { struct secpolicy *sp; *error = 0; if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, error); else sp = ipsec_getpolicybysock(m, dir, inp, error); if (sp == NULL) { IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error")); IPSECSTAT_INC(ips_out_inval); return (NULL); } IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error)); switch (sp->policy) { case IPSEC_POLICY_ENTRUST: default: printf("%s: invalid policy %u\n", __func__, sp->policy); /* FALLTHROUGH */ case IPSEC_POLICY_DISCARD: IPSECSTAT_INC(ips_out_polvio); *error = -EINVAL; /* Packet is discarded by caller. */ break; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: KEY_FREESP(&sp); sp = NULL; /* NB: force NULL result. */ break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) /* Acquire an SA. */ *error = key_spdacquire(sp); break; } if (*error != 0) { KEY_FREESP(&sp); sp = NULL; } return (sp); } static int ipsec_setspidx_inpcb(struct mbuf *m, struct inpcb *inp) { int error; IPSEC_ASSERT(inp != NULL, ("null inp")); IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp")); IPSEC_ASSERT(inp->inp_sp->sp_out != NULL && inp->inp_sp->sp_in != NULL, ("null sp_in || sp_out")); error = ipsec_setspidx(m, &inp->inp_sp->sp_in->spidx, 1); if (error == 0) { inp->inp_sp->sp_in->spidx.dir = IPSEC_DIR_INBOUND; inp->inp_sp->sp_out->spidx = inp->inp_sp->sp_in->spidx; inp->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND; } else { bzero(&inp->inp_sp->sp_in->spidx, sizeof (inp->inp_sp->sp_in->spidx)); bzero(&inp->inp_sp->sp_out->spidx, sizeof (inp->inp_sp->sp_in->spidx)); } return (error); } /* * Configure the security policy index (src/dst/proto/sport/dport) * by looking at the contents of the mbuf. * The caller is responsible for error recovery (like clearing up spidx). */ static int ipsec_setspidx(struct mbuf *m, struct secpolicyindex *spidx, int needport) { struct ip *ip = NULL; struct ip ipbuf; u_int v; struct mbuf *n; int len; int error; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* * Validate m->m_pkthdr.len. We see an incorrect length if we * mistakenly call this function with an inconsistent mbuf chain * (like 4.4BSD tcp/udp processing). XXX Should we panic here?
*/ len = 0; for (n = m; n; n = n->m_next) len += n->m_len; if (m->m_pkthdr.len != len) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) mismatch (%d), ignored.\n", __func__, len, m->m_pkthdr.len)); return (EINVAL); } if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v4), ignored.\n", __func__, m->m_pkthdr.len)); return (EINVAL); } if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } v = ip->ip_v; switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return (error); ipsec4_get_ulp(m, spidx, needport); return (0); #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v6), " "ignored\n", __func__, m->m_pkthdr.len)); return (EINVAL); } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return (error); ipsec6_get_ulp(m, spidx, needport); return (0); #endif default: KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: " "unknown IP version %u, ignored.\n", __func__, v)); return (EINVAL); } } static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { u_int8_t nxt; int off; /* Sanity check. */ IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(m->m_pkthdr.len >= sizeof(struct ip),("packet too short")); if (m->m_len >= sizeof (struct ip)) { struct ip *ip = mtod(m, struct ip *); if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) goto done; off = ip->ip_hl << 2; nxt = ip->ip_p; } else { struct ip ih; m_copydata(m, 0, sizeof (struct ip), (caddr_t) &ih); if (ih.ip_off & htons(IP_MF | IP_OFFMASK)) goto done; off = ih.ip_hl << 2; nxt = ih.ip_p; } while (off < m->m_pkthdr.len) { struct ip6_ext ip6e; struct tcphdr th; struct udphdr uh; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (th), (caddr_t) &th); spidx->src.sin.sin_port = th.th_sport; spidx->dst.sin.sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (uh), (caddr_t) &uh); spidx->src.sin.sin_port = uh.uh_sport; spidx->dst.sin.sin_port = uh.uh_dport; return; case IPPROTO_AH: if (off + sizeof(ip6e) > m->m_pkthdr.len) goto done; /* XXX Sigh, this works but is totally bogus. */ m_copydata(m, off, sizeof(ip6e), (caddr_t) &ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: default: /* XXX Intermediate headers??? */ spidx->ul_proto = nxt; goto done_proto; } } done: spidx->ul_proto = IPSEC_ULPROTO_ANY; done_proto: spidx->src.sin.sin_port = IPSEC_PORT_ANY; spidx->dst.sin.sin_port = IPSEC_PORT_ANY; } /* Assumes that m is sane. 
*/ static int ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { static const struct sockaddr_in template = { sizeof (struct sockaddr_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; spidx->src.sin = template; spidx->dst.sin = template; if (m->m_len < sizeof (struct ip)) { m_copydata(m, offsetof(struct ip, ip_src), sizeof (struct in_addr), (caddr_t) &spidx->src.sin.sin_addr); m_copydata(m, offsetof(struct ip, ip_dst), sizeof (struct in_addr), (caddr_t) &spidx->dst.sin.sin_addr); } else { struct ip *ip = mtod(m, struct ip *); spidx->src.sin.sin_addr = ip->ip_src; spidx->dst.sin.sin_addr = ip->ip_dst; } spidx->prefs = sizeof(struct in_addr) << 3; spidx->prefd = sizeof(struct in_addr) << 3; return (0); } #ifdef INET6 static void ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { int off, nxt; struct tcphdr th; struct udphdr uh; struct icmp6_hdr ih; /* Sanity check. */ if (m == NULL) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(m)); /* Set default. */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: spidx->ul_proto = nxt; if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(ih), (caddr_t)&ih); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = htons((uint16_t)ih.icmp6_type); ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = htons((uint16_t)ih.icmp6_code); break; default: /* XXX Intermediate headers??? */ spidx->ul_proto = nxt; break; } } /* Assumes that m is sane. 
*/ static int ipsec6_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) ip6 = mtod(m, struct ip6_hdr *); else { m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return (0); } #endif int ipsec_run_hhooks(struct ipsec_ctx_data *ctx, int type) { int idx; switch (ctx->af) { #ifdef INET case AF_INET: idx = HHOOK_IPSEC_INET; break; #endif #ifdef INET6 case AF_INET6: idx = HHOOK_IPSEC_INET6; break; #endif default: return (EPFNOSUPPORT); } if (type == HHOOK_TYPE_IPSEC_IN) HHOOKS_RUN_IF(V_ipsec_hhh_in[idx], ctx, NULL); else HHOOKS_RUN_IF(V_ipsec_hhh_out[idx], ctx, NULL); if (*ctx->mp == NULL) return (EACCES); return (0); } static void ipsec_delpcbpolicy(struct inpcbpolicy *p) { free(p, M_IPSEC_INPCB); } /* Initialize policy in PCB. */ int ipsec_init_policy(struct socket *so, struct inpcbpolicy **pcb_sp) { struct inpcbpolicy *new; /* Sanity check. */ if (so == NULL || pcb_sp == NULL) panic("%s: NULL pointer was passed.\n", __func__); new = (struct inpcbpolicy *) malloc(sizeof(struct inpcbpolicy), M_IPSEC_INPCB, M_NOWAIT|M_ZERO); if (new == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return (ENOBUFS); } new->priv = IPSEC_IS_PRIVILEGED_SO(so); if ((new->sp_in = KEY_NEWSP()) == NULL) { ipsec_delpcbpolicy(new); return (ENOBUFS); } new->sp_in->policy = IPSEC_POLICY_ENTRUST; if ((new->sp_out = KEY_NEWSP()) == NULL) { KEY_FREESP(&new->sp_in); ipsec_delpcbpolicy(new); return (ENOBUFS); } new->sp_out->policy = IPSEC_POLICY_ENTRUST; *pcb_sp = new; return (0); } /* Copy old IPsec policy into new. */ int ipsec_copy_policy(struct inpcbpolicy *old, struct inpcbpolicy *new) { struct secpolicy *sp; sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { KEY_FREESP(&new->sp_in); new->sp_in = sp; } else return (ENOBUFS); sp = ipsec_deepcopy_policy(old->sp_out); if (sp) { KEY_FREESP(&new->sp_out); new->sp_out = sp; } else return (ENOBUFS); new->priv = old->priv; return (0); } struct ipsecrequest * ipsec_newisr(void) { struct ipsecrequest *p; p = malloc(sizeof(struct ipsecrequest), M_IPSEC_SR, M_NOWAIT|M_ZERO); if (p != NULL) IPSECREQUEST_LOCK_INIT(p); return (p); } void ipsec_delisr(struct ipsecrequest *p) { IPSECREQUEST_LOCK_DESTROY(p); free(p, M_IPSEC_SR); } /* Deep-copy a policy in PCB. */ static struct secpolicy * ipsec_deepcopy_policy(struct secpolicy *src) { struct ipsecrequest *newchain = NULL; struct ipsecrequest *p; struct ipsecrequest **q; struct ipsecrequest *r; struct secpolicy *dst; if (src == NULL) return (NULL); dst = KEY_NEWSP(); if (dst == NULL) return (NULL); /* * Deep-copy IPsec request chain. This is required since struct * ipsecrequest is not reference counted. 
*/ q = &newchain; for (p = src->req; p; p = p->next) { *q = ipsec_newisr(); if (*q == NULL) goto fail; (*q)->saidx.proto = p->saidx.proto; (*q)->saidx.mode = p->saidx.mode; (*q)->level = p->level; (*q)->saidx.reqid = p->saidx.reqid; bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src)); bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst)); (*q)->sp = dst; q = &((*q)->next); } dst->req = newchain; dst->policy = src->policy; /* Do not touch the refcnt fields. */ return (dst); fail: for (p = newchain; p; p = r) { r = p->next; ipsec_delisr(p); p = NULL; } return (NULL); } /* Set policy and IPsec request if present. */ static int ipsec_set_policy_internal(struct secpolicy **pcb_sp, int optname, caddr_t request, size_t len, struct ucred *cred) { struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; /* Sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return (EINVAL); if (len < sizeof(*xpl)) return (EINVAL); xpl = (struct sadb_x_policy *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: passed policy\n", __func__); kdebug_sadb_x_policy((struct sadb_ext *)xpl)); /* Check policy type. */ /* ipsec_set_policy_internal() accepts IPSEC, ENTRUST and BYPASS. */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return (EINVAL); /* Check privileged socket. */ if (cred != NULL && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { error = priv_check_cred(cred, PRIV_NETINET_IPSEC, 0); if (error) return (EACCES); } /* Allocating new SP entry. */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return (error); /* Clear old SP and set new SP. */ KEY_FREESP(pcb_sp); *pcb_sp = newsp; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: new policy\n", __func__); kdebug_secpolicy(newsp)); return (0); } int ipsec_set_policy(struct inpcb *inp, int optname, caddr_t request, size_t len, struct ucred *cred) { struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* Sanity check. */ if (inp == NULL || request == NULL) return (EINVAL); if (len < sizeof(*xpl)) return (EINVAL); xpl = (struct sadb_x_policy *)request; /* Select direction. */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return (EINVAL); } return (ipsec_set_policy_internal(pcb_sp, optname, request, len, cred)); } int ipsec_get_policy(struct inpcb *inp, caddr_t request, size_t len, struct mbuf **mp) { struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* Sanity check. */ if (inp == NULL || request == NULL || mp == NULL) return (EINVAL); IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp")); if (len < sizeof(*xpl)) return (EINVAL); xpl = (struct sadb_x_policy *)request; /* Select direction. */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return (EINVAL); } /* Sanity check. Should be an IPSEC_ASSERT. */ if (pcb_sp == NULL) return (EINVAL); *mp = key_sp2msg(pcb_sp); if (!*mp) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return (ENOBUFS); } (*mp)->m_type = MT_DATA; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(*mp)); return (0); } /* Delete policy in PCB. 
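 *
 * For context, the ipsec_set_policy()/ipsec_get_policy() handlers above
 * are driven from userland through libipsec and the IP_IPSEC_POLICY
 * socket option; a minimal usage sketch (userland side, error handling
 * elided):
 *
 *	char req[] = "out ipsec esp/transport//require";
 *	char *pbuf = ipsec_set_policy(req, strlen(req));
 *	if (pbuf != NULL) {
 *		(void)setsockopt(so, IPPROTO_IP, IP_IPSEC_POLICY,
 *		    pbuf, ipsec_get_policylen(pbuf));
 *		free(pbuf);
 *	}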
 */
int
ipsec_delete_pcbpolicy(struct inpcb *inp)
{

	IPSEC_ASSERT(inp != NULL, ("null inp"));

	if (inp->inp_sp == NULL)
		return (0);

	if (inp->inp_sp->sp_in != NULL)
		KEY_FREESP(&inp->inp_sp->sp_in);

	if (inp->inp_sp->sp_out != NULL)
		KEY_FREESP(&inp->inp_sp->sp_out);

	ipsec_delpcbpolicy(inp->inp_sp);
	inp->inp_sp = NULL;

	return (0);
}

/*
 * Return current level.
 * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE is always returned.
 */
u_int
ipsec_get_reqlevel(struct ipsecrequest *isr)
{
	u_int level = 0;
	u_int esp_trans_deflev, esp_net_deflev;
	u_int ah_trans_deflev, ah_net_deflev;

	IPSEC_ASSERT(isr != NULL && isr->sp != NULL, ("null argument"));
	IPSEC_ASSERT(isr->sp->spidx.src.sa.sa_family ==
	    isr->sp->spidx.dst.sa.sa_family,
	    ("af family mismatch, src %u, dst %u",
	     isr->sp->spidx.src.sa.sa_family,
	     isr->sp->spidx.dst.sa.sa_family));

/* XXX Note that we have ipseclog() expanded here - code sync issue. */
#define IPSEC_CHECK_DEFAULT(lev) \
	(((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE	\
			&& (lev) != IPSEC_LEVEL_UNIQUE)			\
		? (V_ipsec_debug					\
			? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\
				(lev), IPSEC_LEVEL_REQUIRE)		\
			: 0),						\
			(lev) = IPSEC_LEVEL_REQUIRE,			\
			(lev)						\
		: (lev))

	/* Set default level. */
	switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) {
#ifdef INET
	case AF_INET:
		esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev);
		esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev);
		ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev);
		ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev);
		esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev);
		ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev);
		ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev);
		break;
#endif /* INET6 */
	default:
		panic("%s: unknown af %u",
		    __func__, isr->sp->spidx.src.sa.sa_family);
	}
#undef IPSEC_CHECK_DEFAULT

	/* Set level. */
	switch (isr->level) {
	case IPSEC_LEVEL_DEFAULT:
		switch (isr->saidx.proto) {
		case IPPROTO_ESP:
			if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
				level = esp_net_deflev;
			else
				level = esp_trans_deflev;
			break;
		case IPPROTO_AH:
			if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
				level = ah_net_deflev;
			else
				level = ah_trans_deflev;
			break;
		case IPPROTO_IPCOMP:
			/*
			 * We don't really care, as IPcomp document says that
			 * we shouldn't compress small packets.
			 */
			level = IPSEC_LEVEL_USE;
			break;
		default:
			panic("%s: Illegal protocol defined %u\n", __func__,
			    isr->saidx.proto);
		}
		break;

	case IPSEC_LEVEL_USE:
	case IPSEC_LEVEL_REQUIRE:
		level = isr->level;
		break;

	case IPSEC_LEVEL_UNIQUE:
		level = IPSEC_LEVEL_REQUIRE;
		break;

	default:
		panic("%s: Illegal IPsec level %u\n", __func__, isr->level);
	}

	return (level);
}

/*
 * Check security policy requirements against the actual
 * packet contents.  Return one if the packet should be
 * rejected as "invalid"; otherwise return zero to have the
 * packet treated as "valid".
 *
 * OUT:
 *	0: valid
 *	1: invalid
 */
static int
ipsec_in_reject(struct secpolicy *sp, struct mbuf *m)
{
	struct ipsecrequest *isr;
	int need_auth;

	KEYDEBUG(KEYDEBUG_IPSEC_DATA,
		printf("%s: using SP\n", __func__); kdebug_secpolicy(sp));

	/* Check policy. */
	switch (sp->policy) {
	case IPSEC_POLICY_DISCARD:
		return (1);
	case IPSEC_POLICY_BYPASS:
	case IPSEC_POLICY_NONE:
		return (0);
	}

	IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
	    ("invalid policy %u", sp->policy));

	/* XXX Should compare policy against IPsec header history.
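	 *
	 * What the loop below actually verifies are the mbuf flags left
	 * by the input path, e.g. (sketch of the mapping):
	 *
	 *	required proto	flag(s) that must be present
	 *	IPPROTO_ESP	M_DECRYPTED (plus M_AUTHIPDGM when the
	 *			SA also authenticates)
	 *	IPPROTO_AH	M_AUTHIPHDR
	 *
	 * so an "esp require" policy rejects a packet that arrives
	 * without M_DECRYPTED set.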
*/ need_auth = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE) continue; switch (isr->saidx.proto) { case IPPROTO_ESP: if ((m->m_flags & M_DECRYPTED) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP m_flags:%x\n", __func__, m->m_flags)); return (1); } if (!need_auth && isr->sav != NULL && isr->sav->tdb_authalgxform != NULL && (m->m_flags & M_AUTHIPDGM) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP/AH m_flags:%x\n", __func__, m->m_flags)); return (1); } break; case IPPROTO_AH: need_auth = 1; if ((m->m_flags & M_AUTHIPHDR) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: AH m_flags:%x\n", __func__, m->m_flags)); return (1); } break; case IPPROTO_IPCOMP: /* * We don't really care, as IPcomp document * says that we shouldn't compress small * packets. IPComp policy should always be * treated as being in "use" level. */ break; } } return (0); /* Valid. */ } /* * Non zero return value means security policy DISCARD or policy violation. */ static int ipsec46_in_reject(struct mbuf *m, struct inpcb *inp) { struct secpolicy *sp; int error; int result; if (!key_havesp(IPSEC_DIR_INBOUND)) return 0; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* Get SP for this packet. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_in_reject(sp, m); KEY_FREESP(&sp); } else { result = 1; /* treat errors as policy violation */ } return (result); } /* * Check AH/ESP integrity. * This function is called from tcp_input(), udp_input(), * and {ah,esp}4_input for tunnel mode. */ int ipsec4_in_reject(struct mbuf *m, struct inpcb *inp) { int result; result = ipsec46_in_reject(m, inp); if (result) IPSECSTAT_INC(ips_in_polvio); return (result); } #ifdef INET6 /* * Check AH/ESP integrity. * This function is called from tcp6_input(), udp6_input(), * and {ah,esp}6_input for tunnel mode. */ int ipsec6_in_reject(struct mbuf *m, struct inpcb *inp) { int result; result = ipsec46_in_reject(m, inp); if (result) IPSEC6STAT_INC(ips_in_polvio); return (result); } #endif /* * Compute the byte size to be occupied by IPsec header. * In case it is tunnelled, it includes the size of outer IP header. * NOTE: SP passed is freed in this function. */ static size_t ipsec_hdrsiz_internal(struct secpolicy *sp) { struct ipsecrequest *isr; size_t size; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return (0); } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); size = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { size_t clen = 0; switch (isr->saidx.proto) { case IPPROTO_ESP: clen = esp_hdrsiz(isr->sav); break; case IPPROTO_AH: clen = ah_hdrsiz(isr->sav); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (isr->saidx.dst.sa.sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: ipseclog((LOG_ERR, "%s: unknown AF %d in " "IPsec tunnel SA\n", __func__, ((struct sockaddr *)&isr->saidx.dst)->sa_family)); break; } } size += clen; } return (size); } /* * This function is called from ipsec_hdrsiz_tcp(), ip_ipsec_mtu(), * disabled ip6_ipsec_mtu() and ip6_forward(). 
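 *
 * Worked example for ipsec_hdrsiz_internal() above (illustrative; the
 * real per-SA numbers come from esp_hdrsiz()/ah_hdrsiz() and depend on
 * the negotiated algorithms): a single tunnel-mode ESP SA over IPv4
 * contributes
 *
 *	size = esp_hdrsiz(sav)		(SPI, seq, IV, padding, ICV)
 *	     + sizeof(struct ip)	(20-byte outer header)
 *
 * while a transport-mode AH SA would add only ah_hdrsiz(sav).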
*/ size_t ipsec_hdrsiz(struct mbuf *m, u_int dir, struct inpcb *inp) { struct secpolicy *sp; int error; size_t size; if (!key_havesp(dir)) return 0; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* Get SP for this packet. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, &error); else sp = ipsec_getpolicybysock(m, dir, inp, &error); if (sp != NULL) { size = ipsec_hdrsiz_internal(sp); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: size:%lu.\n", __func__, (unsigned long)size)); KEY_FREESP(&sp); } else { size = 0; /* XXX Should be panic? * -> No, we are called w/o knowing if * IPsec processing is needed. */ } return (size); } /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * Based on RFC 2401. */ int ipsec_chkreplay(u_int32_t seq, struct secasvar *sav) { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* Constant: bits of window size. */ int frlast; /* Constant: last frame. */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) return (1); /* No need to check replay. */ /* Constant. */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* Sequence number of 0 is invalid. */ if (seq == 0) return (0); /* First time is always okay. */ if (replay->count == 0) return (1); if (seq > replay->lastseq) { /* Larger sequences are okay. */ return (1); } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* Over range to check, i.e. too old or wrapped. */ if (diff >= wsizeb) return (0); fr = frlast - diff / 8; /* This packet already seen? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return (0); /* Out of order but good. */ return (1); } } /* * Check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(u_int32_t seq, struct secasvar *sav) { char buf[128]; struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* Constant: bits of window size. */ int frlast; /* Constant: last frame. */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) goto ok; /* No need to check replay. */ /* Constant. */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* Sequence number of 0 is invalid. */ if (seq == 0) return (1); /* First time. */ if (replay->count == 0) { replay->lastseq = seq; bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* New larger sequence number. */ if (diff < wsizeb) { /* In window. */ /* Set bit for this packet. */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* This packet has a "way larger". */ bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* Larger is good. */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* Over range to check, i.e. too old or wrapped. */ if (diff >= wsizeb) return (1); fr = frlast - diff / 8; /* This packet already seen? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return (1); /* Mark as seen. */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* Out of order but good. 
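		 *
		 * Worked example (illustrative): with wsize = 4 bytes the
		 * window covers wsizeb = 32 sequence numbers.  For
		 * lastseq = 100 and seq = 90: diff = 10,
		 * fr = frlast - diff / 8 = 3 - 1 = 2, and the packet maps
		 * to bit (diff % 8) = 2 of bitmap[2]; if that bit is
		 * already set the packet is a replay, otherwise it is
		 * recorded here by setting the bit.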
		 */
	}

ok:
	if (replay->count == ~0) {

		/* Set overflow flag. */
		replay->overflow++;

		/* Don't increment, no more packets accepted. */
		if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0)
			return (1);

		ipseclog((LOG_WARNING, "%s: replay counter made %d cycle. %s\n",
		    __func__, replay->overflow,
		    ipsec_logsastr(sav, buf, sizeof(buf))));
	}

	replay->count++;

	return (0);
}

/*
 * Shift variable length buffer to left.
 * IN:	bitmap: pointer to the buffer
 *	nbit:	the number of bits to shift.
 *	wsize:	buffer size (bytes).
 */
static void
vshiftl(unsigned char *bitmap, int nbit, int wsize)
{
	int s, j, i;
	unsigned char over;

	for (j = 0; j < nbit; j += 8) {
		s = (nbit - j < 8) ? (nbit - j): 8;
		bitmap[0] <<= s;
		for (i = 1; i < wsize; i++) {
			over = (bitmap[i] >> (8 - s));
			bitmap[i] <<= s;
			bitmap[i-1] |= over;
		}
	}
}

/* Return a printable string for the address. */
char*
ipsec_address(union sockaddr_union* sa, char *buf, socklen_t size)
{
	switch (sa->sa.sa_family) {
#ifdef INET
	case AF_INET:
		return (inet_ntop(AF_INET, &sa->sin.sin_addr, buf, size));
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		return (inet_ntop(AF_INET6, &sa->sin6.sin6_addr, buf, size));
#endif /* INET6 */
	default:
		return ("(unknown address family)");
	}
}

char *
ipsec_logsastr(struct secasvar *sav, char *buf, size_t size)
{
	char sbuf[INET6_ADDRSTRLEN], dbuf[INET6_ADDRSTRLEN];

	IPSEC_ASSERT(sav->sah->saidx.src.sa.sa_family ==
	    sav->sah->saidx.dst.sa.sa_family, ("address family mismatch"));

	snprintf(buf, size, "SA(SPI=%08lx src=%s dst=%s)",
	    (u_long)ntohl(sav->spi),
	    ipsec_address(&sav->sah->saidx.src, sbuf, sizeof(sbuf)),
	    ipsec_address(&sav->sah->saidx.dst, dbuf, sizeof(dbuf)));
	return (buf);
}

void
ipsec_dumpmbuf(struct mbuf *m)
{
	int totlen;
	int i;
	u_char *p;

	totlen = 0;
	printf("---\n");
	while (m) {
		p = mtod(m, u_char *);
		for (i = 0; i < m->m_len; i++) {
			printf("%02x ", p[i]);
			totlen++;
			if (totlen % 16 == 0)
				printf("\n");
		}
		m = m->m_next;
	}
	if (totlen % 16 != 0)
		printf("\n");
	printf("---\n");
}

static void
def_policy_init(const void *unused __unused)
{

	bzero(&V_def_policy, sizeof(struct secpolicy));
	V_def_policy.policy = IPSEC_POLICY_NONE;
	V_def_policy.refcnt = 1;
}
-VNET_SYSINIT(def_policy_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
+VNET_SYSINIT(def_policy_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    def_policy_init, NULL);


/* XXX This stuff doesn't belong here... */

static	struct xformsw* xforms = NULL;

/*
 * Register a transform; typically at system startup.
 */
void
xform_register(struct xformsw* xsp)
{

	xsp->xf_next = xforms;
	xforms = xsp;
}

/*
 * Initialize transform support in an sav.
 */
int
xform_init(struct secasvar *sav, int xftype)
{
	struct xformsw *xsp;

	if (sav->tdb_xform != NULL)	/* Previously initialized. */
		return (0);
	for (xsp = xforms; xsp; xsp = xsp->xf_next)
		if (xsp->xf_type == xftype)
			return ((*xsp->xf_init)(sav, xsp));
	return (EINVAL);
}
Index: projects/vnet/sys/netipsec/xform_tcp.c
===================================================================
--- projects/vnet/sys/netipsec/xform_tcp.c	(revision 294565)
+++ projects/vnet/sys/netipsec/xform_tcp.c	(revision 294566)
@@ -1,170 +1,170 @@
/* $FreeBSD$ */
/*-
 * Copyright (c) 2003 Bruce M. Simpson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* TCP MD5 Signature Option (RFC2385) */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include /* * Initialize a TCP-MD5 SA. Called when the SA is being set up. * * We don't need to set up the tdb prefixed fields, as we don't use the * opencrypto code; we just perform a key length check. * * XXX: Currently we only allow a single 'magic' SPI to be used. * * This allows per-host granularity without affecting the userland * interface, which is a simple socket option toggle switch, * TCP_SIGNATURE_ENABLE. * * To allow per-service granularity requires that we have a means * of mapping port to SPI. The mandated way of doing this is to * use SPD entries to specify packet flows which get the TCP-MD5 * treatment, however the code to do this is currently unstable * and unsuitable for production use. * * Therefore we use this compromise in the meantime. */ static int tcpsignature_init(struct secasvar *sav, struct xformsw *xsp) { int keylen; if (sav->spi != htonl(TCP_SIG_SPI)) { DPRINTF(("%s: SPI must be TCP_SIG_SPI (0x1000)\n", __func__)); return (EINVAL); } if (sav->alg_auth != SADB_X_AALG_TCP_MD5) { DPRINTF(("%s: unsupported authentication algorithm %u\n", __func__, sav->alg_auth)); return (EINVAL); } if (sav->key_auth == NULL) { DPRINTF(("%s: no authentication key present\n", __func__)); return (EINVAL); } keylen = _KEYLEN(sav->key_auth); if ((keylen < TCP_KEYLEN_MIN) || (keylen > TCP_KEYLEN_MAX)) { DPRINTF(("%s: invalid key length %u\n", __func__, keylen)); return (EINVAL); } return (0); } /* * Paranoia. * * Called when the SA is deleted. */ static int tcpsignature_zeroize(struct secasvar *sav) { if (sav->key_auth) bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth)); sav->tdb_cryptoid = 0; sav->tdb_authalgxform = NULL; sav->tdb_xform = NULL; return (0); } /* * Verify that an input packet passes authentication. * Called from the ipsec layer. * We do this from within tcp itself, so this routine is just a stub. */ static int tcpsignature_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { return (0); } /* * Prepend the authentication header. * Called from the ipsec layer. * We do this from within tcp itself, so this routine is just a stub. 
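 *
 * Operational sketch (not part of this change): the peer key is
 * installed with setkey(8) using the magic SPI checked by
 * tcpsignature_init() above, and the application enables the option
 * per socket (the toggle referred to above is spelled TCP_MD5SIG in
 * the tree today):
 *
 *	add 10.0.0.1 10.0.0.2 tcp 0x1000 -A tcp-md5 "secret";
 *
 *	int one = 1;
 *	setsockopt(s, IPPROTO_TCP, TCP_MD5SIG, &one, sizeof(one));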
*/ static int tcpsignature_output(struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff) { return (EINVAL); } static struct xformsw tcpsignature_xformsw = { XF_TCPSIGNATURE, XFT_AUTH, "TCPMD5", tcpsignature_init, tcpsignature_zeroize, tcpsignature_input, tcpsignature_output }; static void tcpsignature_attach(void) { xform_register(&tcpsignature_xformsw); } -SYSINIT(tcpsignature_xform_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, +SYSINIT(tcpsignature_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, tcpsignature_attach, NULL); Index: projects/vnet/sys/netpfil/ipfw/dn_sched.h =================================================================== --- projects/vnet/sys/netpfil/ipfw/dn_sched.h (revision 294565) +++ projects/vnet/sys/netpfil/ipfw/dn_sched.h (revision 294566) @@ -1,192 +1,192 @@ /* * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The API to write a packet scheduling algorithm for dummynet. * * $FreeBSD$ */ #ifndef _DN_SCHED_H #define _DN_SCHED_H #define DN_MULTIQUEUE 0x01 /* * Descriptor for a scheduling algorithm. * Contains all function pointers for a given scheduler * This is typically created when a module is loaded, and stored * in a global list of schedulers. */ struct dn_alg { uint32_t type; /* the scheduler type */ const char *name; /* scheduler name */ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ /* * The following define the size of 3 optional data structures * that may need to be allocated at runtime, and are appended * to each of the base data structures: scheduler, sched.inst, * and queue. We don't have a per-flowset structure. */ /* + parameters attached to the template, e.g. * default queue sizes, weights, quantum size, and so on; */ size_t schk_datalen; /* + per-instance parameters, such as timestamps, * containers for queues, etc; */ size_t si_datalen; size_t q_datalen; /* per-queue parameters (e.g. S,F) */ /* * Methods implemented by the scheduler: * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. * q is NULL for !MULTIQUEUE. * Return 0 on success, 1 on drop (packet consumed anyways). 
 *	Note that q should be interpreted only as a hint
 *	on the flow that the mbuf belongs to: while a
 *	scheduler will normally enqueue m into q, it is ok
 *	to leave q alone and put the mbuf elsewhere.
 *	This function is called in two cases:
 *	- when a new packet arrives to the scheduler;
 *	- when a scheduler is reconfigured. In this case the
 *	  call is issued by the new_queue callback, with a
 *	  non empty queue (q) and m pointing to the first
 *	  mbuf in the queue. For this reason, the function
 *	  should internally check for (m != q->mq.head)
 *	  before calling dn_enqueue().
 *
 * dequeue	Called when scheduler instance 's' can
 *	dequeue a packet. Return NULL if none are available.
 *	XXX what about non work-conserving ?
 *
 * config	called on 'sched X config ...', normally writes
 *	in the area of size sch_arg
 *
 * destroy	called on 'sched delete', frees everything
 *	in sch_arg (other parts are handled by more specific
 *	functions)
 *
 * new_sched	called when a new instance is created, e.g.
 *	to create the local queue for !MULTIQUEUE, set V or
 *	copy parameters for WFQ, and so on.
 *
 * free_sched	called when deleting an instance, cleans
 *	extra data in the per-instance area.
 *
 * new_fsk	called when a flowset is linked to a scheduler,
 *	e.g. to validate parameters such as weights etc.
 * free_fsk	when a flowset is unlinked from a scheduler.
 *	(probably unnecessary)
 *
 * new_queue	called to set the per-queue parameters,
 *	e.g. S and F, adjust sum of weights in the parent, etc.
 *
 *	The new_queue callback is normally called when a new
 *	queue is created. In some cases (such as a scheduler
 *	change or reconfiguration) it can be called with a
 *	non empty queue; the callback may then need to requeue
 *	the pending traffic, and should eventually call
 *	enqueue() passing as m the first element in the queue.
 *
 * free_queue	actions related to a queue removal, e.g. undo
 *	all the above. If the queue has data in it, also remove
 *	from the scheduler. This can e.g. happen during a reconfigure.
 */
	int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
		struct mbuf *);
	struct mbuf * (*dequeue)(struct dn_sch_inst *);

	int (*config)(struct dn_schk *);
	int (*destroy)(struct dn_schk*);
	int (*new_sched)(struct dn_sch_inst *);
	int (*free_sched)(struct dn_sch_inst *);
	int (*new_fsk)(struct dn_fsk *f);
	int (*free_fsk)(struct dn_fsk *f);
	int (*new_queue)(struct dn_queue *q);
	int (*free_queue)(struct dn_queue *q);

	/* run-time fields */
	int ref_count;	/* XXX number of instances in the system */
	SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
};

/* MSVC does not support designated initializers so we need this ugly macro */
#ifdef _WIN32
#define _SI(fld)
#else
#define _SI(fld)	fld
#endif

/*
 * Additionally, dummynet exports some functions and macros
 * to be used by schedulers:
 */
void dn_free_pkts(struct mbuf *mnext);
int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
/* bound a variable between min and max */
int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);

/*
 * Extract the head of a queue, update stats. Must be the very last
 * thing done on a dequeue as the queue itself may go away.
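 *
 * Putting the hooks together, a minimal single-queue (!MULTIQUEUE)
 * scheduler module could look like the sketch below (hypothetical
 * names and type number; a real module also fills in q->ni.oid and
 * q->fs from a new_sched callback, as the bundled FIFO scheduler does):
 *
 *	static int
 *	myfifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q,
 *	    struct mbuf *m)
 *	{
 *		return (dn_enqueue((struct dn_queue *)(si + 1), m, 0));
 *	}
 *
 *	static struct mbuf *
 *	myfifo_dequeue(struct dn_sch_inst *si)
 *	{
 *		return (dn_dequeue((struct dn_queue *)(si + 1)));
 *	}
 *
 *	static struct dn_alg myfifo_desc = {
 *		_SI( .type = ) 1000,
 *		_SI( .name = ) "MYFIFO",
 *		_SI( .si_datalen = ) sizeof(struct dn_queue),
 *		_SI( .enqueue = ) myfifo_enqueue,
 *		_SI( .dequeue = ) myfifo_dequeue,
 *	};
 *	DECLARE_DNSCHED_MODULE(dn_myfifo, &myfifo_desc);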
*/ static __inline struct mbuf* dn_dequeue(struct dn_queue *q) { struct mbuf *m = q->mq.head; if (m == NULL) return NULL; q->mq.head = m->m_nextpkt; q->mq.count--; /* Update stats for the queue */ q->ni.length--; q->ni.len_bytes -= m->m_pkthdr.len; if (q->_si) { q->_si->ni.length--; q->_si->ni.len_bytes -= m->m_pkthdr.len; } if (q->ni.length == 0) /* queue is now idle */ q->q_time = dn_cfg.curr_time; return m; } int dn_sched_modevent(module_t mod, int cmd, void *arg); #define DECLARE_DNSCHED_MODULE(name, dnsched) \ static moduledata_t name##_mod = { \ #name, dn_sched_modevent, dnsched \ }; \ DECLARE_MODULE(name, name##_mod, \ - SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + SI_SUB_FW, SI_ORDER_ANY); \ MODULE_DEPEND(name, dummynet, 3, 3, 3) #endif /* _DN_SCHED_H */ Index: projects/vnet/sys/netpfil/ipfw/ip_dummynet.c =================================================================== --- projects/vnet/sys/netpfil/ipfw/ip_dummynet.c (revision 294565) +++ projects/vnet/sys/netpfil/ipfw/ip_dummynet.c (revision 294566) @@ -1,2321 +1,2321 @@ /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Configuration and internal object management for dummynet. */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include /* ip_output(), IP_FORWARDING */ #include #include #include #include #include #include /* which objects to copy */ #define DN_C_LINK 0x01 #define DN_C_SCH 0x02 #define DN_C_FLOW 0x04 #define DN_C_FS 0x08 #define DN_C_QUEUE 0x10 /* we use this argument in case of a schk_new */ struct schk_new_arg { struct dn_alg *fp; struct dn_sch *sch; }; /*---- callout hooks. 
----*/ static struct callout dn_timeout; static int dn_gone; static struct task dn_task; static struct taskqueue *dn_tq = NULL; static void dummynet(void *arg) { (void)arg; /* UNUSED */ taskqueue_enqueue_fast(dn_tq, &dn_task); } void dn_reschedule(void) { if (dn_gone != 0) return; callout_reset_sbt(&dn_timeout, tick_sbt, 0, dummynet, NULL, C_HARDCLOCK | C_DIRECT_EXEC); } /*----- end of callout hooks -----*/ /* Return a scheduler descriptor given the type or name. */ static struct dn_alg * find_sched_type(int type, char *name) { struct dn_alg *d; SLIST_FOREACH(d, &dn_cfg.schedlist, next) { if (d->type == type || (name && !strcasecmp(d->name, name))) return d; } return NULL; /* not found */ } int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) { int oldv = *v; const char *op = NULL; if (dflt < lo) dflt = lo; if (dflt > hi) dflt = hi; if (oldv < lo) { *v = dflt; op = "Bump"; } else if (oldv > hi) { *v = hi; op = "Clamp"; } else return *v; if (op && msg) printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); return *v; } /*---- flow_id mask, hash and compare functions ---*/ /* * The flow_id includes the 5-tuple, the queue/pipe number * which we store in the extra area in host order, * and for ipv6 also the flow_id6. * XXX see if we want the tos byte (can store in 'flags') */ static struct ipfw_flow_id * flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) { int is_v6 = IS_IP6_FLOW_ID(id); id->dst_port &= mask->dst_port; id->src_port &= mask->src_port; id->proto &= mask->proto; id->extra &= mask->extra; if (is_v6) { APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); APPLY_MASK(&id->src_ip6, &mask->src_ip6); id->flow_id6 &= mask->flow_id6; } else { id->dst_ip &= mask->dst_ip; id->src_ip &= mask->src_ip; } return id; } /* computes an OR of two masks, result in dst and also returned */ static struct ipfw_flow_id * flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) { int is_v6 = IS_IP6_FLOW_ID(dst); dst->dst_port |= src->dst_port; dst->src_port |= src->src_port; dst->proto |= src->proto; dst->extra |= src->extra; if (is_v6) { #define OR_MASK(_d, _s) \ (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; OR_MASK(&dst->dst_ip6, &src->dst_ip6); OR_MASK(&dst->src_ip6, &src->src_ip6); #undef OR_MASK dst->flow_id6 |= src->flow_id6; } else { dst->dst_ip |= src->dst_ip; dst->src_ip |= src->src_ip; } return dst; } static int nonzero_mask(struct ipfw_flow_id *m) { if (m->dst_port || m->src_port || m->proto || m->extra) return 1; if (IS_IP6_FLOW_ID(m)) { return m->dst_ip6.__u6_addr.__u6_addr32[0] || m->dst_ip6.__u6_addr.__u6_addr32[1] || m->dst_ip6.__u6_addr.__u6_addr32[2] || m->dst_ip6.__u6_addr.__u6_addr32[3] || m->src_ip6.__u6_addr.__u6_addr32[0] || m->src_ip6.__u6_addr.__u6_addr32[1] || m->src_ip6.__u6_addr.__u6_addr32[2] || m->src_ip6.__u6_addr.__u6_addr32[3] || m->flow_id6; } else { return m->dst_ip || m->src_ip; } } /* XXX we may want a better hash function */ static uint32_t flow_id_hash(struct ipfw_flow_id *id) { uint32_t i; if (IS_IP6_FLOW_ID(id)) { uint32_t *d = (uint32_t *)&id->dst_ip6; uint32_t *s = (uint32_t *)&id->src_ip6; i = (d[0] ) ^ (d[1]) ^ (d[2] ) ^ (d[3]) ^ (d[0] >> 15) ^ (d[1] >> 15) ^ (d[2] >> 15) ^ (d[3] >> 15) ^ (s[0] << 1) ^ (s[1] << 1) ^ (s[2] << 1) ^ (s[3] << 1) ^ (s[0] << 16) ^ (s[1] << 16) ^ (s[2] << 16) ^ (s[3] << 16) ^ (id->dst_port 
<< 1) ^ (id->src_port) ^ (id->extra) ^ (id->proto ) ^ (id->flow_id6); } else { i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ (id->src_ip << 1) ^ (id->src_ip >> 16) ^ (id->extra) ^ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); } return i; } /* Like bcmp, returns 0 if ids match, 1 otherwise. */ static int flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) { int is_v6 = IS_IP6_FLOW_ID(id1); if (!is_v6) { if (IS_IP6_FLOW_ID(id2)) return 1; /* different address families */ return (id1->dst_ip == id2->dst_ip && id1->src_ip == id2->src_ip && id1->dst_port == id2->dst_port && id1->src_port == id2->src_port && id1->proto == id2->proto && id1->extra == id2->extra) ? 0 : 1; } /* the ipv6 case */ return ( !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && id1->dst_port == id2->dst_port && id1->src_port == id2->src_port && id1->proto == id2->proto && id1->extra == id2->extra && id1->flow_id6 == id2->flow_id6) ? 0 : 1; } /*--------- end of flow-id mask, hash and compare ---------*/ /*--- support functions for the qht hashtable ---- * Entries are hashed by flow-id */ static uint32_t q_hash(uintptr_t key, int flags, void *arg) { /* compute the hash slot from the flow id */ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_queue *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_hash(id); } static int q_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_queue *o = (struct dn_queue *)obj; struct ipfw_flow_id *id2; if (flags & DNHT_KEY_IS_OBJ) { /* compare pointers */ id2 = &((struct dn_queue *)key)->ni.fid; } else { id2 = (struct ipfw_flow_id *)key; } return (0 == flow_id_cmp(&o->ni.fid, id2)); } /* * create a new queue instance for the given 'key'. */ static void * q_new(uintptr_t key, int flags, void *arg) { struct dn_queue *q, *template = arg; struct dn_fsk *fs = template->fs; int size = sizeof(*q) + fs->sched->fp->q_datalen; q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); if (q == NULL) { D("no memory for new queue"); return NULL; } set_oid(&q->ni.oid, DN_QUEUE, size); if (fs->fs.flags & DN_QHT_HASH) q->ni.fid = *(struct ipfw_flow_id *)key; q->fs = fs; q->_si = template->_si; q->_si->q_count++; if (fs->sched->fp->new_queue) fs->sched->fp->new_queue(q); dn_cfg.queue_count++; return q; } /* * Notify schedulers that a queue is going away. * If (flags & DN_DESTROY), also free the packets. * The version for callbacks is called q_delete_cb(). */ static void dn_delete_queue(struct dn_queue *q, int flags) { struct dn_fsk *fs = q->fs; // D("fs %p si %p\n", fs, q->_si); /* notify the parent scheduler that the queue is going away */ if (fs && fs->sched->fp->free_queue) fs->sched->fp->free_queue(q); q->_si->q_count--; q->_si = NULL; if (flags & DN_DESTROY) { if (q->mq.head) dn_free_pkts(q->mq.head); bzero(q, sizeof(*q)); // safety free(q, M_DUMMYNET); dn_cfg.queue_count--; } } static int q_delete_cb(void *q, void *arg) { int flags = (int)(uintptr_t)arg; dn_delete_queue(q, flags); return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; } /* * calls dn_delete_queue/q_delete_cb on all queues, * which notifies the parent scheduler and possibly drains packets. 
* flags & DN_DESTROY: drains queues and destroy qht; */ static void qht_delete(struct dn_fsk *fs, int flags) { ND("fs %d start flags %d qht %p", fs->fs.fs_nr, flags, fs->qht); if (!fs->qht) return; if (fs->fs.flags & DN_QHT_HASH) { dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); if (flags & DN_DESTROY) { dn_ht_free(fs->qht, 0); fs->qht = NULL; } } else { dn_delete_queue((struct dn_queue *)(fs->qht), flags); if (flags & DN_DESTROY) fs->qht = NULL; } } /* * Find and possibly create the queue for a MULTIQUEUE scheduler. * We never call it for !MULTIQUEUE (the queue is in the sch_inst). */ struct dn_queue * ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, struct ipfw_flow_id *id) { struct dn_queue template; template._si = si; template.fs = fs; if (fs->fs.flags & DN_QHT_HASH) { struct ipfw_flow_id masked_id; if (fs->qht == NULL) { fs->qht = dn_ht_init(NULL, fs->fs.buckets, offsetof(struct dn_queue, q_next), q_hash, q_match, q_new); if (fs->qht == NULL) return NULL; } masked_id = *id; flow_id_mask(&fs->fsk_mask, &masked_id); return dn_ht_find(fs->qht, (uintptr_t)&masked_id, DNHT_INSERT, &template); } else { if (fs->qht == NULL) fs->qht = q_new(0, 0, &template); return (struct dn_queue *)fs->qht; } } /*--- end of queue hash table ---*/ /*--- support functions for the sch_inst hashtable ---- * * These are hashed by flow-id */ static uint32_t si_hash(uintptr_t key, int flags, void *arg) { /* compute the hash slot from the flow id */ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_sch_inst *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_hash(id); } static int si_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_sch_inst *o = obj; struct ipfw_flow_id *id2; id2 = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_sch_inst *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_cmp(&o->ni.fid, id2) == 0; } /* * create a new instance for the given 'key' * Allocate memory for instance, delay line and scheduler private data. */ static void * si_new(uintptr_t key, int flags, void *arg) { struct dn_schk *s = arg; struct dn_sch_inst *si; int l = sizeof(*si) + s->fp->si_datalen; si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); if (si == NULL) goto error; /* Set length only for the part passed up to userland. */ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); set_oid(&(si->dline.oid), DN_DELAY_LINE, sizeof(struct delay_line)); /* mark si and dline as outside the event queue */ si->ni.oid.id = si->dline.oid.id = -1; si->sched = s; si->dline.si = si; if (s->fp->new_sched && s->fp->new_sched(si)) { D("new_sched error"); goto error; } if (s->sch.flags & DN_HAVE_MASK) si->ni.fid = *(struct ipfw_flow_id *)key; dn_cfg.si_count++; return si; error: if (si) { bzero(si, sizeof(*si)); // safety free(si, M_DUMMYNET); } return NULL; } /* * Callback from siht to delete all scheduler instances. Remove * si and delay line from the system heap, destroy all queues. * We assume that all flowset have been notified and do not * point to us anymore. 
*/ static int si_destroy(void *_si, void *arg) { struct dn_sch_inst *si = _si; struct dn_schk *s = si->sched; struct delay_line *dl = &si->dline; if (dl->oid.subtype) /* remove delay line from event heap */ heap_extract(&dn_cfg.evheap, dl); dn_free_pkts(dl->mq.head); /* drain delay line */ if (si->kflags & DN_ACTIVE) /* remove si from event heap */ heap_extract(&dn_cfg.evheap, si); if (s->fp->free_sched) s->fp->free_sched(si); bzero(si, sizeof(*si)); /* safety */ free(si, M_DUMMYNET); dn_cfg.si_count--; return DNHT_SCAN_DEL; } /* * Find the scheduler instance for this packet. If we need to apply * a mask, do on a local copy of the flow_id to preserve the original. * Assume siht is always initialized if we have a mask. */ struct dn_sch_inst * ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) { if (s->sch.flags & DN_HAVE_MASK) { struct ipfw_flow_id id_t = *id; flow_id_mask(&s->sch.sched_mask, &id_t); return dn_ht_find(s->siht, (uintptr_t)&id_t, DNHT_INSERT, s); } if (!s->siht) s->siht = si_new(0, 0, s); return (struct dn_sch_inst *)s->siht; } /* callback to flush credit for the scheduler instance */ static int si_reset_credit(void *_si, void *arg) { struct dn_sch_inst *si = _si; struct dn_link *p = &si->sched->link; si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); return 0; } static void schk_reset_credit(struct dn_schk *s) { if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, si_reset_credit, NULL); else if (s->siht) si_reset_credit(s->siht, NULL); } /*---- end of sch_inst hashtable ---------------------*/ /*------------------------------------------------------- * flowset hash (fshash) support. Entries are hashed by fs_nr. * New allocations are put in the fsunlinked list, from which * they are removed when they point to a specific scheduler. */ static uint32_t fsk_hash(uintptr_t key, int flags, void *arg) { uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_fsk *)key)->fs.fs_nr; return ( (i>>8)^(i>>4)^i ); } static int fsk_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_fsk *fs = obj; int i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_fsk *)key)->fs.fs_nr; return (fs->fs.fs_nr == i); } static void * fsk_new(uintptr_t key, int flags, void *arg) { struct dn_fsk *fs; fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); if (fs) { set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); dn_cfg.fsk_count++; fs->drain_bucket = 0; SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); } return fs; } /* * detach flowset from its current scheduler. Flags as follows: * DN_DETACH removes from the fsk_list * DN_DESTROY deletes individual queues * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). */ static void fsk_detach(struct dn_fsk *fs, int flags) { if (flags & DN_DELETE_FS) flags |= DN_DESTROY; ND("fs %d from sched %d flags %s %s %s", fs->fs.fs_nr, fs->fs.sched_nr, (flags & DN_DELETE_FS) ? "DEL_FS":"", (flags & DN_DESTROY) ? "DEL":"", (flags & DN_DETACH) ? "DET":""); if (flags & DN_DETACH) { /* detach from the list */ struct dn_fsk_head *h; h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; SLIST_REMOVE(h, fs, dn_fsk, sch_chain); } /* Free the RED parameters, they will be recomputed on * subsequent attach if needed. 
	 */
	if (fs->w_q_lookup)
		free(fs->w_q_lookup, M_DUMMYNET);
	fs->w_q_lookup = NULL;
	qht_delete(fs, flags);
	if (fs->sched && fs->sched->fp->free_fsk)
		fs->sched->fp->free_fsk(fs);
	fs->sched = NULL;
	if (flags & DN_DELETE_FS) {
		bzero(fs, sizeof(*fs));	/* safety */
		free(fs, M_DUMMYNET);
		dn_cfg.fsk_count--;
	} else {
		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
	}
}

/*
 * Detach or destroy all flowsets in a list.
 * flags specifies what to do:
 * DN_DESTROY:	flush all queues
 * DN_DELETE_FS:	DN_DESTROY + destroy flowset
 *	DN_DELETE_FS implies DN_DESTROY
 */
static void
fsk_detach_list(struct dn_fsk_head *h, int flags)
{
	struct dn_fsk *fs;
	int n = 0; /* only for stats */

	ND("head %p flags %x", h, flags);
	while ((fs = SLIST_FIRST(h))) {
		SLIST_REMOVE_HEAD(h, sch_chain);
		n++;
		fsk_detach(fs, flags);
	}
	ND("done %d flowsets", n);
}

/*
 * called on 'queue X delete' -- removes the flowset from fshash,
 * deletes all queues for the flowset, and removes the flowset.
 */
static int
delete_fs(int i, int locked)
{
	struct dn_fsk *fs;
	int err = 0;

	if (!locked)
		DN_BH_WLOCK();
	fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
	ND("fs %d found %p", i, fs);
	if (fs) {
		fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
		err = 0;
	} else
		err = EINVAL;
	if (!locked)
		DN_BH_WUNLOCK();
	return err;
}

/*----- end of flowset hashtable support -------------*/

/*------------------------------------------------------------
 * Scheduler hash. When searching by index we pass sched_nr,
 * otherwise we pass struct dn_sch * which is the first field in
 * struct dn_schk so we can cast between the two. We use this trick
 * in the create phase (but it should be fixed).
 */
static uint32_t
schk_hash(uintptr_t key, int flags, void *_arg)
{
	uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ?
		key : ((struct dn_schk *)key)->sch.sched_nr;
	return ( (i>>8)^(i>>4)^i );
}

static int
schk_match(void *obj, uintptr_t key, int flags, void *_arg)
{
	struct dn_schk *s = (struct dn_schk *)obj;
	int i = !(flags & DNHT_KEY_IS_OBJ) ?
		key : ((struct dn_schk *)key)->sch.sched_nr;
	return (s->sch.sched_nr == i);
}

/*
 * Create the entry and initialize it with the sched hash if needed.
 * Leave s->fp unset so we can tell whether a dn_ht_find() returns
 * a new object or a previously existing one.
 */
static void *
schk_new(uintptr_t key, int flags, void *arg)
{
	struct schk_new_arg *a = arg;
	struct dn_schk *s;
	int l = sizeof(*s) +a->fp->schk_datalen;

	s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return NULL;
	set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
	s->sch = *a->sch; // copy initial values
	s->link.link_nr = s->sch.sched_nr;
	SLIST_INIT(&s->fsk_list);
	/* initialize the hash table or create the single instance */
	s->fp = a->fp;	/* si_new needs this */
	s->drain_bucket = 0;
	if (s->sch.flags & DN_HAVE_MASK) {
		s->siht = dn_ht_init(NULL, s->sch.buckets,
			offsetof(struct dn_sch_inst, si_next),
			si_hash, si_match, si_new);
		if (s->siht == NULL) {
			free(s, M_DUMMYNET);
			return NULL;
		}
	}
	s->fp = NULL;	/* mark as a new scheduler */
	dn_cfg.schk_count++;
	return s;
}

/*
 * Callback for sched delete. Notify all attached flowsets to
 * detach from the scheduler, destroy the internal flowset and
 * all instances. The scheduler goes away too.
 * arg is 0 (only detach flowsets and destroy instances)
 * DN_DESTROY (detach & delete queues, delete schk)
 * or DN_DELETE_FS (delete queues and flowsets, delete schk)
 */
static int
schk_delete_cb(void *obj, void *arg)
{
	struct dn_schk *s = obj;
#if 0
	int a = (int)arg;
	ND("sched %d arg %s%s",
		s->sch.sched_nr,
		a&DN_DESTROY ? "DEL ":"",
		a&DN_DELETE_FS ?
"DEL_FS":""); #endif fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); /* no more flowset pointing to us now */ if (s->sch.flags & DN_HAVE_MASK) { dn_ht_scan(s->siht, si_destroy, NULL); dn_ht_free(s->siht, 0); } else if (s->siht) si_destroy(s->siht, NULL); if (s->profile) { free(s->profile, M_DUMMYNET); s->profile = NULL; } s->siht = NULL; if (s->fp->destroy) s->fp->destroy(s); bzero(s, sizeof(*s)); // safety free(obj, M_DUMMYNET); dn_cfg.schk_count--; return DNHT_SCAN_DEL; } /* * called on a 'sched X delete' command. Deletes a single scheduler. * This is done by removing from the schedhash, unlinking all * flowsets and deleting their traffic. */ static int delete_schk(int i) { struct dn_schk *s; s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); ND("%d %p", i, s); if (!s) return EINVAL; delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ /* then detach flowsets, delete traffic */ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); return 0; } /*--- end of schk hashtable support ---*/ static int copy_obj(char **start, char *end, void *_o, const char *msg, int i) { struct dn_id *o = _o; int have = end - *start; if (have < o->len || o->len == 0 || o->type == 0) { D("(WARN) type %d %s %d have %d need %d", o->type, msg, i, have, o->len); return 1; } ND("type %d %s %d len %d", o->type, msg, i, o->len); bcopy(_o, *start, o->len); if (o->type == DN_LINK) { /* Adjust burst parameter for link */ struct dn_link *l = (struct dn_link *)*start; l->burst = div64(l->burst, 8 * hz); l->delay = l->delay * 1000 / hz; } else if (o->type == DN_SCH) { /* Set id->id to the number of instances */ struct dn_schk *s = _o; struct dn_id *id = (struct dn_id *)(*start); id->id = (s->sch.flags & DN_HAVE_MASK) ? dn_ht_entries(s->siht) : (s->siht ? 1 : 0); } *start += o->len; return 0; } /* Specific function to copy a queue. * Copies only the user-visible part of a queue (which is in * a struct dn_flow), and sets len accordingly. */ static int copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) { struct dn_id *o = _o; int have = end - *start; int len = sizeof(struct dn_flow); /* see above comment */ if (have < len || o->len == 0 || o->type != DN_QUEUE) { D("ERROR type %d %s %d have %d need %d", o->type, msg, i, have, len); return 1; } ND("type %d %s %d len %d", o->type, msg, i, len); bcopy(_o, *start, len); ((struct dn_id*)(*start))->len = len; *start += len; return 0; } static int copy_q_cb(void *obj, void *arg) { struct dn_queue *q = obj; struct copy_args *a = arg; struct dn_flow *ni = (struct dn_flow *)(*a->start); if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) return DNHT_SCAN_END; ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); return 0; } static int copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) { if (!fs->qht) return 0; if (fs->fs.flags & DN_QHT_HASH) dn_ht_scan(fs->qht, copy_q_cb, a); else copy_q_cb(fs->qht, a); return 0; } /* * This routine only copies the initial part of a profile ? 
XXX */ static int copy_profile(struct copy_args *a, struct dn_profile *p) { int have = a->end - *a->start; /* XXX here we check for max length */ int profile_len = sizeof(struct dn_profile) - ED_MAX_SAMPLES_NO*sizeof(int); if (p == NULL) return 0; if (have < profile_len) { D("error have %d need %d", have, profile_len); return 1; } bcopy(p, *a->start, profile_len); ((struct dn_id *)(*a->start))->len = profile_len; *a->start += profile_len; return 0; } static int copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) { struct dn_fs *ufs = (struct dn_fs *)(*a->start); if (!fs) return 0; ND("flowset %d", fs->fs.fs_nr); if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) return DNHT_SCAN_END; ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); if (flags) { /* copy queues */ copy_q(a, fs, 0); } return 0; } static int copy_si_cb(void *obj, void *arg) { struct dn_sch_inst *si = obj; struct copy_args *a = arg; struct dn_flow *ni = (struct dn_flow *)(*a->start); if (copy_obj(a->start, a->end, &si->ni, "inst", si->sched->sch.sched_nr)) return DNHT_SCAN_END; ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); return 0; } static int copy_si(struct copy_args *a, struct dn_schk *s, int flags) { if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, copy_si_cb, a); else if (s->siht) copy_si_cb(s->siht, a); return 0; } /* * compute a list of children of a scheduler and copy up */ static int copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) { struct dn_fsk *fs; struct dn_id *o; uint32_t *p; int n = 0, space = sizeof(*o); SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { if (fs->fs.fs_nr < DN_MAX_ID) n++; } space += n * sizeof(uint32_t); DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); if (a->end - *(a->start) < space) return DNHT_SCAN_END; o = (struct dn_id *)(*(a->start)); o->len = space; *a->start += o->len; o->type = DN_TEXT; p = (uint32_t *)(o+1); SLIST_FOREACH(fs, &s->fsk_list, sch_chain) if (fs->fs.fs_nr < DN_MAX_ID) *p++ = fs->fs.fs_nr; return 0; } static int copy_data_helper(void *_o, void *_arg) { struct copy_args *a = _arg; uint32_t *r = a->extra->r; /* start of first range */ uint32_t *lim; /* first invalid pointer */ int n; lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); if (a->type == DN_LINK || a->type == DN_SCH) { /* pipe|sched show, we receive a dn_schk */ struct dn_schk *s = _o; n = s->sch.sched_nr; if (a->type == DN_SCH && n >= DN_MAX_ID) return 0; /* not a scheduler */ if (a->type == DN_LINK && n <= DN_MAX_ID) return 0; /* not a pipe */ /* see if the object is within one of our ranges */ for (;r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; /* Found a valid entry, copy and we are done */ if (a->flags & DN_C_LINK) { if (copy_obj(a->start, a->end, &s->link, "link", n)) return DNHT_SCAN_END; if (copy_profile(a, s->profile)) return DNHT_SCAN_END; if (copy_flowset(a, s->fs, 0)) return DNHT_SCAN_END; } if (a->flags & DN_C_SCH) { if (copy_obj(a->start, a->end, &s->sch, "sched", n)) return DNHT_SCAN_END; /* list all attached flowsets */ if (copy_fsk_list(a, s, 0)) return DNHT_SCAN_END; } if (a->flags & DN_C_FLOW) copy_si(a, s, 0); break; } } else if (a->type == DN_FS) { /* queue show, skip internal flowsets */ struct dn_fsk *fs = _o; n = fs->fs.fs_nr; if (n >= DN_MAX_ID) return 0; /* see if the object is within one of our ranges */ for (;r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; if (copy_flowset(a, fs, 0)) return DNHT_SCAN_END; 
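			/* Range matched: export the flowset's queues too. */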
			copy_q(a, fs, 0);
			break;	/* we are done */
		}
	}
	return 0;
}

static inline struct dn_schk *
locate_scheduler(int i)
{
	return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
}

/*
 * RED parameters are in fixed-point arithmetic.
 */
static int
config_red(struct dn_fsk *fs)
{
	int64_t s, idle, weight, w0;
	int t, i;

	fs->w_q = fs->fs.w_q;
	fs->max_p = fs->fs.max_p;
	ND("called");

	/* Doing stuff that was in userland */
	i = fs->sched->link.bandwidth;
	s = (i <= 0) ? 0 :
	    hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;

	idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
	fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
	/* fs->lookup_step not scaled, */
	if (!fs->lookup_step)
		fs->lookup_step = 1;
	w0 = weight = SCALE(1) - fs->w_q; // fs->w_q scaled

	for (t = fs->lookup_step; t > 1; --t)
		weight = SCALE_MUL(weight, w0);
	fs->lookup_weight = (int)(weight); // scaled

	/* Now doing stuff that was in kerneland */
	fs->min_th = SCALE(fs->fs.min_th);
	fs->max_th = SCALE(fs->fs.max_th);

	if (fs->fs.max_th == fs->fs.min_th)
		fs->c_1 = fs->max_p;
	else
		fs->c_1 = SCALE((int64_t)(fs->max_p)) /
		    (fs->fs.max_th - fs->fs.min_th);
	fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));

	if (fs->fs.flags & DN_IS_GENTLE_RED) {
		fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
		fs->c_4 = SCALE(1) - 2 * fs->max_p;
	}

	/* If the lookup table already exists, free and create it again. */
	if (fs->w_q_lookup) {
		free(fs->w_q_lookup, M_DUMMYNET);
		fs->w_q_lookup = NULL;
	}
	if (dn_cfg.red_lookup_depth == 0) {
		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
		    " must be > 0\n");
		fs->fs.flags &= ~DN_IS_RED;
		fs->fs.flags &= ~DN_IS_GENTLE_RED;
		return (EINVAL);
	}
	fs->lookup_depth = dn_cfg.red_lookup_depth;
	fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
	    M_DUMMYNET, M_NOWAIT);
	if (fs->w_q_lookup == NULL) {
		printf("dummynet: sorry, cannot allocate red lookup table\n");
		fs->fs.flags &= ~DN_IS_RED;
		fs->fs.flags &= ~DN_IS_GENTLE_RED;
		return(ENOSPC);
	}

	/* Fill the lookup table with (1 - w_q)^x */
	fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
	for (i = 1; i < fs->lookup_depth; i++)
		fs->w_q_lookup[i] =
		    SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);

	if (dn_cfg.red_avg_pkt_size < 1)
		dn_cfg.red_avg_pkt_size = 512;
	fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
	if (dn_cfg.red_max_pkt_size < 1)
		dn_cfg.red_max_pkt_size = 1500;
	fs->max_pkt_size = dn_cfg.red_max_pkt_size;
	ND("exit");
	return 0;
}

/* Scan all flowsets attached to this scheduler and update red */
static void
update_red(struct dn_schk *s)
{
	struct dn_fsk *fs;
	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
		if (fs && (fs->fs.flags & DN_IS_RED))
			config_red(fs);
	}
}
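A standalone illustration of the fixed-point scheme used by config_red()
above: the table holds successive powers of (1 - w_q) so the queue average
can be aged by table lookup rather than repeated multiplication while the
link is idle. SCALE_BITS and the step of one are choices made for this
sketch only; the kernel uses its own SCALE()/SCALE_MUL() macros and
fs->lookup_step:

	#include <stdint.h>
	#include <stdio.h>

	#define	SCALE_BITS	16	/* sketch-only choice */
	#define	FP(x)		((int64_t)(x) << SCALE_BITS)
	#define	FP_MUL(a, b)	(((a) * (b)) >> SCALE_BITS)

	int
	main(void)
	{
		int64_t w_q = FP(1) / 500;	/* w_q = 0.002, scaled */
		int64_t tab[8];
		int i;

		tab[0] = FP(1) - w_q;		/* (1 - w_q)^1 */
		for (i = 1; i < 8; i++)		/* (1 - w_q)^(i+1) */
			tab[i] = FP_MUL(tab[i - 1], tab[0]);
		for (i = 0; i < 8; i++)
			printf("%d: %.6f\n", i + 1, (double)tab[i] / FP(1));
		return (0);
	}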
*/ D("XXX TODO requeue from fs %d to sch %d", fs->fs.fs_nr, s->sch.sched_nr); fs->qht = NULL; } /* set the new type for qht */ if (nonzero_mask(&fs->fsk_mask)) fs->fs.flags |= DN_QHT_HASH; else fs->fs.flags &= ~DN_QHT_HASH; /* XXX config_red() can fail... */ if (fs->fs.flags & DN_IS_RED) config_red(fs); } /* update all flowsets which may refer to this scheduler */ static void update_fs(struct dn_schk *s) { struct dn_fsk *fs, *tmp; SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { if (s->sch.sched_nr != fs->fs.sched_nr) { D("fs %d for sch %d not %d still unlinked", fs->fs.fs_nr, fs->fs.sched_nr, s->sch.sched_nr); continue; } fsk_attach(fs, s); } } /* * Configuration -- to preserve backward compatibility we use * the following scheme (N is 65536) * NUMBER SCHED LINK FLOWSET * 1 .. N-1 (1)WFQ (2)WFQ (3)queue * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 * * "pipe i config" configures #1, #2 and #3 * "sched i config" configures #1 and possibly #6 * "queue i config" configures #3 * #1 is configured with 'pipe i config' or 'sched i config' * #2 is configured with 'pipe i config', and created if not * existing with 'sched i config' * #3 is configured with 'queue i config' * #4 is automatically configured after #1, can only be FIFO * #5 is automatically configured after #2 * #6 is automatically created when #1 is !MULTIQUEUE, * and can be updated. * #7 is automatically configured after #2 */ /* * configure a link (and its FIFO instance) */ static int config_link(struct dn_link *p, struct dn_id *arg) { int i; if (p->oid.len != sizeof(*p)) { D("invalid pipe len %d", p->oid.len); return EINVAL; } i = p->link_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes * burst ??? */ p->delay = (p->delay * hz) / 1000; /* Scale burst size: bytes -> bits * hz */ p->burst *= 8 * hz; DN_BH_WLOCK(); /* do it twice, base link and FIFO link */ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { struct dn_schk *s = locate_scheduler(i); if (s == NULL) { DN_BH_WUNLOCK(); D("sched %d not found", i); return EINVAL; } /* remove profile if exists */ if (s->profile) { free(s->profile, M_DUMMYNET); s->profile = NULL; } /* copy all parameters */ s->link.oid = p->oid; s->link.link_nr = i; s->link.delay = p->delay; if (s->link.bandwidth != p->bandwidth) { /* XXX bandwidth changes, need to update red params */ s->link.bandwidth = p->bandwidth; update_red(s); } s->link.burst = p->burst; schk_reset_credit(s); } dn_cfg.id++; DN_BH_WUNLOCK(); return 0; } /* * configure a flowset. 
Can be called from inside with locked=1, */ static struct dn_fsk * config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) { int i; struct dn_fsk *fs; if (nfs->oid.len != sizeof(*nfs)) { D("invalid flowset len %d", nfs->oid.len); return NULL; } i = nfs->fs_nr; if (i <= 0 || i >= 3*DN_MAX_ID) return NULL; ND("flowset %d", i); /* XXX other sanity checks */ if (nfs->flags & DN_QSIZE_BYTES) { ipdn_bound_var(&nfs->qsize, 16384, 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); } else { ipdn_bound_var(&nfs->qsize, 50, 1, dn_cfg.slot_limit, NULL); // "queue slot size"); } if (nfs->flags & DN_HAVE_MASK) { /* make sure we have some buckets */ ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size, 1, dn_cfg.max_hash_size, "flowset buckets"); } else { nfs->buckets = 1; /* we only need 1 */ } if (!locked) DN_BH_WLOCK(); do { /* exit with break when done */ struct dn_schk *s; int flags = nfs->sched_nr ? DNHT_INSERT : 0; int j; int oldc = dn_cfg.fsk_count; fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); if (fs == NULL) { D("missing sched for flowset %d", i); break; } /* grab some defaults from the existing one */ if (nfs->sched_nr == 0) /* reuse */ nfs->sched_nr = fs->fs.sched_nr; for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { if (nfs->par[j] == -1) /* reuse */ nfs->par[j] = fs->fs.par[j]; } if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { ND("flowset %d unchanged", i); break; /* no change, nothing to do */ } if (oldc != dn_cfg.fsk_count) /* new item */ dn_cfg.id++; s = locate_scheduler(nfs->sched_nr); /* detach from old scheduler if needed, preserving * queues if we need to reattach. Then update the * configuration, and possibly attach to the new sched. */ DX(2, "fs %d changed sched %d@%p to %d@%p", fs->fs.fs_nr, fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); if (fs->sched) { int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); flags |= DN_DESTROY; /* XXX temporary */ fsk_detach(fs, flags); } fs->fs = *nfs; /* copy configuration */ if (s != NULL) fsk_attach(fs, s); } while (0); if (!locked) DN_BH_WUNLOCK(); return fs; } /* * config/reconfig a scheduler and its FIFO variant. * For !MULTIQUEUE schedulers, also set up the flowset. * * On reconfigurations (detected because s->fp is set), * detach existing flowsets preserving traffic, preserve link, * and delete the old scheduler creating a new one. */ static int config_sched(struct dn_sch *_nsch, struct dn_id *arg) { struct dn_schk *s; struct schk_new_arg a; /* argument for schk_new */ int i; struct dn_link p; /* copy of oldlink */ struct dn_profile *pf = NULL; /* copy of old link profile */ /* Used to preserv mask parameter */ struct ipfw_flow_id new_mask; int new_buckets = 0; int new_flags = 0; int pipe_cmd; int err = ENOMEM; a.sch = _nsch; if (a.sch->oid.len != sizeof(*a.sch)) { D("bad sched len %d", a.sch->oid.len); return EINVAL; } i = a.sch->sched_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* make sure we have some buckets */ if (a.sch->flags & DN_HAVE_MASK) ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size, 1, dn_cfg.max_hash_size, "sched buckets"); /* XXX other sanity checks */ bzero(&p, sizeof(p)); pipe_cmd = a.sch->flags & DN_PIPE_CMD; a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? if (pipe_cmd) { /* Copy mask parameter */ new_mask = a.sch->sched_mask; new_buckets = a.sch->buckets; new_flags = a.sch->flags; } DN_BH_WLOCK(); again: /* run twice, for wfq and fifo */ /* * lookup the type. If not supplied, use the previous one * or default to WF2Q+. Otherwise, return an error. 
*/ dn_cfg.id++; a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); if (a.fp != NULL) { /* found. Lookup or create entry */ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { /* No type. search existing s* or retry with WF2Q+ */ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); if (s != NULL) { a.fp = s->fp; /* Scheduler exists, skip to FIFO scheduler * if command was pipe config... */ if (pipe_cmd) goto next; } else { /* New scheduler, create a wf2q+ with no mask * if command was pipe config... */ if (pipe_cmd) { /* clear mask parameter */ bzero(&a.sch->sched_mask, sizeof(new_mask)); a.sch->buckets = 0; a.sch->flags &= ~DN_HAVE_MASK; } a.sch->oid.subtype = DN_SCHED_WF2QP; goto again; } } else { D("invalid scheduler type %d %s", a.sch->oid.subtype, a.sch->name); err = EINVAL; goto error; } /* normalize name and subtype */ a.sch->oid.subtype = a.fp->type; bzero(a.sch->name, sizeof(a.sch->name)); strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); if (s == NULL) { D("cannot allocate scheduler %d", i); goto error; } /* restore existing link if any */ if (p.link_nr) { s->link = p; if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ s->profile = NULL; /* XXX maybe not needed */ } else { s->profile = malloc(sizeof(struct dn_profile), M_DUMMYNET, M_NOWAIT | M_ZERO); if (s->profile == NULL) { D("cannot allocate profile"); goto error; //XXX } bcopy(pf, s->profile, sizeof(*pf)); } } p.link_nr = 0; if (s->fp == NULL) { DX(2, "sched %d new type %s", i, a.fp->name); } else if (s->fp != a.fp || bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { /* already existing. */ DX(2, "sched %d type changed from %s to %s", i, s->fp->name, a.fp->name); DX(4, " type/sub %d/%d -> %d/%d", s->sch.oid.type, s->sch.oid.subtype, a.sch->oid.type, a.sch->oid.subtype); if (s->link.link_nr == 0) D("XXX WARNING link 0 for sched %d", i); p = s->link; /* preserve link */ if (s->profile) {/* preserve profile */ if (!pf) pf = malloc(sizeof(*pf), M_DUMMYNET, M_NOWAIT | M_ZERO); if (pf) /* XXX should issue a warning otherwise */ bcopy(s->profile, pf, sizeof(*pf)); } /* remove from the hash */ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); /* Detach flowsets, preserve queues. 
*/ // schk_delete_cb(s, NULL); // XXX temporarily, kill queues schk_delete_cb(s, (void *)DN_DESTROY); goto again; } else { DX(4, "sched %d unchanged type %s", i, a.fp->name); } /* complete initialization */ s->sch = *a.sch; s->fp = a.fp; s->cfg = arg; // XXX schk_reset_credit(s); /* create the internal flowset if needed, * trying to reuse existing ones if available */ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); if (!s->fs) { struct dn_fs fs; bzero(&fs, sizeof(fs)); set_oid(&fs.oid, DN_FS, sizeof(fs)); fs.fs_nr = i + DN_MAX_ID; fs.sched_nr = i; s->fs = config_fs(&fs, NULL, 1 /* locked */); } if (!s->fs) { schk_delete_cb(s, (void *)DN_DESTROY); D("error creating internal fs for %d", i); goto error; } } /* call init function after the flowset is created */ if (s->fp->config) s->fp->config(s); update_fs(s); next: if (i < DN_MAX_ID) { /* now configure the FIFO instance */ i += DN_MAX_ID; if (pipe_cmd) { /* Restore mask parameter for FIFO */ a.sch->sched_mask = new_mask; a.sch->buckets = new_buckets; a.sch->flags = new_flags; } else { /* sched config shouldn't modify the FIFO scheduler */ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { /* FIFO already exist, don't touch it */ err = 0; /* and this is not an error */ goto error; } } a.sch->sched_nr = i; a.sch->oid.subtype = DN_SCHED_FIFO; bzero(a.sch->name, sizeof(a.sch->name)); goto again; } err = 0; error: DN_BH_WUNLOCK(); if (pf) free(pf, M_DUMMYNET); return err; } /* * attach a profile to a link */ static int config_profile(struct dn_profile *pf, struct dn_id *arg) { struct dn_schk *s; int i, olen, err = 0; if (pf->oid.len < sizeof(*pf)) { D("short profile len %d", pf->oid.len); return EINVAL; } i = pf->link_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* XXX other sanity checks */ DN_BH_WLOCK(); for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { s = locate_scheduler(i); if (s == NULL) { err = EINVAL; break; } dn_cfg.id++; /* * If we had a profile and the new one does not fit, * or it is deleted, then we need to free memory. */ if (s->profile && (pf->samples_no == 0 || s->profile->oid.len < pf->oid.len)) { free(s->profile, M_DUMMYNET); s->profile = NULL; } if (pf->samples_no == 0) continue; /* * new profile, possibly allocate memory * and copy data. */ if (s->profile == NULL) s->profile = malloc(pf->oid.len, M_DUMMYNET, M_NOWAIT | M_ZERO); if (s->profile == NULL) { D("no memory for profile %d", i); err = ENOMEM; break; } /* preserve larger length XXX double check */ olen = s->profile->oid.len; if (olen < pf->oid.len) olen = pf->oid.len; bcopy(pf, s->profile, pf->oid.len); s->profile->oid.len = olen; } DN_BH_WUNLOCK(); return err; } /* * Delete all objects: */ static void dummynet_flush(void) { /* delete all schedulers and related links/queues/flowsets */ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, (void *)(uintptr_t)DN_DELETE_FS); /* delete all remaining (unlinked) flowsets */ DX(4, "still %d unlinked fs", dn_cfg.fsk_count); dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); /* Reinitialize system heap... */ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); } /* * Main handler for configuration. We are guaranteed to be called * with an oid which is at least a dn_id. * - the first object is the command (config, delete, flush, ...) * - config_link must be issued after the corresponding config_sched * - parameters (DN_TXT) for an object must preceed the object * processed on a config_sched. 
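 */

do_config() below expects the first dn_id in the buffer to carry
DN_API_VERSION in its id field, followed by one header per command or object.
A hedged sketch of how a userland tool might build a "flush" request before
handing it to the IP_DUMMYNET3 socket option (the header layout here is
simplified; the real structure and constants come from the dummynet headers):

	#include <stdint.h>
	#include <string.h>

	struct dn_cmd_sketch {		/* stand-in for struct dn_id */
		uint16_t len;
		uint16_t type;		/* e.g. DN_CMD_FLUSH */
		uint16_t subtype;
		uint32_t id;		/* DN_API_VERSION on the first object */
	};

	static size_t
	build_flush(char *buf, uint16_t cmd_flush, uint32_t api_version)
	{
		struct dn_cmd_sketch o;

		memset(&o, 0, sizeof(o));
		o.len = sizeof(o);
		o.type = cmd_flush;
		o.id = api_version;	/* checked before any command runs */
		memcpy(buf, &o, sizeof(o));
		return (sizeof(o));
	}

/*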
*/ int do_config(void *p, int l) { struct dn_id *next, *o; int err = 0, err2 = 0; struct dn_id *arg = NULL; uintptr_t *a; o = p; if (o->id != DN_API_VERSION) { D("invalid api version got %d need %d", o->id, DN_API_VERSION); return EINVAL; } for (; l >= sizeof(*o); o = next) { struct dn_id *prev = arg; if (o->len < sizeof(*o) || l < o->len) { D("bad len o->len %d len %d", o->len, l); err = EINVAL; break; } l -= o->len; next = (struct dn_id *)((char *)o + o->len); err = 0; switch (o->type) { default: D("cmd %d not implemented", o->type); break; #ifdef EMULATE_SYSCTL /* sysctl emulation. * if we recognize the command, jump to the correct * handler and return */ case DN_SYSCTL_SET: err = kesysctl_emu_set(p, l); return err; #endif case DN_CMD_CONFIG: /* simply a header */ break; case DN_CMD_DELETE: /* the argument is in the first uintptr_t after o */ a = (uintptr_t *)(o+1); if (o->len < sizeof(*o) + sizeof(*a)) { err = EINVAL; break; } switch (o->subtype) { case DN_LINK: /* delete base and derived schedulers */ DN_BH_WLOCK(); err = delete_schk(*a); err2 = delete_schk(*a + DN_MAX_ID); DN_BH_WUNLOCK(); if (!err) err = err2; break; default: D("invalid delete type %d", o->subtype); err = EINVAL; break; case DN_FS: err = (*a <1 || *a >= DN_MAX_ID) ? EINVAL : delete_fs(*a, 0) ; break; } break; case DN_CMD_FLUSH: DN_BH_WLOCK(); dummynet_flush(); DN_BH_WUNLOCK(); break; case DN_TEXT: /* store argument the next block */ prev = NULL; arg = o; break; case DN_LINK: err = config_link((struct dn_link *)o, arg); break; case DN_PROFILE: err = config_profile((struct dn_profile *)o, arg); break; case DN_SCH: err = config_sched((struct dn_sch *)o, arg); break; case DN_FS: err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); break; } if (prev) arg = NULL; if (err != 0) break; } return err; } static int compute_space(struct dn_id *cmd, struct copy_args *a) { int x = 0, need = 0; int profile_size = sizeof(struct dn_profile) - ED_MAX_SAMPLES_NO*sizeof(int); /* NOTE about compute space: * NP = dn_cfg.schk_count * NSI = dn_cfg.si_count * NF = dn_cfg.fsk_count * NQ = dn_cfg.queue_count * - ipfw pipe show * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler * link, scheduler template, flowset * integrated in scheduler and header * for flowset list * (NSI)*(dn_flow) all scheduler instance (includes * the queue instance) * - ipfw sched show * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler * link, scheduler template, flowset * integrated in scheduler and header * for flowset list * (NSI * dn_flow) all scheduler instances * (NF * sizeof(uint_32)) space for flowset list linked to scheduler * (NQ * dn_queue) all queue [XXXfor now not listed] * - ipfw queue show * (NF * dn_fs) all flowset * (NQ * dn_queue) all queues */ switch (cmd->subtype) { default: return -1; /* XXX where do LINK and SCH differ ? */ /* 'ipfw sched show' could list all queues associated to * a scheduler. 
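 */

The estimate in compute_space() above can be checked by hand: schk_count
includes the FIFO twins, hence the divisions by 2, and each visible pipe
contributes a link, a scheduler template, a flowset-list header and a
flowset plus truncated profile, while flowsets and scheduler instances are
charged per item. The same arithmetic with made-up sizes (all the sz_*
values are stand-ins for the real sizeof() results):

	#include <stdio.h>

	int
	main(void)
	{
		/* counts as kept in dn_cfg, FIFO twins included */
		int schk_count = 4, fsk_count = 3, si_count = 5;
		int sz_link = 64, sz_sch = 96, sz_id = 8, sz_fs = 96,
		    sz_profile = 40, sz_flow = 64;
		int need = 0;

		need += schk_count * (sz_fs + sz_profile) / 2;
		need += fsk_count * 4;		/* flowset number list */
		need += schk_count * sz_sch / 2;
		need += schk_count * sz_id / 2;	/* flowset-list headers */
		need += schk_count * sz_link / 2;
		need += si_count * sz_flow;	/* one dn_flow per instance */
		printf("estimated buffer: %d bytes\n", need);
		return (0);
	}

/*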
This feature for now is disabled */ case DN_LINK: /* pipe show */ x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; need += dn_cfg.schk_count * (sizeof(struct dn_fs) + profile_size) / 2; need += dn_cfg.fsk_count * sizeof(uint32_t); break; case DN_SCH: /* sched show */ need += dn_cfg.schk_count * (sizeof(struct dn_fs) + profile_size) / 2; need += dn_cfg.fsk_count * sizeof(uint32_t); x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; break; case DN_FS: /* queue show */ x = DN_C_FS | DN_C_QUEUE; break; case DN_GET_COMPAT: /* compatibility mode */ need = dn_compat_calc_size(); break; } a->flags = x; if (x & DN_C_SCH) { need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; /* NOT also, each fs might be attached to a sched */ need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; } if (x & DN_C_FS) need += dn_cfg.fsk_count * sizeof(struct dn_fs); if (x & DN_C_LINK) { need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; } /* * When exporting a queue to userland, only pass up the * struct dn_flow, which is the only visible part. */ if (x & DN_C_QUEUE) need += dn_cfg.queue_count * sizeof(struct dn_flow); if (x & DN_C_FLOW) need += dn_cfg.si_count * (sizeof(struct dn_flow)); return need; } /* * If compat != NULL dummynet_get is called in compatibility mode. * *compat will be the pointer to the buffer to pass to ipfw */ int dummynet_get(struct sockopt *sopt, void **compat) { int have, i, need, error; char *start = NULL, *buf; size_t sopt_valsize; struct dn_id *cmd; struct copy_args a; struct copy_range r; int l = sizeof(struct dn_id); bzero(&a, sizeof(a)); bzero(&r, sizeof(r)); /* save and restore original sopt_valsize around copyin */ sopt_valsize = sopt->sopt_valsize; cmd = &r.o; if (!compat) { /* copy at least an oid, and possibly a full object */ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); sopt->sopt_valsize = sopt_valsize; if (error) goto done; l = cmd->len; #ifdef EMULATE_SYSCTL /* sysctl emulation. */ if (cmd->type == DN_SYSCTL_GET) return kesysctl_emu_get(sopt); #endif if (l > sizeof(r)) { /* request larger than default, allocate buffer */ cmd = malloc(l, M_DUMMYNET, M_WAITOK); error = sooptcopyin(sopt, cmd, l, l); sopt->sopt_valsize = sopt_valsize; if (error) goto done; } } else { /* compatibility */ error = 0; cmd->type = DN_CMD_GET; cmd->len = sizeof(struct dn_id); cmd->subtype = DN_GET_COMPAT; // cmd->id = sopt_valsize; D("compatibility mode"); } a.extra = (struct copy_range *)cmd; if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ uint32_t *rp = (uint32_t *)(cmd + 1); cmd->len += 2* sizeof(uint32_t); rp[0] = 1; rp[1] = DN_MAX_ID - 1; if (cmd->subtype == DN_LINK) { rp[0] += DN_MAX_ID; rp[1] += DN_MAX_ID; } } /* Count space (under lock) and allocate (outside lock). * Exit with lock held if we manage to get enough buffer. * Try a few times then give up. 
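 */

The loop below is the usual "size under the lock, allocate outside it" dance:
the counts can change as soon as the lock is dropped, so the estimate is
redone after every allocation until the buffer is large enough or the
attempts run out. The same pattern in miniature, with a pthread mutex
standing in for the kernel lock (the whole snippet is an illustrative
userland analogue):

	#include <pthread.h>
	#include <stdlib.h>

	static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

	static size_t
	current_need(void)		/* must be called locked */
	{
		return (4096);		/* stands in for compute_space() */
	}

	static char *
	snapshot(size_t *lenp)
	{
		char *buf = NULL;
		size_t have = 0, need;
		int i;

		for (i = 0; i < 10; i++) {
			pthread_mutex_lock(&lk);
			need = current_need();
			if (buf != NULL && have >= need) {
				*lenp = need;	/* copy out, then unlock */
				pthread_mutex_unlock(&lk);
				return (buf);
			}
			pthread_mutex_unlock(&lk);
			free(buf);
			if ((buf = malloc(need)) == NULL)
				break;
			have = need;
		}
		free(buf);
		return (NULL);
	}

/*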
*/ for (have = 0, i = 0; i < 10; i++) { DN_BH_WLOCK(); need = compute_space(cmd, &a); /* if there is a range, ignore value from compute_space() */ if (l > sizeof(*cmd)) need = sopt_valsize - sizeof(*cmd); if (need < 0) { DN_BH_WUNLOCK(); error = EINVAL; goto done; } need += sizeof(*cmd); cmd->id = need; if (have >= need) break; DN_BH_WUNLOCK(); if (start) free(start, M_DUMMYNET); start = NULL; if (need > sopt_valsize) break; have = need; start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); } if (start == NULL) { if (compat) { *compat = NULL; error = 1; // XXX } else { error = sooptcopyout(sopt, cmd, sizeof(*cmd)); } goto done; } ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " "%d:%d si %d, %d:%d queues %d", dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); sopt->sopt_valsize = sopt_valsize; a.type = cmd->subtype; if (compat == NULL) { bcopy(cmd, start, sizeof(*cmd)); ((struct dn_id*)(start))->len = sizeof(struct dn_id); buf = start + sizeof(*cmd); } else buf = start; a.start = &buf; a.end = start + have; /* start copying other objects */ if (compat) { a.type = DN_COMPAT_PIPE; dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); a.type = DN_COMPAT_QUEUE; dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); } else if (a.type == DN_FS) { dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); } else { dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); } DN_BH_WUNLOCK(); if (compat) { *compat = start; sopt->sopt_valsize = buf - start; /* free() is done by ip_dummynet_compat() */ start = NULL; //XXX hack } else { error = sooptcopyout(sopt, start, buf - start); } done: if (cmd && cmd != &r.o) free(cmd, M_DUMMYNET); if (start) free(start, M_DUMMYNET); return error; } /* Callback called on scheduler instance to delete it if idle */ static int drain_scheduler_cb(void *_si, void *arg) { struct dn_sch_inst *si = _si; if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) return 0; if (si->sched->fp->flags & DN_MULTIQUEUE) { if (si->q_count == 0) return si_destroy(si, NULL); else return 0; } else { /* !DN_MULTIQUEUE */ if ((si+1)->ni.length == 0) return si_destroy(si, NULL); else return 0; } return 0; /* unreachable */ } /* Callback called on scheduler to check if it has instances */ static int drain_scheduler_sch_cb(void *_s, void *arg) { struct dn_schk *s = _s; if (s->sch.flags & DN_HAVE_MASK) { dn_ht_scan_bucket(s->siht, &s->drain_bucket, drain_scheduler_cb, NULL); s->drain_bucket++; } else { if (s->siht) { if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) s->siht = NULL; } } return 0; } /* Called every tick, try to delete a 'bucket' of scheduler */ void dn_drain_scheduler(void) { dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, drain_scheduler_sch_cb, NULL); dn_cfg.drain_sch++; } /* Callback called on queue to delete if it is idle */ static int drain_queue_cb(void *_q, void *arg) { struct dn_queue *q = _q; if (q->ni.length == 0) { dn_delete_queue(q, DN_DESTROY); return DNHT_SCAN_DEL; /* queue is deleted */ } return 0; /* queue isn't deleted */ } /* Callback called on flowset used to check if it has queues */ static int drain_queue_fs_cb(void *_fs, void *arg) { struct dn_fsk *fs = _fs; if (fs->fs.flags & DN_QHT_HASH) { /* Flowset has a hash table for queues */ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, drain_queue_cb, NULL); fs->drain_bucket++; } else { /* No 
hash table for this flowset, null the pointer * if the queue is deleted */ if (fs->qht) { if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) fs->qht = NULL; } } return 0; } /* Called every tick, try to delete a 'bucket' of queue */ void dn_drain_queue(void) { /* scan a bucket of flowset */ dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, drain_queue_fs_cb, NULL); dn_cfg.drain_fs++; } /* * Handler for the various dummynet socket options */ static int ip_dn_ctl(struct sockopt *sopt) { void *p = NULL; int error, l; error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); if (error) return (error); /* Disallow sets in really-really secure mode. */ if (sopt->sopt_dir == SOPT_SET) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } switch (sopt->sopt_name) { default : D("dummynet: unknown option %d", sopt->sopt_name); error = EINVAL; break; case IP_DUMMYNET_FLUSH: case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: /* remove a pipe or queue */ case IP_DUMMYNET_GET: D("dummynet: compat option %d", sopt->sopt_name); error = ip_dummynet_compat(sopt); break; case IP_DUMMYNET3 : if (sopt->sopt_dir == SOPT_GET) { error = dummynet_get(sopt, NULL); break; } l = sopt->sopt_valsize; if (l < sizeof(struct dn_id) || l > 12000) { D("argument len %d invalid", l); break; } p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? error = sooptcopyin(sopt, p, l, l); if (error) break ; error = do_config(p, l); break; } if (p != NULL) free(p, M_TEMP); return error ; } static void ip_dn_init(void) { if (dn_cfg.init_done) return; printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); dn_cfg.init_done = 1; /* Set defaults here. MSVC does not accept initializers, * and this is also useful for vimages */ /* queue limits */ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ dn_cfg.byte_limit = 1024 * 1024; dn_cfg.expire = 1; /* RED parameters */ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ /* hash tables */ dn_cfg.max_hash_size = 65536; /* max in the hash tables */ dn_cfg.hash_size = 64; /* default hash size */ /* create hash tables for schedulers and flowsets. * In both we search by key and by pointer. */ dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, offsetof(struct dn_schk, schk_next), schk_hash, schk_match, schk_new); dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, offsetof(struct dn_fsk, fsk_next), fsk_hash, fsk_match, fsk_new); /* bucket index to drain object */ dn_cfg.drain_fs = 0; dn_cfg.drain_sch = 0; heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); SLIST_INIT(&dn_cfg.fsu); SLIST_INIT(&dn_cfg.schedlist); DN_LOCK_INIT(); TASK_INIT(&dn_task, 0, dummynet_task, curvnet); dn_tq = taskqueue_create_fast("dummynet", M_WAITOK, taskqueue_thread_enqueue, &dn_tq); taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); callout_init(&dn_timeout, 1); dn_reschedule(); /* Initialize curr_time adjustment mechanics. 
*/ getmicrouptime(&dn_cfg.prev_t); } static void ip_dn_destroy(int last) { DN_BH_WLOCK(); /* ensure no more callouts are started */ dn_gone = 1; /* check for last */ if (last) { ND("removing last instance\n"); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; } dummynet_flush(); DN_BH_WUNLOCK(); callout_drain(&dn_timeout); taskqueue_drain(dn_tq, &dn_task); taskqueue_free(dn_tq); dn_ht_free(dn_cfg.schedhash, 0); dn_ht_free(dn_cfg.fshash, 0); heap_free(&dn_cfg.evheap); DN_LOCK_DESTROY(); } static int dummynet_modevent(module_t mod, int type, void *data) { if (type == MOD_LOAD) { if (ip_dn_io_ptr) { printf("DUMMYNET already loaded\n"); return EEXIST ; } ip_dn_init(); ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; return 0; } else if (type == MOD_UNLOAD) { ip_dn_destroy(1 /* last */); return 0; } else return EOPNOTSUPP; } /* modevent helpers for the modules */ static int load_dn_sched(struct dn_alg *d) { struct dn_alg *s; if (d == NULL) return 1; /* error */ ip_dn_init(); /* just in case, we need the lock */ /* Check that mandatory funcs exists */ if (d->enqueue == NULL || d->dequeue == NULL) { D("missing enqueue or dequeue for %s", d->name); return 1; } /* Search if scheduler already exists */ DN_BH_WLOCK(); SLIST_FOREACH(s, &dn_cfg.schedlist, next) { if (strcmp(s->name, d->name) == 0) { D("%s already loaded", d->name); break; /* scheduler already exists */ } } if (s == NULL) SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); DN_BH_WUNLOCK(); D("dn_sched %s %sloaded", d->name, s ? "not ":""); return s ? 1 : 0; } static int unload_dn_sched(struct dn_alg *s) { struct dn_alg *tmp, *r; int err = EINVAL; ND("called for %s", s->name); DN_BH_WLOCK(); SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { if (strcmp(s->name, r->name) != 0) continue; ND("ref_count = %d", r->ref_count); err = (r->ref_count != 0) ? EBUSY : 0; if (err == 0) SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); break; } DN_BH_WUNLOCK(); D("dn_sched %s %sunloaded", s->name, err ? "not ":""); return err; } int dn_sched_modevent(module_t mod, int cmd, void *arg) { struct dn_alg *sch = arg; if (cmd == MOD_LOAD) return load_dn_sched(sch); else if (cmd == MOD_UNLOAD) return unload_dn_sched(sch); else return EINVAL; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL }; -#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN +#define DN_SI_SUB SI_SUB_FW #define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); MODULE_DEPEND(dummynet, ipfw, 3, 3, 3); MODULE_VERSION(dummynet, 3); /* * Starting up. Done in order after dummynet_modevent() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ //VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); /* * Shutdown handlers up shop. These are done in REVERSE ORDER, but still * after dummynet_modevent() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. 
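 */

The DECLARE_MODULE() above is where this revision changes behaviour: DN_SI_SUB
moves from SI_SUB_PROTO_IFATTACHDOMAIN to SI_SUB_FW, so dummynet now
initializes in the same subsystem as ipfw, with DN_MODEV_ORD keeping it
ordered after ipfw inside that subsystem. A minimal skeleton of the same
declaration pattern for a hypothetical module (all the "example" names are
invented, and the order constant is illustrative):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kernel.h>
	#include <sys/module.h>

	static int
	example_modevent(module_t mod, int type, void *unused)
	{
		switch (type) {
		case MOD_LOAD:
		case MOD_UNLOAD:
			return (0);	/* setup/teardown would go here */
		default:
			return (EOPNOTSUPP);
		}
	}

	static moduledata_t example_mod = {
		"example", example_modevent, NULL
	};
	/* same subsystem as the firewall, ordered after it */
	DECLARE_MODULE(example, example_mod, SI_SUB_FW, SI_ORDER_ANY);
	MODULE_DEPEND(example, ipfw, 3, 3, 3);

/*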
*/ //VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); /* end of file */ Index: projects/vnet/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- projects/vnet/sys/netpfil/ipfw/ip_fw2.c (revision 294565) +++ projects/vnet/sys/netpfil/ipfw/ip_fw2.c (revision 294566) @@ -1,2915 +1,2915 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. 
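 */

The VNET_DEFINE()/VNET() pairs below give each vnet its own instance of a
global: the variable lands in a per-virtual-network-stack linker set, and
every access goes through the V_ accessor so the copy belonging to the
current vnet is used. The idiom, shown with an invented knob name:

	static VNET_DEFINE(int, example_knob);
	#define	V_example_knob		VNET(example_knob)

	/* later, in code running with a vnet context set: */
	/*	if (V_example_knob != 0) ...		     */

/*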
*/ static VNET_DEFINE(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) static VNET_DEFINE(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; #ifndef LINEAR_SKIPTO static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) #else static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) #endif /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, 
"Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != 
'\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct nhop4_basic nh4; if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0) return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ifp != nh4.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; #endif /* __FreeBSD__ */ } #ifdef INET6 /* * ipv6 specific rules here... 
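 */

icmp6type_match() below treats cmd->d[] as, in effect, a 256-bit set indexed
by ICMPv6 type: word type/32, bit type%32. The same structure in a standalone
sketch:

	#include <stdint.h>
	#include <stdio.h>

	static void
	set_type(uint32_t set[8], int type)
	{
		set[type / 32] |= 1u << (type % 32);
	}

	static int
	has_type(const uint32_t set[8], int type)
	{
		return ((set[type / 32] & (1u << (type % 32))) != 0);
	}

	int
	main(void)
	{
		uint32_t set[8] = { 0 };

		set_type(set, 128);	/* ICMP6_ECHO_REQUEST */
		set_type(set, 135);	/* neighbor solicitation */
		printf("128: %d, 129: %d\n",
		    has_type(set, 128), has_type(set, 129));
		return (0);
	}

/*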
*/ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static const struct in6_addr lla_mask = {{{ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}; static int ipfw_localip6(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_MULTICAST(in6)) return (0); if (!IN6_IS_ADDR_LINKLOCAL(in6)) return (in6_localip(in6)); IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, in6, &lla_mask)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct nhop6_basic nh6; if (IN6_IS_SCOPE_LINKLOCAL(src)) return (1); if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0) return (0); /* If ifp is provided, check for equality with route table. */ if (ifp != NULL && ifp != nh6.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. 
*/ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; struct ifnet *oif; int lookupflags; int match; id = &args->f_id; inp = args->inp; oif = args->oif; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (oif == NULL) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, oif, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, oif, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (oif == NULL) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, oif, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, oif, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. 
*/ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; } #ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->cached_pos), * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TARG && f->cached_id == chain->id) f_pos = f->cached_pos; else { int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; if (chain->idxmap != NULL) f_pos = chain->idxmap[i]; else f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TARG) { f->cached_id = chain->id; f->cached_pos = f_pos; } } return (f_pos); } #else /* * Helper function to enable real fast rule lookups. */ static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; num = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && num <= f->rulenum) num = f->rulenum + 1; f_pos = chain->idxmap[num]; return (f_pos); } #endif #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. 
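 */

The return values listed above are what the packet-filter hooks key on after
calling ipfw_chk(). A sketch of the dispatch (the enum values here are
stand-ins; the real IP_FW_* constants live in the private ipfw headers):

	#include <stdio.h>

	enum { FW_PASS, FW_DENY, FW_DUMMYNET };	/* stand-in verdicts */

	static void
	dispatch(int verdict)
	{
		switch (verdict) {
		case FW_PASS:
			printf("accept the packet\n");
			break;
		case FW_DUMMYNET:
			printf("queue into dummynet, pipe in the cookie\n");
			break;
		case FW_DENY:
		default:
			printf("drop the packet\n");
			break;
		}
	}

	int
	main(void)
	{
		dispatch(FW_PASS);
		return (0);
	}

/*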
* */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; int f_pos = 0; /* index of current rule in the array */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragement (fragement header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ uint16_t iplen=0; int pktlen; uint16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. 
*/ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define PULLUP_LEN(_len, p, T) \ do { \ int x = (_len) + T; \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (_len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag 
*)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); pktlen = iplen < pktlen ? iplen : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->rule.slot) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) 
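The extension-header loop above keeps advancing hlen and proto until it lands on an upper-layer header, recording each header type seen in the ext_hd bit vector. A simplified flat-buffer sketch of the same walk, with no mbufs or pullups and only the options/routing/fragment header kinds handled with their RFC 2460 length rules (a sketch, not the kernel code):

#include <stdint.h>
#include <stddef.h>

/* Protocol numbers used below (same values as IPPROTO_*). */
enum { HOPOPTS = 0, ROUTING = 43, FRAGMENT = 44, DSTOPTS = 60 };

/*
 * Walk extension headers starting at 'off' (normally 40, right after
 * the fixed IPv6 header). 'first' is the ip6_nxt value. Returns the
 * final upper-layer protocol, or -1 if the walk runs past 'len'.
 */
static int walk_ext_headers(const uint8_t *pkt, size_t len, size_t off, int first)
{
        int proto = first;

        for (;;) {
                size_t hdrlen;
                switch (proto) {
                case HOPOPTS: case ROUTING: case DSTOPTS:
                        if (off + 2 > len)
                                return -1;
                        /* options-style headers: (len + 1) * 8 octets */
                        hdrlen = ((size_t)pkt[off + 1] + 1) * 8;
                        break;
                case FRAGMENT:
                        hdrlen = 8;     /* fixed size, RFC 2460 */
                        break;
                default:
                        return proto;   /* upper-layer protocol found */
                }
                if (off + hdrlen > len)
                        return -1;
                proto = pkt[off];       /* next-header field comes first */
                off += hdrlen;
        }
}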
* Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(oif ? 
oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: { /* For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ uint32_t i = args->rule.info; match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); } break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t key = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v = 0; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* generic lookup. The key must be * in 32bit big-endian format. */ v = ((ipfw_insn_u32 *)cmd)->d[1]; if (v == 0) key = dst_ip.s_addr; else if (v == 1) key = src_ip.s_addr; else if (v == 6) /* dscp */ key = (ip->ip_tos >> 2) & 0x3f; else if (offset != 0) break; else if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) break; else if (v == 2) key = dst_port; else if (v == 3) key = src_port; #ifndef USERSPACE else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ } #endif /* !USERSPACE */ else break; } match = ipfw_lookup_table(chain, cmd->arg1, key, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } else if (is_ipv6) { uint32_t v = 0; void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? &args->f_id.dst_ip6: &args->f_id.src_ip6; match = ipfw_lookup_table_extended(chain, cmd->arg1, sizeof(struct in6_addr), pkey, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) tablearg = v; } break; case O_IP_FLOW_LOOKUP: { uint32_t v = 0; match = ipfw_lookup_table_extended(chain, cmd->arg1, 0, &args->f_id, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? 
dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match= is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match= is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x > 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = iplen - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = 
(proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, m, oif, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? 
args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inverted later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try to match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For an incoming packet, look up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule.
* These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_install_state(chain, f, (ipfw_insn_limit *)cmd, args, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule by setting * f, cmd, l and clearing cmdlen. */ IPFW_INC_DYN_COUNTER(q, pktlen); /* XXX we would like to have f_pos * readily accessible in the dynamic * rule, instead of having to * lookup q->rule. */ f = q->rule; f_pos = ipfw_find_rule(chain, f->rulenum, f->id); cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; ipfw_dyn_unlock(q); cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). 
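The O_SKIPTO handling above re-enters the outer loop only after stepping past any rule whose set is disabled. A small sketch of that scan, assuming the same 32-bit set_disable mask and a simplified rule array:

#include <stdint.h>

struct rule { int rulenum; unsigned set; };     /* simplified layout */

/*
 * Starting at 'from', return the index of the first rule whose set is
 * enabled, clamped to the last rule (ipfw's default rule, which can
 * never be disabled).
 */
static int skip_disabled(const struct rule *map, int n_rules,
    int from, uint32_t set_disable)
{
        int i = from;

        while (i < n_rules - 1 && (set_disable & (1u << map[i].set)))
                i++;
        return i;
}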
* We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { #ifdef INET6 /* * We use O_FORWARD_IP opcode for * fwd rule with tablearg, but tables * now support IPv6 addresses. And * when we are inspecting IPv6 packet, * we can use nh6 field from * table_value as next_hop6 address. */ if (is_ipv6) { struct sockaddr_in6 *sa6; sa6 = args->next_hop6 = &args->hopstore6; sa6->sin6_family = AF_INET6; sa6->sin6_len = sizeof(*sa6); sa6->sin6_addr = TARG_VAL( chain, tablearg, nh6); /* * Set sin6_scope_id only for * link-local unicast addresses. 
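The call/return machinery above keeps its stack in an mbuf tag, with m_tag_id serving as the stack pointer into the uint16_t slots that follow the tag. A self-contained sketch of the same discipline over a plain struct (the stack depth here is an assumed value; IPFW_CALLSTACK_SIZE itself is not shown in this hunk):

#include <stdint.h>
#include <stdio.h>

#define CALLSTACK_SIZE 16               /* assumed depth */

struct callstack {
        uint16_t slot[CALLSTACK_SIZE];  /* caller rule numbers */
        int      sp;                    /* plays the role of m_tag_id */
};

/* Returns 0 on success, -1 on overflow (caller falls through to next rule). */
static int rule_call(struct callstack *cs, uint16_t caller_rulenum) {
        if (cs->sp >= CALLSTACK_SIZE)
                return -1;
        cs->slot[cs->sp++] = caller_rulenum;
        return 0;
}

/* Returns the rule number to resume at, or -1 on an empty stack. */
static int rule_return(struct callstack *cs) {
        if (cs->sp == 0)
                return -1;
        return cs->slot[--cs->sp] + 1;  /* resume at caller + 1, as in the source */
}

int main(void) {
        struct callstack cs = { .sp = 0 };
        rule_call(&cs, 100);            /* rule 100 calls a subroutine */
        printf("return resumes at rule %d\n", rule_return(&cs));  /* 101 */
        return 0;
}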
*/ if (IN6_IS_ADDR_LINKLOCAL( &sa6->sin6_addr)) sa6->sin6_scope_id = TARG_VAL(chain, tablearg, zoneid); } else #endif { sa = args->next_hop = &args->hopstore; sa->sin_family = AF_INET; sa->sin_len = sizeof(*sa); sa->sin_addr.s_addr = htonl( TARG_VAL(chain, tablearg, nh4)); } } else { args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t old; old = *(uint16_t *)ip; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ if (!IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == 0) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* in any case exit inner loop */ ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. 
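O_SETDSCP above repairs ip_sum with cksum_adjust() instead of recomputing the whole header checksum. A sketch of the underlying RFC 1624 incremental update, HC' = ~(~HC + ~m + m'), for a single replaced 16-bit word (a hypothetical helper, not the kernel's cksum_adjust):

#include <stdint.h>

/*
 * Incrementally update a 16-bit one's-complement checksum when the
 * 16-bit word 'oldw' in the covered data is replaced by 'neww' (all
 * values in network byte order). RFC 1624, eqn. 3.
 */
static uint16_t cksum_adjust16(uint16_t cksum, uint16_t oldw, uint16_t neww)
{
        uint32_t sum;

        sum = (uint32_t)(~cksum & 0xffff) + (~oldw & 0xffff) + neww;
        sum = (sum & 0xffff) + (sum >> 16);     /* fold carries */
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}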
*/ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_log_bpf(1); /* init */ ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_iface_destroy(); ipfw_log_bpf(0); /* uninit */ ipfw_destroy_sopt_handler(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; first = IS_DEFAULT_VNET(curvnet) ? 
1 : 0; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* Init shared services hash table */ ipfw_init_srv(chain); ipfw_init_obj_rewriter(); ipfw_init_counters(); /* insert the default rule and create the initial map */ chain->n_rules = 1; chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } /* fill and insert the default rule */ rule->act_ofs = 0; rule->rulenum = IPFW_DEFAULT_RULE; rule->cmd_len = 1; rule->set = RESVD_SET; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->default_rule = chain->map[0] = rule; chain->id = rule->id = 1; /* Pre-calculate rules length for legacy dump format */ chain->static_len = sizeof(struct ip_fw_rule0); IPFW_LOCK_INIT(chain); ipfw_dyn_init(chain); #ifdef LINEAR_SKIPTO ipfw_init_skipto_cache(chain); #endif /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(1); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ (void)ipfw_attach_hooks(0 /* detach */); V_ip_fw_ctl_ptr = NULL; last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_UH_WLOCK(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) ipfw_reap_add(chain, &reap, chain->map[i]); free(chain->map, M_IPFW); #ifdef LINEAR_SKIPTO ipfw_destroy_skipto_cache(chain); #endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_destroy_tables(chain, last); if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); ipfw_destroy_obj_rewriter(); return (0); } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. 
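vnet_ipfw_uninit() above first clears V_ipfw_vnet_ready, then takes and immediately drops the update lock so that any packet-path thread which slipped past the flag check has provably left before teardown proceeds. A userland analogue of that drain using pthread rwlocks (illustrative only; note the flag is also re-checked under the read lock, just as ipfw_chk() re-checks V_ipfw_vnet_ready after IPFW_PF_RLOCK):

#include <pthread.h>
#include <stdatomic.h>

static pthread_rwlock_t chain_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int vnet_ready = 1;

/* Packet path: cheap early bail, then re-check under the read lock. */
void packet_path(void) {
        if (!atomic_load(&vnet_ready))
                return;
        pthread_rwlock_rdlock(&chain_lock);
        if (atomic_load(&vnet_ready)) {
                /* ... match packet against shared rule chain ... */
        }
        pthread_rwlock_unlock(&chain_lock);
}

/* Teardown: flag first, then drain readers before freeing state. */
void teardown(void) {
        atomic_store(&vnet_ready, 0);        /* new callers go away */
        pthread_rwlock_wrlock(&chain_lock);  /* blocks until readers leave */
        pthread_rwlock_unlock(&chain_lock);  /* pre-flag readers are now out */
        /* ... safe to tear down, retaking the lock as needed ... */
}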
I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ -#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_SI_SUB_FIREWALL SI_SUB_FW #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits, * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: projects/vnet/sys/netpfil/ipfw/ip_fw_nat.c =================================================================== --- projects/vnet/sys/netpfil/ipfw/ip_fw_nat.c (revision 294565) +++ projects/vnet/sys/netpfil/ipfw/ip_fw_nat.c (revision 294566) @@ -1,1236 +1,1236 @@ /*- * Copyright (c) 2008 Paolo Pisati * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for in_cksum */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; uint16_t port; }; /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ uint16_t mode; /* type of redirect mode */ uint16_t proto; /* protocol: tcp/udp */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ uint16_t lport; /* local port */ uint16_t pport; /* public port */ uint16_t rport; /* remote port */ uint16_t pport_cnt; /* number of public ports */ uint16_t rport_cnt; /* number of remote ports */ struct alias_link **alink; u_int16_t spool_cnt; /* num of entry in spool chain */ /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; /* Nat configuration data struct. */ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ struct libalias *lib; /* libalias instance */ int mode; /* aliasing mode */ int redir_cnt; /* number of entry in spool chain */ /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; char if_name[IF_NAMESIZE]; /* interface name */ }; static eventhandler_tag ifaddr_event_tag; static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; struct ip_fw_chain *chain; KASSERT(curvnet == ifp->if_vnet, ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet)); chain = &V_layer3_chain; IPFW_UH_WLOCK(chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) continue; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; IPFW_WLOCK(chain); ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); IPFW_WUNLOCK(chain); } if_addr_runlock(ifp); } IPFW_UH_WUNLOCK(chain); } /* * delete the pointers for nat entry ix, or all of them if ix < 0 */ static void flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { int i; ipfw_insn_nat *cmd; IPFW_WLOCK_ASSERT(chain); for (i = 0; i < chain->n_rules; i++) { cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); /* XXX skip log and the like ? */ if (cmd->o.opcode == O_NAT && cmd->nat != NULL && (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. 
*/ switch (r->mode) { case NAT44_REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case NAT44_REDIR_ADDR: case NAT44_REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? */ break; } } } static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct nat44_cfg_redir *)&buf[off]; r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO); r->mode = ser_r->mode; r->laddr = ser_r->laddr; r->paddr = ser_r->paddr; r->raddr = ser_r->raddr; r->lport = ser_r->lport; r->pport = ser_r->pport; r->rport = ser_r->rport; r->pport_cnt = ser_r->pport_cnt; r->rport_cnt = ser_r->rport_cnt; r->proto = ser_r->proto; r->spool_cnt = ser_r->spool_cnt; //memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += sizeof(struct nat44_cfg_redir); r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case NAT44_REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case NAT44_REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case NAT44_REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } if (r->alink[0] == NULL) { printf("LibAliasRedirect* returned NULL\n"); free(r->alink, M_IPFW); free(r, M_IPFW); return (EINVAL); } /* LSNAT handling. */ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct nat44_cfg_spool *)&buf[off]; s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO); s->addr = ser_s->addr; s->port = ser_s->port; LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += sizeof(struct nat44_cfg_spool); /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (0); } static void free_nat_instance(struct cfg_nat *ptr) { del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } /* * ipfw_nat - perform mbuf header translation. * * Note V_layer3_chain has to be locked while calling ipfw_nat() in * 'global' operation mode (t == NULL). * */ static int ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) { struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ int ldt, retval, found; struct ip_fw_chain *chain; char *c; ldt = 0; retval = 0; mcl = m_megapullup(m, m->m_pkthdr.len); if (mcl == NULL) { args->m = NULL; return (IP_FW_DENY); } ip = mtod(mcl, struct ip *); /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only pseudo-header checksum * calculated and libalias will break it[1], so mark them for * later fix. 
Moreover there are cases when libalias modifies * tcp packet data[2], mark them for later fix too. * * [1] libalias was never meant to run in kernel, so it does * not have any knowledge about checksum offloading, and * expects a packet with a full internet checksum. * Unfortunately, packets generated locally will have just the * pseudo header calculated, and when libalias tries to adjust * the checksum it will actually compute a wrong value. * * [2] when libalias modifies tcp's data content, full TCP * checksum has to be recomputed: the problem is that * libalias does not have any idea about checksum offloading. * To work around this, we do not do checksumming in LibAlias, * but only mark the packets in th_x2 field. If we receive a * marked packet, we calculate correct checksum for it * aware of offloading. Why such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will hide ALL * transmission errors. Yes, marked packets still suffer from * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); /* Check if this is 'global' instance */ if (t == NULL) { if (args->oif == NULL) { /* Wrong direction, skip processing */ args->m = mcl; return (IP_FW_NAT); } found = 0; chain = &V_layer3_chain; IPFW_RLOCK_ASSERT(chain); /* Check every nat entry... */ LIST_FOREACH(t, &chain->nat, _next) { if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) continue; retval = LibAliasOutTry(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl), 0); if (retval == PKT_ALIAS_OK) { /* Nat instance recognises state */ found = 1; break; } } if (found != 1) { /* No instance found, return ignore */ args->m = mcl; return (IP_FW_NAT); } } else { if (args->oif == NULL) retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); } /* * We drop packet when: * 1. libalias returns PKT_ALIAS_ERROR; * 2. For incoming packets: * a) for unresolved fragments; * b) libalias returns PKT_ALIAS_IGNORED and * PKT_ALIAS_DENY_INCOMING flag is set. */ if (retval == PKT_ALIAS_ERROR || (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (retval == PKT_ALIAS_IGNORED && (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } if (retval == PKT_ALIAS_RESPOND) mcl->m_flags |= M_SKIP_FIREWALL; mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (th->th_x2) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; uint16_t ip_len, cksum; ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... 
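The fix-up described above seeds the transport checksum with the IPv4 pseudo-header sum (the in_pseudo() call earlier in this function) and lets in_delayed_cksum() fold in the payload. As an assumption-level illustration of what that seed contains, here is a by-hand computation of the pseudo-header partial sum (fields in network byte order; a sketch, not the kernel's in_pseudo):

#include <stdint.h>

/*
 * One's-complement partial sum of the IPv4 pseudo-header:
 * src addr + dst addr + (proto + transport length). The result is
 * deliberately not inverted; the transport checksum routine folds the
 * payload into this seed and inverts at the very end.
 */
static uint16_t pseudo_hdr_sum(uint32_t src, uint32_t dst, uint16_t proto_len)
{
        uint64_t sum = 0;

        sum += src & 0xffff; sum += src >> 16;
        sum += dst & 0xffff; sum += dst >> 16;
        sum += proto_len;
        while (sum >> 16)                       /* fold carries */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}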
*/ th->th_x2 = 0; th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* No hw checksum offloading: do it ourselves */ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } args->m = mcl; return (IP_FW_NAT); } static struct cfg_nat * lookup_nat(struct nat_list *l, int nat_id) { struct cfg_nat *res; LIST_FOREACH(res, l, _next) { if (res->id == nat_id) break; } return res; } static struct cfg_nat * lookup_nat_name(struct nat_list *l, char *name) { struct cfg_nat *res; int id; char *errptr; id = strtol(name, &errptr, 10); if (id == 0 || *errptr != '\0') return (NULL); LIST_FOREACH(res, l, _next) { if (res->id == id) break; } return (res); } /* IP_FW3 configuration routines */ static void nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg) { struct cfg_nat *ptr, *tcfg; int gencnt; /* * Find/create nat rule. */ IPFW_UH_WLOCK(chain); gencnt = chain->gencnt; ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO); ptr->lib = LibAliasInit(NULL); LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarily unhook it. */ IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Basic nat (re)configuration. */ ptr->id = strtol(ucfg->name, NULL, 10); /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ucfg->ip; ptr->redir_cnt = ucfg->redir_cnt; ptr->mode = ucfg->mode; strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name)); LibAliasSetMode(ptr->lib, ptr->mode, ~0); LibAliasSetAddress(ptr->lib, ptr->ip); /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. */ add_redir_spool_cfg((char *)(ucfg + 1), ptr); IPFW_UH_WLOCK(chain); /* Extra check to avoid race with another ipfw_nat_cfg() */ tcfg = NULL; if (gencnt != chain->gencnt) tcfg = lookup_nat_name(&chain->nat, ucfg->name); IPFW_WLOCK(chain); if (tcfg != NULL) LIST_REMOVE(tcfg, _next); LIST_INSERT_HEAD(&chain->nat, ptr, _next); IPFW_WUNLOCK(chain); chain->gencnt++; IPFW_UH_WUNLOCK(chain); if (tcfg != NULL) free_nat_instance(ptr); } /* * Creates/configure nat44 instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. 
] * * Returns 0 on success */ static int nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; int id; size_t read; char *errptr; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg))) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated and looks like number */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); id = strtol(ucfg->name, &errptr, 10); if (id == 0 || *errptr != '\0') return (EINVAL); read = sizeof(*oh) + sizeof(*ucfg); /* Check number of redirs */ if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir)) return (EINVAL); nat44_config(chain, ucfg); return (0); } /* * Destroys given nat instances. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct cfg_nat *ptr; ipfw_obj_ntlv *ntlv; /* Check minimum header size */ if (sd->valsize < sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ntlv = &oh->ntlv; /* Check if name is properly terminated */ if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name)) return (EINVAL); IPFW_UH_WLOCK(chain); ptr = lookup_nat_name(&chain->nat, ntlv->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (ESRCH); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static void export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg) { snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id); ucfg->ip = ptr->ip; ucfg->redir_cnt = ptr->redir_cnt; ucfg->mode = ptr->mode; strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name)); } /* * Gets config for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success */ static int nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat); LIST_FOREACH(r, &ptr->redir_chain, _next) { sz += sizeof(struct nat44_cfg_redir); LIST_FOREACH(s, &r->spool_chain, _next) sz += sizeof(struct nat44_cfg_spool); } ucfg->size = sz; if (sd->valsize < sz) { /* * Submitted buffer size is not enough. 
* We've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* Size OK, let's copy data */ LIST_FOREACH(r, &ptr->redir_chain, _next) { ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd, sizeof(*ser_r)); ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space( sd, sizeof(*ser_s)); ser_s->addr = s->addr; ser_s->port = s->port; } } IPFW_UH_RUNLOCK(chain); return (0); } /* * Lists all nat44 instances currently available in the kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ] * * Returns 0 on success */ static int nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; int nat_count; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(chain); nat_count = 0; LIST_FOREACH(ptr, &chain->nat, _next) nat_count++; olh->count = nat_count; olh->objsize = sizeof(struct nat44_cfg_nat); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } LIST_FOREACH(ptr, &chain->nat, _next) { ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd, sizeof(*ucfg)); export_nat_cfg(ptr, ucfg); } IPFW_UH_RUNLOCK(chain); return (0); } /* * Gets log for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat ] * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ] * * Returns 0 on success */ static int nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; void *pbuf; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } if (ptr->lib->logDesc == NULL) { IPFW_UH_RUNLOCK(chain); return (ENOENT); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE; if (sd->valsize < sz + sizeof(*oh)) { /* * Submitted buffer size is not enough. * We've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically.
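nat44_get_cfg() and nat44_get_log() share the same sockopt contract: estimate the exact reply size, store it in a header that fits even in an undersized buffer, and return ENOMEM so userland can retry with a big enough buffer. A minimal sketch of that two-step export (names are hypothetical):

#include <errno.h>
#include <stddef.h>
#include <string.h>

struct reply_hdr { size_t size; };      /* always fits; carries needed size */

/*
 * Export 'payload_len' bytes of 'payload' into 'buf' of 'buflen' bytes,
 * prefixed by a header reporting the full required size. On a short
 * buffer only the header is filled, so the caller can resize and retry.
 */
static int export_obj(void *buf, size_t buflen,
    const void *payload, size_t payload_len)
{
        struct reply_hdr *h = buf;
        size_t need = sizeof(*h) + payload_len;

        if (buflen < sizeof(*h))
                return EINVAL;          /* can't even report the size */
        h->size = need;
        if (buflen < need)
                return ENOMEM;          /* caller retries with h->size bytes */
        memcpy(h + 1, payload, payload_len);
        return 0;
}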
*/ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE); memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE); IPFW_UH_RUNLOCK(chain); return (0); } static struct ipfw_sopt_handler scodes[] = { { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg }, { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy }, { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg }, { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat }, { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log }, }; /* * Legacy configuration routines */ struct cfg_spool_legacy { LIST_ENTRY(cfg_spool_legacy) _next; struct in_addr addr; u_short port; }; struct cfg_redir_legacy { LIST_ENTRY(cfg_redir) _next; u_int16_t mode; struct in_addr laddr; struct in_addr paddr; struct in_addr raddr; u_short lport; u_short pport; u_short rport; u_short pport_cnt; u_short rport_cnt; int proto; struct alias_link **alink; u_int16_t spool_cnt; LIST_HEAD(, cfg_spool_legacy) spool_chain; }; struct cfg_nat_legacy { LIST_ENTRY(cfg_nat_legacy) _next; int id; struct in_addr ip; char if_name[IF_NAMESIZE]; int mode; struct libalias *lib; int redir_cnt; LIST_HEAD(, cfg_redir_legacy) redir_chain; }; static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat_legacy *cfg; struct nat44_cfg_nat *ucfg; struct cfg_redir_legacy *rdir; struct nat44_cfg_redir *urdir; char *buf; size_t len, len2; int error, i; len = sopt->sopt_valsize; len2 = len + 128; /* * Allocate 2x buffer to store converted structures. * The new redir_cfg has shrunk, so we're sure that * the new buffer size is enough. */ buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy)); if (error != 0) goto out; cfg = (struct cfg_nat_legacy *)buf; if (cfg->id < 0) { error = EINVAL; goto out; } ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)]; snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id); strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name)); ucfg->ip = cfg->ip; ucfg->mode = cfg->mode; ucfg->redir_cnt = cfg->redir_cnt; if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) { error = EINVAL; goto out; } urdir = (struct nat44_cfg_redir *)(ucfg + 1); rdir = (struct cfg_redir_legacy *)(cfg + 1); for (i = 0; i < cfg->redir_cnt; i++) { urdir->mode = rdir->mode; urdir->laddr = rdir->laddr; urdir->paddr = rdir->paddr; urdir->raddr = rdir->raddr; urdir->lport = rdir->lport; urdir->pport = rdir->pport; urdir->rport = rdir->rport; urdir->pport_cnt = rdir->pport_cnt; urdir->rport_cnt = rdir->rport_cnt; urdir->proto = rdir->proto; urdir->spool_cnt = rdir->spool_cnt; urdir++; rdir++; } nat44_config(&V_layer3_chain, ucfg); out: free(buf, M_TEMP); return (error); } static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; int i; sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (EINVAL); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static int ipfw_nat_get_cfg(struct sockopt *sopt) { struct ip_fw_chain *chain = &V_layer3_chain; struct cfg_nat *n; struct cfg_nat_legacy *ucfg; struct cfg_redir *r; struct cfg_spool *s; struct cfg_redir_legacy *ser_r; struct cfg_spool_legacy *ser_s; char *data; int gencnt, nat_cnt, len, error; nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); retry: gencnt = chain->gencnt; /* 
Estimate memory amount */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) len += sizeof(struct cfg_spool_legacy); } } IPFW_UH_RUNLOCK(chain); data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); bcopy(&nat_cnt, data, sizeof(nat_cnt)); nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); if (gencnt != chain->gencnt) { free(data, M_TEMP); goto retry; } /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { ucfg = (struct cfg_nat_legacy *)&data[len]; ucfg->id = n->id; ucfg->ip = n->ip; ucfg->redir_cnt = n->redir_cnt; ucfg->mode = n->mode; strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name)); len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { ser_r = (struct cfg_redir_legacy *)&data[len]; ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct cfg_spool_legacy *)&data[len]; ser_s->addr = s->addr; ser_s->port = s->port; len += sizeof(struct cfg_spool_legacy); } } } IPFW_UH_RUNLOCK(chain); error = sooptcopyout(sopt, data, len); free(data, M_TEMP); return (error); } static int ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; IPFW_RLOCK(chain); /* one pass to count, one to copy the data */ i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; i++; } size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(chain); return (ENOSPC); } i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); } static int vnet_ipfw_nat_init(const void *arg __unused) { V_ipfw_nat_ready = 1; return (0); } static int vnet_ipfw_nat_uninit(const void *arg __unused) { struct cfg_nat *ptr, *ptr_temp; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_WLOCK(chain); LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); free_nat_instance(ptr); } flush_nat_ptrs(chain, -1 /* flush all */); V_ipfw_nat_ready = 0; IPFW_WUNLOCK(chain); return (0); } static void ipfw_nat_init(void) { /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_ADD_SOPT_HANDLER(1, scodes); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); /* deregister ipfw_nat */ IPFW_DEL_SOPT_HANDLER(1, scodes); ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; ipfw_nat_del_ptr = NULL; ipfw_nat_get_cfg_ptr = NULL; ipfw_nat_get_log_ptr = NULL; } static int ipfw_nat_modevent(module_t mod, int type, void *unused) { 
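	/*
	 * Nothing happens at module load/unload time: the real setup and
	 * teardown are driven by the SYSINIT/VNET_SYSINIT handlers
	 * declared at the bottom of this file, so the handler merely
	 * acknowledges load and unload events.
	 */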
int err = 0; switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfw_nat_mod = { "ipfw_nat", ipfw_nat_modevent, 0 }; /* Define startup order. */ -#define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_FW #define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ #define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1) #define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2) DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3); MODULE_VERSION(ipfw_nat, 1); SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_init, NULL); VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_init, NULL); SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL); /* end of file */ Index: projects/vnet/sys/netpfil/pf/if_pfsync.c =================================================================== --- projects/vnet/sys/netpfil/pf/if_pfsync.c (revision 294565) +++ projects/vnet/sys/netpfil/pf/if_pfsync.c (revision 294566) @@ -1,2421 +1,2422 @@ /*- * Copyright (c) 2002 Michael Shalayeff * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2009 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ /* * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $ * * Revisions picked from OpenBSD after revision 1.110 import: * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input() * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates * 1.120, 1.175 - use monotonic time_uptime * 1.122 - reduce number of updates for non-TCP sessions * 1.125, 1.127 - rewrite merge or stale processing * 1.128 - cleanups * 1.146 - bzero() mbuf before sparsely filling it with data * 1.170 - SIOCSIFMTU checks * 1.126, 1.142 - deferred packets processing * 1.173 - correct expire time processing */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PFSYNC_MINPKT ( \ sizeof(struct ip) + \ sizeof(struct pfsync_header) + \ sizeof(struct pfsync_subheader) ) struct pfsync_pkt { struct ip *ip; struct in_addr src; u_int8_t flags; }; static int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *, struct pfsync_state_peer *); static int pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int); static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = { pfsync_in_clr, /* PFSYNC_ACT_CLR */ pfsync_in_ins, /* PFSYNC_ACT_INS */ pfsync_in_iack, /* PFSYNC_ACT_INS_ACK */ pfsync_in_upd, /* PFSYNC_ACT_UPD */ pfsync_in_upd_c, /* PFSYNC_ACT_UPD_C */ pfsync_in_ureq, /* PFSYNC_ACT_UPD_REQ */ pfsync_in_del, /* PFSYNC_ACT_DEL */ pfsync_in_del_c, /* PFSYNC_ACT_DEL_C */ pfsync_in_error, /* PFSYNC_ACT_INS_F */ pfsync_in_error, /* PFSYNC_ACT_DEL_F */ pfsync_in_bus, /* PFSYNC_ACT_BUS */ pfsync_in_tdb, /* PFSYNC_ACT_TDB */ pfsync_in_eof /* PFSYNC_ACT_EOF */ }; struct pfsync_q { void (*write)(struct pf_state *, void *); size_t len; u_int8_t action; }; /* we have one of these for every PFSYNC_S_ */ static void pfsync_out_state(struct pf_state *, void *); static void pfsync_out_iack(struct pf_state *, void *); static void pfsync_out_upd_c(struct pf_state *, void *); static void pfsync_out_del(struct pf_state *, void *); static struct pfsync_q pfsync_qs[] = { { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD }, { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C } }; static void pfsync_q_ins(struct pf_state *, int); static void 
pfsync_q_del(struct pf_state *); static void pfsync_update_state(struct pf_state *); struct pfsync_upd_req_item { TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; struct pfsync_upd_req ur_msg; }; struct pfsync_deferral { struct pfsync_softc *pd_sc; TAILQ_ENTRY(pfsync_deferral) pd_entry; u_int pd_refs; struct callout pd_tmo; struct pf_state *pd_st; struct mbuf *pd_m; }; struct pfsync_softc { /* Configuration */ struct ifnet *sc_ifp; struct ifnet *sc_sync_if; struct ip_moptions sc_imo; struct in_addr sc_sync_peer; uint32_t sc_flags; #define PFSYNCF_OK 0x00000001 #define PFSYNCF_DEFER 0x00000002 #define PFSYNCF_PUSH 0x00000004 uint8_t sc_maxupdates; struct ip sc_template; struct callout sc_tmo; struct mtx sc_mtx; /* Queued data */ size_t sc_len; TAILQ_HEAD(, pf_state) sc_qs[PFSYNC_S_COUNT]; TAILQ_HEAD(, pfsync_upd_req_item) sc_upd_req_list; TAILQ_HEAD(, pfsync_deferral) sc_deferrals; u_int sc_deferred; void *sc_plus; size_t sc_pluslen; /* Bulk update info */ struct mtx sc_bulk_mtx; uint32_t sc_ureq_sent; int sc_bulk_tries; uint32_t sc_ureq_received; int sc_bulk_hashid; uint64_t sc_bulk_stateid; uint32_t sc_bulk_creatorid; struct callout sc_bulk_tmo; struct callout sc_bulkfail_tmo; }; #define PFSYNC_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define PFSYNC_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define PFSYNC_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define PFSYNC_BLOCK(sc) mtx_lock(&(sc)->sc_bulk_mtx) #define PFSYNC_BUNLOCK(sc) mtx_unlock(&(sc)->sc_bulk_mtx) #define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED) static const char pfsyncname[] = "pfsync"; static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data"); static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL; #define V_pfsyncif VNET(pfsyncif) static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL; #define V_pfsync_swi_cookie VNET(pfsync_swi_cookie) static VNET_DEFINE(struct pfsyncstats, pfsyncstats); #define V_pfsyncstats VNET(pfsyncstats) static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW; #define V_pfsync_carp_adj VNET(pfsync_carp_adj) static void pfsync_timeout(void *); static void pfsync_push(struct pfsync_softc *); static void pfsyncintr(void *); static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *, void *); static void pfsync_multicast_cleanup(struct pfsync_softc *); static void pfsync_pointers_init(void); static void pfsync_pointers_uninit(void); static int pfsync_init(void); static void pfsync_uninit(void); SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC"); SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(pfsyncstats), pfsyncstats, "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)"); SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW, &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment"); static int pfsync_clone_create(struct if_clone *, int, caddr_t); static void pfsync_clone_destroy(struct ifnet *); static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *, struct pf_state_peer *); static int pfsyncoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int pfsyncioctl(struct ifnet *, u_long, caddr_t); static int pfsync_defer(struct pf_state *, struct mbuf *); static void pfsync_undefer(struct pfsync_deferral *, int); static void pfsync_undefer_state(struct pf_state *, int); static void pfsync_defer_tmo(void *); static void pfsync_request_update(u_int32_t, u_int64_t); static void pfsync_update_state_req(struct pf_state *); static void pfsync_drop(struct 
pfsync_softc *); static void pfsync_sendout(int); static void pfsync_send_plus(void *, size_t); static void pfsync_bulk_start(void); static void pfsync_bulk_status(u_int8_t); static void pfsync_bulk_update(void *); static void pfsync_bulk_fail(void *); #ifdef IPSEC static void pfsync_update_net_tdb(struct pfsync_tdb *); #endif #define PFSYNC_MAX_BULKTRIES 12 VNET_DEFINE(struct if_clone *, pfsync_cloner); #define V_pfsync_cloner VNET(pfsync_cloner) static int pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) { struct pfsync_softc *sc; struct ifnet *ifp; int q; if (unit != 0) return (EINVAL); sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO); sc->sc_flags |= PFSYNCF_OK; for (q = 0; q < PFSYNC_S_COUNT; q++) TAILQ_INIT(&sc->sc_qs[q]); TAILQ_INIT(&sc->sc_upd_req_list); TAILQ_INIT(&sc->sc_deferrals); sc->sc_len = PFSYNC_MINPKT; sc->sc_maxupdates = 128; ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC); if (ifp == NULL) { free(sc, M_PFSYNC); return (ENOSPC); } if_initname(ifp, pfsyncname, unit); ifp->if_softc = sc; ifp->if_ioctl = pfsyncioctl; ifp->if_output = pfsyncoutput; ifp->if_type = IFT_PFSYNC; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_hdrlen = sizeof(struct pfsync_header); ifp->if_mtu = ETHERMTU; mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF); mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF); callout_init(&sc->sc_tmo, 1); callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0); callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0); if_attach(ifp); bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN); V_pfsyncif = sc; return (0); } static void pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; /* * At this stage, everything should have already been * cleared by pfsync_uninit(), and we have only to * drain callouts. 
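* (The loop below leans on callout(9) semantics: callout_stop()
* returning > 0 means the timeout was cancelled before it fired, so we
* still own the deferred state and mbuf and must release them here;
* otherwise pfsync_defer_tmo() has run, or is running, and has already
* consumed them, leaving only the pfsync_deferral itself to free once
* callout_drain() returns.)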
*/ while (sc->sc_deferred > 0) { struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; if (callout_stop(&pd->pd_tmo) > 0) { pf_release_state(pd->pd_st); m_freem(pd->pd_m); free(pd, M_PFSYNC); } else { pd->pd_refs++; callout_drain(&pd->pd_tmo); free(pd, M_PFSYNC); } } callout_drain(&sc->sc_tmo); callout_drain(&sc->sc_bulkfail_tmo); callout_drain(&sc->sc_bulk_tmo); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy"); bpfdetach(ifp); if_detach(ifp); pfsync_drop(sc); if_free(ifp); if (sc->sc_imo.imo_membership) pfsync_multicast_cleanup(sc); mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_bulk_mtx); free(sc, M_PFSYNC); V_pfsyncif = NULL; } static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, struct pf_state_peer *d) { if (s->scrub.scrub_flag && d->scrub == NULL) { d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO); if (d->scrub == NULL) return (ENOMEM); } return (0); } static int pfsync_state_import(struct pfsync_state *sp, u_int8_t flags) { struct pfsync_softc *sc = V_pfsyncif; #ifndef __NO_STRICT_ALIGNMENT struct pfsync_state_key key[2]; #endif struct pfsync_state_key *kw, *ks; struct pf_state *st = NULL; struct pf_state_key *skw = NULL, *sks = NULL; struct pf_rule *r = NULL; struct pfi_kif *kif; int error; PF_RULES_RASSERT(); if (sp->creatorid == 0) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: invalid creator id: %08x\n", __func__, ntohl(sp->creatorid)); return (EINVAL); } if ((kif = pfi_kif_find(sp->ifname)) == NULL) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: unknown interface: %s\n", __func__, sp->ifname); if (flags & PFSYNC_SI_IOCTL) return (EINVAL); return (0); /* skip this state */ } /* * If the ruleset checksums match or the state is coming from the ioctl, * it's safe to associate the state with the rule of that number. */ if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) < pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) r = pf_main_ruleset.rules[ PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)]; else r = &V_pf_default_rule; if ((r->max_states && counter_u64_fetch(r->states_cur) >= r->max_states)) goto cleanup; /* * XXXGL: consider M_WAITOK in ioctl path after. */ if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL) goto cleanup; if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL) goto cleanup; #ifndef __NO_STRICT_ALIGNMENT bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2); kw = &key[PF_SK_WIRE]; ks = &key[PF_SK_STACK]; #else kw = &sp->key[PF_SK_WIRE]; ks = &sp->key[PF_SK_STACK]; #endif if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) || PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) || kw->port[0] != ks->port[0] || kw->port[1] != ks->port[1]) { sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sks == NULL) goto cleanup; } else sks = skw; /* allocate memory for scrub info */ if (pfsync_alloc_scrub_memory(&sp->src, &st->src) || pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) goto cleanup; /* Copy to state key(s). 
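* (@skw is the wire-side key and @sks the stack-side one; when the
* addresses and ports are identical on both sides they alias a single
* allocation, which is why the sks != skw case is handled separately
* below.)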
*/ skw->addr[0] = kw->addr[0]; skw->addr[1] = kw->addr[1]; skw->port[0] = kw->port[0]; skw->port[1] = kw->port[1]; skw->proto = sp->proto; skw->af = sp->af; if (sks != skw) { sks->addr[0] = ks->addr[0]; sks->addr[1] = ks->addr[1]; sks->port[0] = ks->port[0]; sks->port[1] = ks->port[1]; sks->proto = sp->proto; sks->af = sp->af; } /* copy to state */ bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr)); st->creation = time_uptime - ntohl(sp->creation); st->expire = time_uptime; if (sp->expire) { uint32_t timeout; timeout = r->timeout[sp->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[sp->timeout]; /* sp->expire may have been adaptively scaled by export. */ st->expire -= timeout - ntohl(sp->expire); } st->direction = sp->direction; st->log = sp->log; st->timeout = sp->timeout; st->state_flags = sp->state_flags; st->id = sp->id; st->creatorid = sp->creatorid; pf_state_peer_ntoh(&sp->src, &st->src); pf_state_peer_ntoh(&sp->dst, &st->dst); st->rule.ptr = r; st->nat_rule.ptr = NULL; st->anchor.ptr = NULL; st->rt_kif = NULL; st->pfsync_time = time_uptime; st->sync_state = PFSYNC_S_NONE; if (!(flags & PFSYNC_SI_IOCTL)) st->state_flags |= PFSTATE_NOSYNC; if ((error = pf_state_insert(kif, skw, sks, st)) != 0) goto cleanup_state; /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */ counter_u64_add(r->states_cur, 1); counter_u64_add(r->states_tot, 1); if (!(flags & PFSYNC_SI_IOCTL)) { st->state_flags &= ~PFSTATE_NOSYNC; if (st->state_flags & PFSTATE_ACK) { pfsync_q_ins(st, PFSYNC_S_IACK); pfsync_push(sc); } } st->state_flags &= ~PFSTATE_ACK; PF_STATE_UNLOCK(st); return (0); cleanup: error = ENOMEM; if (skw == sks) sks = NULL; if (skw != NULL) uma_zfree(V_pf_state_key_z, skw); if (sks != NULL) uma_zfree(V_pf_state_key_z, sks); cleanup_state: /* pf_state_insert() frees the state keys. */ if (st) { if (st->dst.scrub) uma_zfree(V_pf_state_scrub_z, st->dst.scrub); if (st->src.scrub) uma_zfree(V_pf_state_scrub_z, st->src.scrub); uma_zfree(V_pf_state_z, st); } return (error); } static int pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_pkt pkt; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pfsync_header *ph; struct pfsync_subheader subh; int offset, len; int rv; uint16_t count; *mp = NULL; V_pfsyncstats.pfsyncs_ipackets++; /* Verify that we have a sync interface configured. */ if (!sc || !sc->sc_sync_if || !V_pf_status.running || (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) goto done; /* verify that the packet came in on the right interface */ if (sc->sc_sync_if != m->m_pkthdr.rcvif) { V_pfsyncstats.pfsyncs_badif++; goto done; } if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* verify that the IP TTL is 255. 
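* (PFSYNC_DFLTTL is 255, and routers always decrement the TTL, so this
* check proves the packet originated on a directly attached network --
* the same trick GTSM, RFC 5082, relies on.)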
*/ if (ip->ip_ttl != PFSYNC_DFLTTL) { V_pfsyncstats.pfsyncs_badttl++; goto done; } offset = ip->ip_hl << 2; if (m->m_pkthdr.len < offset + sizeof(*ph)) { V_pfsyncstats.pfsyncs_hdrops++; goto done; } if (offset + sizeof(*ph) > m->m_len) { if (m_pullup(m, offset + sizeof(*ph)) == NULL) { V_pfsyncstats.pfsyncs_hdrops++; return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ph = (struct pfsync_header *)((char *)ip + offset); /* verify the version */ if (ph->version != PFSYNC_VERSION) { V_pfsyncstats.pfsyncs_badver++; goto done; } len = ntohs(ph->len) + offset; if (m->m_pkthdr.len < len) { V_pfsyncstats.pfsyncs_badlen++; goto done; } /* Cheaper to grab this now than having to mess with mbufs later */ pkt.ip = ip; pkt.src = ip->ip_src; pkt.flags = 0; /* * Trusting pf_chksum during packet processing, as well as seeking * in interface name tree, require holding PF_RULES_RLOCK(). */ PF_RULES_RLOCK(); if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) pkt.flags |= PFSYNC_SI_CKSUM; offset += sizeof(*ph); while (offset <= len - sizeof(subh)) { m_copydata(m, offset, sizeof(subh), (caddr_t)&subh); offset += sizeof(subh); if (subh.action >= PFSYNC_ACT_MAX) { V_pfsyncstats.pfsyncs_badact++; PF_RULES_RUNLOCK(); goto done; } count = ntohs(subh.count); V_pfsyncstats.pfsyncs_iacts[subh.action] += count; rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count); if (rv == -1) { PF_RULES_RUNLOCK(); return (IPPROTO_DONE); } offset += rv; } PF_RULES_RUNLOCK(); done: m_freem(m); return (IPPROTO_DONE); } static int pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_clr *clr; struct mbuf *mp; int len = sizeof(*clr) * count; int i, offp; u_int32_t creatorid; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } clr = (struct pfsync_clr *)(mp->m_data + offp); for (i = 0; i < count; i++) { creatorid = clr[i].creatorid; if (clr[i].ifname[0] != '\0' && pfi_kif_find(clr[i].ifname) == NULL) continue; for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state *s; relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (s->creatorid == creatorid) { s->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } } PF_HASHROW_UNLOCK(ih); } } return (len); } static int pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_state *sa, *sp; int len = sizeof(*sp) * count; int i, offp; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; /* Check for invalid values. */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST || sp->direction > PF_OUT || (sp->af != AF_INET && sp->af != AF_INET6)) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: invalid value\n", __func__); V_pfsyncstats.pfsyncs_badval++; continue; } if (pfsync_state_import(sp, pkt->flags) == ENOMEM) /* Drop out, but process the rest of the actions. 
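* (The break below only abandons the remaining states of this
* subheader; by still returning len, pfsync_input() can carry on with
* the next subheader in the same datagram.)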
*/ break; } return (len); } static int pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_ins_ack *ia, *iaa; struct pf_state *st; struct mbuf *mp; int len = count * sizeof(*ia); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } iaa = (struct pfsync_ins_ack *)(mp->m_data + offp); for (i = 0; i < count; i++) { ia = &iaa[i]; st = pf_find_state_byid(ia->id, ia->creatorid); if (st == NULL) continue; if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(V_pfsyncif); pfsync_undefer_state(st, 0); PFSYNC_UNLOCK(V_pfsyncif); } PF_STATE_UNLOCK(st); } /* * XXX this is not yet implemented, but we know the size of the * message so we can skip it. */ return (count * sizeof(struct pfsync_ins_ack)); } static int pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src, struct pfsync_state_peer *dst) { int sync = 0; PF_STATE_LOCK_ASSERT(st); /* * The state should never go backwards except * for syn-proxy states. Neither should the * sequence window slide backwards. */ if ((st->src.state > src->state && (st->src.state < PF_TCPS_PROXY_SRC || src->state >= PF_TCPS_PROXY_SRC)) || (st->src.state == src->state && SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) sync++; else pf_state_peer_ntoh(src, &st->src); if ((st->dst.state > dst->state) || (st->dst.state >= TCPS_SYN_SENT && SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) sync++; else pf_state_peer_ntoh(dst, &st->dst); return (sync); } static int pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_state *sa, *sp; struct pf_state *st; int sync; struct mbuf *mp; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_input: PFSYNC_ACT_UPD: " "invalid value\n"); } V_pfsyncstats.pfsyncs_badval++; continue; } st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { /* insert the update */ if (pfsync_state_import(sp, 0)) V_pfsyncstats.pfsyncs_badstate++; continue; } if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(sc); pfsync_undefer_state(st, 1); PFSYNC_UNLOCK(sc); } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, &sp->src, &sp->dst); else { sync = 0; /* * Non-TCP protocol state machines always go * forwards */ if (st->src.state > sp->src.state) sync++; else pf_state_peer_ntoh(&sp->src, &st->src); if (st->dst.state > sp->dst.state) sync++; else pf_state_peer_ntoh(&sp->dst, &st->dst); } if (sync < 2) { pfsync_alloc_scrub_memory(&sp->dst, &st->dst); pf_state_peer_ntoh(&sp->dst, &st->dst); st->expire = time_uptime; st->timeout = sp->timeout; } st->pfsync_time = time_uptime; if (sync) { V_pfsyncstats.pfsyncs_stale++; pfsync_update_state(st); PF_STATE_UNLOCK(st); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); continue; } PF_STATE_UNLOCK(st); } return (len); } static int pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_upd_c *ua, *up; struct pf_state *st; int len = count * sizeof(*up); int sync; struct mbuf *mp; int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { 
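		/*
		 * The advertised count does not fit into the mbuf chain
		 * (or m_pulldown(9) ran out of memory): account for it
		 * and abort parsing of this packet.
		 */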
V_pfsyncstats.pfsyncs_badlen++; return (-1); } ua = (struct pfsync_upd_c *)(mp->m_data + offp); for (i = 0; i < count; i++) { up = &ua[i]; /* check for invalid values */ if (up->timeout >= PFTM_MAX || up->src.state > PF_TCPS_PROXY_DST || up->dst.state > PF_TCPS_PROXY_DST) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_input: " "PFSYNC_ACT_UPD_C: " "invalid value\n"); } V_pfsyncstats.pfsyncs_badval++; continue; } st = pf_find_state_byid(up->id, up->creatorid); if (st == NULL) { /* We don't have this state. Ask for it. */ PFSYNC_LOCK(sc); pfsync_request_update(up->creatorid, up->id); PFSYNC_UNLOCK(sc); continue; } if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(sc); pfsync_undefer_state(st, 1); PFSYNC_UNLOCK(sc); } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, &up->src, &up->dst); else { sync = 0; /* * Non-TCP protocol state machines always go * forwards */ if (st->src.state > up->src.state) sync++; else pf_state_peer_ntoh(&up->src, &st->src); if (st->dst.state > up->dst.state) sync++; else pf_state_peer_ntoh(&up->dst, &st->dst); } if (sync < 2) { pfsync_alloc_scrub_memory(&up->dst, &st->dst); pf_state_peer_ntoh(&up->dst, &st->dst); st->expire = time_uptime; st->timeout = up->timeout; } st->pfsync_time = time_uptime; if (sync) { V_pfsyncstats.pfsyncs_stale++; pfsync_update_state(st); PF_STATE_UNLOCK(st); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); continue; } PF_STATE_UNLOCK(st); } return (len); } static int pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_upd_req *ur, *ura; struct mbuf *mp; int len = count * sizeof(*ur); int i, offp; struct pf_state *st; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } ura = (struct pfsync_upd_req *)(mp->m_data + offp); for (i = 0; i < count; i++) { ur = &ura[i]; if (ur->id == 0 && ur->creatorid == 0) pfsync_bulk_start(); else { st = pf_find_state_byid(ur->id, ur->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } if (st->state_flags & PFSTATE_NOSYNC) { PF_STATE_UNLOCK(st); continue; } pfsync_update_state_req(st); PF_STATE_UNLOCK(st); } } return (len); } static int pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_state *sa, *sp; struct pf_state *st; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } st->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(st, PF_ENTER_LOCKED); } return (len); } static int pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_del_c *sa, *sp; struct pf_state *st; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_del_c *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } st->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(st, PF_ENTER_LOCKED); } return (len); } static int pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_bus *bus; 
struct mbuf *mp; int len = count * sizeof(*bus); int offp; PFSYNC_BLOCK(sc); /* If we're not waiting for a bulk update, who cares. */ if (sc->sc_ureq_sent == 0) { PFSYNC_BUNLOCK(sc); return (len); } mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { PFSYNC_BUNLOCK(sc); V_pfsyncstats.pfsyncs_badlen++; return (-1); } bus = (struct pfsync_bus *)(mp->m_data + offp); switch (bus->status) { case PFSYNC_BUS_START: callout_reset(&sc->sc_bulkfail_tmo, 4 * hz + V_pf_limits[PF_LIMIT_STATES].limit / ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) / sizeof(struct pfsync_state)), pfsync_bulk_fail, sc); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk update start\n"); break; case PFSYNC_BUS_END: if (time_uptime - ntohl(bus->endtime) >= sc->sc_ureq_sent) { /* that's it, we're happy */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; callout_stop(&sc->sc_bulkfail_tmo); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync bulk done"); sc->sc_flags |= PFSYNCF_OK; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received valid " "bulk update end\n"); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received invalid " "bulk update end: bad timestamp\n"); } break; } PFSYNC_BUNLOCK(sc); return (len); } static int pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { int len = count * sizeof(struct pfsync_tdb); #if defined(IPSEC) struct pfsync_tdb *tp; struct mbuf *mp; int offp; int i; int s; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } tp = (struct pfsync_tdb *)(mp->m_data + offp); for (i = 0; i < count; i++) pfsync_update_net_tdb(&tp[i]); #endif return (len); } #if defined(IPSEC) /* Update an in-kernel tdb. Silently fail if no tdb is found. */ static void pfsync_update_net_tdb(struct pfsync_tdb *pt) { struct tdb *tdb; int s; /* check for invalid values */ if (ntohl(pt->spi) <= SPI_RESERVED_MAX || (pt->dst.sa.sa_family != AF_INET && pt->dst.sa.sa_family != AF_INET6)) goto bad; tdb = gettdb(pt->spi, &pt->dst, pt->sproto); if (tdb) { pt->rpl = ntohl(pt->rpl); pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes); /* Neither replay nor byte counter should ever decrease. */ if (pt->rpl < tdb->tdb_rpl || pt->cur_bytes < tdb->tdb_cur_bytes) { goto bad; } tdb->tdb_rpl = pt->rpl; tdb->tdb_cur_bytes = pt->cur_bytes; } return; bad: if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: " "invalid value\n"); V_pfsyncstats.pfsyncs_badstate++; return; } #endif static int pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { /* check if we are at the right place in the packet */ if (offset != m->m_pkthdr.len) V_pfsyncstats.pfsyncs_badlen++; /* we're done. 
free and let the caller return */ m_freem(m); return (-1); } static int pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { V_pfsyncstats.pfsyncs_badact++; m_freem(m); return (-1); } static int pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *rt) { m_freem(m); return (0); } /* ARGSUSED */ static int pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct pfsync_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct pfsyncreq pfsyncr; int error; switch (cmd) { case SIOCSIFFLAGS: PFSYNC_LOCK(sc); if (ifp->if_flags & IFF_UP) { ifp->if_drv_flags |= IFF_DRV_RUNNING; PFSYNC_UNLOCK(sc); pfsync_pointers_init(); } else { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; PFSYNC_UNLOCK(sc); pfsync_pointers_uninit(); } break; case SIOCSIFMTU: if (!sc->sc_sync_if || ifr->ifr_mtu <= PFSYNC_MINPKT || ifr->ifr_mtu > sc->sc_sync_if->if_mtu) return (EINVAL); if (ifr->ifr_mtu < ifp->if_mtu) { PFSYNC_LOCK(sc); if (sc->sc_len > PFSYNC_MINPKT) pfsync_sendout(1); PFSYNC_UNLOCK(sc); } ifp->if_mtu = ifr->ifr_mtu; break; case SIOCGETPFSYNC: bzero(&pfsyncr, sizeof(pfsyncr)); PFSYNC_LOCK(sc); if (sc->sc_sync_if) { strlcpy(pfsyncr.pfsyncr_syncdev, sc->sc_sync_if->if_xname, IFNAMSIZ); } pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER == (sc->sc_flags & PFSYNCF_DEFER)); PFSYNC_UNLOCK(sc); return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); case SIOCSETPFSYNC: { struct ip_moptions *imo = &sc->sc_imo; struct ifnet *sifp; struct ip *ip; void *mship = NULL; if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) return (error); if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) return (error); if (pfsyncr.pfsyncr_maxupdates > 255) return (EINVAL); if (pfsyncr.pfsyncr_syncdev[0] == 0) sifp = NULL; else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL) return (EINVAL); if (sifp != NULL && ( pfsyncr.pfsyncr_syncpeer.s_addr == 0 || pfsyncr.pfsyncr_syncpeer.s_addr == htonl(INADDR_PFSYNC_GROUP))) mship = malloc((sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO); PFSYNC_LOCK(sc); if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); else sc->sc_sync_peer.s_addr = pfsyncr.pfsyncr_syncpeer.s_addr; sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; if (pfsyncr.pfsyncr_defer) { sc->sc_flags |= PFSYNCF_DEFER; pfsync_defer_ptr = pfsync_defer; } else { sc->sc_flags &= ~PFSYNCF_DEFER; pfsync_defer_ptr = NULL; } if (sifp == NULL) { if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = NULL; if (imo->imo_membership) pfsync_multicast_cleanup(sc); PFSYNC_UNLOCK(sc); break; } if (sc->sc_len > PFSYNC_MINPKT && (sifp->if_mtu < sc->sc_ifp->if_mtu || (sc->sc_sync_if != NULL && sifp->if_mtu < sc->sc_sync_if->if_mtu) || sifp->if_mtu < MCLBYTES - sizeof(struct ip))) pfsync_sendout(1); if (imo->imo_membership) pfsync_multicast_cleanup(sc); if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { error = pfsync_multicast_setup(sc, sifp, mship); if (error) { if_rele(sifp); free(mship, M_PFSYNC); return (error); } } if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = sifp; ip = &sc->sc_template; bzero(ip, sizeof(*ip)); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(sc->sc_template) >> 2; ip->ip_tos = IPTOS_LOWDELAY; /* len and id are set later. 
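* (This template is copied in front of every outgoing pfsync datagram
* by pfsync_sendout(), which fills ip_len from the final packet length
* and picks ip_id via ip_fillid().)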
*/ ip->ip_off = htons(IP_DF); ip->ip_ttl = PFSYNC_DFLTTL; ip->ip_p = IPPROTO_PFSYNC; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr; /* Request a full state table update. */ if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(V_pfsync_carp_adj, "pfsync bulk start"); sc->sc_flags &= ~PFSYNCF_OK; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: requesting bulk update\n"); pfsync_request_update(0, 0); PFSYNC_UNLOCK(sc); PFSYNC_BLOCK(sc); sc->sc_ureq_sent = time_uptime; callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, sc); PFSYNC_BUNLOCK(sc); break; } default: return (ENOTTY); } return (0); } static void pfsync_out_state(struct pf_state *st, void *buf) { struct pfsync_state *sp = buf; pfsync_state_export(sp, st); } static void pfsync_out_iack(struct pf_state *st, void *buf) { struct pfsync_ins_ack *iack = buf; iack->id = st->id; iack->creatorid = st->creatorid; } static void pfsync_out_upd_c(struct pf_state *st, void *buf) { struct pfsync_upd_c *up = buf; bzero(up, sizeof(*up)); up->id = st->id; pf_state_peer_hton(&st->src, &up->src); pf_state_peer_hton(&st->dst, &up->dst); up->creatorid = st->creatorid; up->timeout = st->timeout; } static void pfsync_out_del(struct pf_state *st, void *buf) { struct pfsync_del_c *dp = buf; dp->id = st->id; dp->creatorid = st->creatorid; st->state_flags |= PFSTATE_NOSYNC; } static void pfsync_drop(struct pfsync_softc *sc) { struct pf_state *st, *next; struct pfsync_upd_req_item *ur; int q; for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sc->sc_qs[q])) continue; TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) { KASSERT(st->sync_state == q, ("%s: st->sync_state == q", __func__)); st->sync_state = PFSYNC_S_NONE; pf_release_state(st); } TAILQ_INIT(&sc->sc_qs[q]); } while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); free(ur, M_PFSYNC); } sc->sc_plus = NULL; sc->sc_len = PFSYNC_MINPKT; } static void pfsync_sendout(int schedswi) { struct pfsync_softc *sc = V_pfsyncif; struct ifnet *ifp = sc->sc_ifp; struct mbuf *m; struct ip *ip; struct pfsync_header *ph; struct pfsync_subheader *subh; struct pf_state *st; struct pfsync_upd_req_item *ur; int offset; int q, count = 0; KASSERT(sc != NULL, ("%s: null sc", __func__)); KASSERT(sc->sc_len > PFSYNC_MINPKT, ("%s: sc_len %zu", __func__, sc->sc_len)); PFSYNC_LOCK_ASSERT(sc); if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) { pfsync_drop(sc); return; } m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); V_pfsyncstats.pfsyncs_onomem++; return; } m->m_data += max_linkhdr; m->m_len = m->m_pkthdr.len = sc->sc_len; /* build the ip header */ ip = (struct ip *)m->m_data; bcopy(&sc->sc_template, ip, sizeof(*ip)); offset = sizeof(*ip); ip->ip_len = htons(m->m_pkthdr.len); ip_fillid(ip); /* build the pfsync header */ ph = (struct pfsync_header *)(m->m_data + offset); bzero(ph, sizeof(*ph)); offset += sizeof(*ph); ph->version = PFSYNC_VERSION; ph->len = htons(sc->sc_len - sizeof(*ip)); bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); /* walk the queues */ for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sc->sc_qs[q])) continue; subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) { KASSERT(st->sync_state == q, ("%s: st->sync_state == q", __func__)); /* * XXXGL: some of write methods do unlocked reads * of state 
data :( */ pfsync_qs[q].write(st, m->m_data + offset); offset += pfsync_qs[q].len; st->sync_state = PFSYNC_S_NONE; pf_release_state(st); count++; } TAILQ_INIT(&sc->sc_qs[q]); bzero(subh, sizeof(*subh)); subh->action = pfsync_qs[q].action; subh->count = htons(count); V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count; } if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); bcopy(&ur->ur_msg, m->m_data + offset, sizeof(ur->ur_msg)); offset += sizeof(ur->ur_msg); free(ur, M_PFSYNC); count++; } bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_UPD_REQ; subh->count = htons(count); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count; } /* has someone built a custom region for us to add? */ if (sc->sc_plus != NULL) { bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen); offset += sc->sc_pluslen; sc->sc_plus = NULL; } subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_EOF; subh->count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++; /* we're done, let's put it on the wire */ if (ifp->if_bpf) { m->m_data += sizeof(*ip); m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip); BPF_MTAP(ifp, m); m->m_data -= sizeof(*ip); m->m_len = m->m_pkthdr.len = sc->sc_len; } if (sc->sc_sync_if == NULL) { sc->sc_len = PFSYNC_MINPKT; m_freem(m); return; } if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); sc->sc_len = PFSYNC_MINPKT; if (!_IF_QFULL(&sc->sc_ifp->if_snd)) _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); else { m_freem(m); if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1); } if (schedswi) swi_sched(V_pfsync_swi_cookie, 0); } static void pfsync_insert_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; if (st->state_flags & PFSTATE_NOSYNC) return; if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) || st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { st->state_flags |= PFSTATE_NOSYNC; return; } KASSERT(st->sync_state == PFSYNC_S_NONE, ("%s: st->sync_state %u", __func__, st->sync_state)); PFSYNC_LOCK(sc); if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); pfsync_q_ins(st, PFSYNC_S_INS); PFSYNC_UNLOCK(sc); st->sync_updates = 0; } static int pfsync_defer(struct pf_state *st, struct mbuf *m) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_deferral *pd; if (m->m_flags & (M_BCAST|M_MCAST)) return (0); PFSYNC_LOCK(sc); if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) || !(sc->sc_flags & PFSYNCF_DEFER)) { PFSYNC_UNLOCK(sc); return (0); } if (sc->sc_deferred >= 128) pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0); pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT); if (pd == NULL) return (0); sc->sc_deferred++; m->m_flags |= M_SKIP_FIREWALL; st->state_flags |= PFSTATE_ACK; pd->pd_sc = sc; pd->pd_refs = 0; pd->pd_st = st; pf_ref_state(st); pd->pd_m = m; TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd); pfsync_push(sc); return (1); } static void pfsync_undefer(struct pfsync_deferral *pd, int drop) { struct pfsync_softc *sc = pd->pd_sc; struct mbuf *m = pd->pd_m; struct pf_state *st = pd->pd_st; PFSYNC_LOCK_ASSERT(sc); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; 
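	/*
	 * From here on the state no longer waits for a peer ack; the
	 * deferred packet is either dropped or finally transmitted below.
	 */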
pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ free(pd, M_PFSYNC); pf_release_state(st); if (drop) m_freem(m); else { _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); pfsync_push(sc); } } static void pfsync_defer_tmo(void *arg) { struct pfsync_deferral *pd = arg; struct pfsync_softc *sc = pd->pd_sc; struct mbuf *m = pd->pd_m; struct pf_state *st = pd->pd_st; PFSYNC_LOCK_ASSERT(sc); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ if (pd->pd_refs == 0) free(pd, M_PFSYNC); PFSYNC_UNLOCK(sc); ip_output(m, NULL, NULL, 0, NULL, NULL); pf_release_state(st); CURVNET_RESTORE(); } static void pfsync_undefer_state(struct pf_state *st, int drop) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_deferral *pd; PFSYNC_LOCK_ASSERT(sc); TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { if (pd->pd_st == st) { if (callout_stop(&pd->pd_tmo) > 0) pfsync_undefer(pd, drop); return; } } panic("%s: unable to find deferred state", __func__); } static void pfsync_update_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; int sync = 0; PF_STATE_LOCK_ASSERT(st); PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_ACK) pfsync_undefer_state(st, 0); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_INS: /* we're already handling it */ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { st->sync_updates++; if (st->sync_updates >= sc->sc_maxupdates) sync = 1; } break; case PFSYNC_S_IACK: pfsync_q_del(st); case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_UPD_C); st->sync_updates = 0; break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } if (sync || (time_uptime - st->pfsync_time) < 2) pfsync_push(sc); PFSYNC_UNLOCK(sc); } static void pfsync_request_update(u_int32_t creatorid, u_int64_t id) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_upd_req_item *item; size_t nlen = sizeof(struct pfsync_upd_req); PFSYNC_LOCK_ASSERT(sc); /* * This code does a bit to prevent multiple update requests for the * same state being generated. It searches the current subheader queue, * but it doesn't look into the queue of already packed datagrams. 
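* (A duplicate request may therefore still slip out across datagram
* boundaries; that should be harmless, as the peer merely answers the
* same request twice.)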
*/ TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry) if (item->ur_msg.id == id && item->ur_msg.creatorid == creatorid) return; item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT); if (item == NULL) return; /* XXX stats */ item->ur_msg.id = id; item->ur_msg.creatorid = creatorid; if (TAILQ_EMPTY(&sc->sc_upd_req_list)) nlen += sizeof(struct pfsync_subheader); if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { pfsync_sendout(1); nlen = sizeof(struct pfsync_subheader) + sizeof(struct pfsync_upd_req); } TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); sc->sc_len += nlen; } static void pfsync_update_state_req(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; PF_STATE_LOCK_ASSERT(st); PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_IACK: pfsync_q_del(st); case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_UPD); pfsync_push(sc); break; case PFSYNC_S_INS: case PFSYNC_S_UPD: case PFSYNC_S_DEL: /* we're already handling it */ break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } PFSYNC_UNLOCK(sc); } static void pfsync_delete_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_ACK) pfsync_undefer_state(st, 1); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); switch (st->sync_state) { case PFSYNC_S_INS: /* We never got to tell the world so just forget about it. */ pfsync_q_del(st); break; case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_IACK: pfsync_q_del(st); /* FALLTHROUGH to putting it on the del list */ case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_DEL); break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } PFSYNC_UNLOCK(sc); } static void pfsync_clear_states(u_int32_t creatorid, const char *ifname) { struct pfsync_softc *sc = V_pfsyncif; struct { struct pfsync_subheader subh; struct pfsync_clr clr; } __packed r; bzero(&r, sizeof(r)); r.subh.action = PFSYNC_ACT_CLR; r.subh.count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++; strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname)); r.clr.creatorid = creatorid; PFSYNC_LOCK(sc); pfsync_send_plus(&r, sizeof(r)); PFSYNC_UNLOCK(sc); } static void pfsync_q_ins(struct pf_state *st, int q) { struct pfsync_softc *sc = V_pfsyncif; size_t nlen = pfsync_qs[q].len; PFSYNC_LOCK_ASSERT(sc); KASSERT(st->sync_state == PFSYNC_S_NONE, ("%s: st->sync_state %u", __func__, st->sync_state)); KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu", sc->sc_len)); if (TAILQ_EMPTY(&sc->sc_qs[q])) nlen += sizeof(struct pfsync_subheader); if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { pfsync_sendout(1); nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; } sc->sc_len += nlen; TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); st->sync_state = q; pf_ref_state(st); } static void pfsync_q_del(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; int q = st->sync_state; PFSYNC_LOCK_ASSERT(sc); KASSERT(st->sync_state != PFSYNC_S_NONE, ("%s: st->sync_state != PFSYNC_S_NONE", __func__)); sc->sc_len -= pfsync_qs[q].len; TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); st->sync_state = PFSYNC_S_NONE; pf_release_state(st); if (TAILQ_EMPTY(&sc->sc_qs[q])) sc->sc_len -= sizeof(struct pfsync_subheader); } static 
void pfsync_bulk_start(void) { struct pfsync_softc *sc = V_pfsyncif; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk update request\n"); PFSYNC_BLOCK(sc); sc->sc_ureq_received = time_uptime; sc->sc_bulk_hashid = 0; sc->sc_bulk_stateid = 0; pfsync_bulk_status(PFSYNC_BUS_START); callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); PFSYNC_BUNLOCK(sc); } static void pfsync_bulk_update(void *arg) { struct pfsync_softc *sc = arg; struct pf_state *s; int i, sent = 0; PFSYNC_BLOCK_ASSERT(sc); CURVNET_SET(sc->sc_ifp->if_vnet); /* * Start with last state from previous invocation. * It may have gone away; in that case start from the * hash slot. */ s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid); if (s != NULL) i = PF_IDHASH(s); else i = sc->sc_bulk_hashid; for (; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; if (s != NULL) PF_HASHROW_ASSERT(ih); else { PF_HASHROW_LOCK(ih); s = LIST_FIRST(&ih->states); } for (; s; s = LIST_NEXT(s, entry)) { if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) < sizeof(struct pfsync_state)) { /* We've filled a packet. */ sc->sc_bulk_hashid = i; sc->sc_bulk_stateid = s->id; sc->sc_bulk_creatorid = s->creatorid; PF_HASHROW_UNLOCK(ih); callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); goto full; } if (s->sync_state == PFSYNC_S_NONE && s->timeout < PFTM_MAX && s->pfsync_time <= sc->sc_ureq_received) { pfsync_update_state_req(s); sent++; } } PF_HASHROW_UNLOCK(ih); } /* We're done. */ pfsync_bulk_status(PFSYNC_BUS_END); full: CURVNET_RESTORE(); } static void pfsync_bulk_status(u_int8_t status) { struct { struct pfsync_subheader subh; struct pfsync_bus bus; } __packed r; struct pfsync_softc *sc = V_pfsyncif; bzero(&r, sizeof(r)); r.subh.action = PFSYNC_ACT_BUS; r.subh.count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++; r.bus.creatorid = V_pf_status.hostid; r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received); r.bus.status = status; PFSYNC_LOCK(sc); pfsync_send_plus(&r, sizeof(r)); PFSYNC_UNLOCK(sc); } static void pfsync_bulk_fail(void *arg) { struct pfsync_softc *sc = arg; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_BLOCK_ASSERT(sc); if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { /* Try again */ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, V_pfsyncif); PFSYNC_LOCK(sc); pfsync_request_update(0, 0); PFSYNC_UNLOCK(sc); } else { /* Pretend like the transfer was ok. 
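* (After PFSYNC_MAX_BULKTRIES unanswered attempts we stop waiting:
* the CARP demotion adjustment is reverted and PFSYNCF_OK is set
* anyway, so a peer with a possibly incomplete state table can still
* take over instead of staying demoted forever.)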
*/ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; PFSYNC_LOCK(sc); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync bulk fail"); sc->sc_flags |= PFSYNCF_OK; PFSYNC_UNLOCK(sc); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: failed to receive bulk update\n"); } CURVNET_RESTORE(); } static void pfsync_send_plus(void *plus, size_t pluslen) { struct pfsync_softc *sc = V_pfsyncif; PFSYNC_LOCK_ASSERT(sc); if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu) pfsync_sendout(1); sc->sc_plus = plus; sc->sc_len += (sc->sc_pluslen = pluslen); pfsync_sendout(1); } static void pfsync_timeout(void *arg) { struct pfsync_softc *sc = arg; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); CURVNET_RESTORE(); } static void pfsync_push(struct pfsync_softc *sc) { PFSYNC_LOCK_ASSERT(sc); sc->sc_flags |= PFSYNCF_PUSH; swi_sched(V_pfsync_swi_cookie, 0); } static void pfsyncintr(void *arg) { struct pfsync_softc *sc = arg; struct mbuf *m, *n; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_LOCK(sc); if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) { pfsync_sendout(0); sc->sc_flags &= ~PFSYNCF_PUSH; } _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m); PFSYNC_UNLOCK(sc); for (; m != NULL; m = n) { n = m->m_nextpkt; m->m_nextpkt = NULL; /* * We distinguish between a deferral packet and our * own pfsync packet based on M_SKIP_FIREWALL * flag. This is XXX. */ if (m->m_flags & M_SKIP_FIREWALL) ip_output(m, NULL, NULL, 0, NULL, NULL); else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL) == 0) V_pfsyncstats.pfsyncs_opackets++; else V_pfsyncstats.pfsyncs_oerrors++; } CURVNET_RESTORE(); } static int pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) { struct ip_moptions *imo = &sc->sc_imo; int error; if (!(ifp->if_flags & IFF_MULTICAST)) return (EADDRNOTAVAIL); imo->imo_membership = (struct in_multi **)mship; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_multicast_vif = -1; if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL, &imo->imo_membership[0])) != 0) { imo->imo_membership = NULL; return (error); } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = PFSYNC_DFLTTL; imo->imo_multicast_loop = 0; return (0); } static void pfsync_multicast_cleanup(struct pfsync_softc *sc) { struct ip_moptions *imo = &sc->sc_imo; in_leavegroup(imo->imo_membership[0], NULL); free(imo->imo_membership, M_PFSYNC); imo->imo_membership = NULL; imo->imo_multicast_ifp = NULL; } #ifdef INET extern struct domain inetdomain; static struct protosw in_pfsync_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = pfsync_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif static void pfsync_pointers_init() { PF_RULES_WLOCK(); pfsync_state_import_ptr = pfsync_state_import; pfsync_insert_state_ptr = pfsync_insert_state; pfsync_update_state_ptr = pfsync_update_state; pfsync_delete_state_ptr = pfsync_delete_state; pfsync_clear_states_ptr = pfsync_clear_states; pfsync_defer_ptr = pfsync_defer; PF_RULES_WUNLOCK(); } static void pfsync_pointers_uninit() { PF_RULES_WLOCK(); pfsync_state_import_ptr = NULL; pfsync_insert_state_ptr = NULL; pfsync_update_state_ptr = NULL; pfsync_delete_state_ptr = NULL; pfsync_clear_states_ptr = NULL; pfsync_defer_ptr = NULL; PF_RULES_WUNLOCK(); } static int pfsync_init() { VNET_ITERATOR_DECL(vnet_iter); int error = 0; VNET_LIST_RLOCK(); 
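/* In each vnet, create the interface cloner and register the pfsync software interrupt handler. If swi_add() fails we drop through to fail_locked, which revisits the vnets and tears down whatever was already set up (the vnet list lock is still held at that point). */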
VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_pfsync_cloner = if_clone_simple(pfsyncname, pfsync_clone_create, pfsync_clone_destroy, 1); error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif, SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie); CURVNET_RESTORE(); if (error) goto fail_locked; } VNET_LIST_RUNLOCK(); #ifdef INET error = pf_proto_register(PF_INET, &in_pfsync_protosw); if (error) goto fail; error = ipproto_register(IPPROTO_PFSYNC); if (error) { pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); goto fail; } #endif pfsync_pointers_init(); return (0); fail: VNET_LIST_RLOCK(); fail_locked: VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (V_pfsync_swi_cookie) { swi_remove(V_pfsync_swi_cookie); if_clone_detach(V_pfsync_cloner); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (error); } static void pfsync_uninit() { VNET_ITERATOR_DECL(vnet_iter); pfsync_pointers_uninit(); ipproto_unregister(IPPROTO_PFSYNC); pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if_clone_detach(V_pfsync_cloner); swi_remove(V_pfsync_swi_cookie); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } static int pfsync_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: error = pfsync_init(); break; case MOD_QUIESCE: /* * Module should not be unloaded due to race conditions. */ error = EBUSY; break; case MOD_UNLOAD: pfsync_uninit(); break; default: error = EINVAL; break; } return (error); } static moduledata_t pfsync_mod = { pfsyncname, pfsync_modevent, 0 }; #define PFSYNC_MODVER 1 +/* XXX-BZ recheck the r229853 comment once the shuffling is done. */ DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_VERSION(pfsync, PFSYNC_MODVER); MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER); Index: projects/vnet/sys/netpfil/pf/pf_ioctl.c =================================================================== --- projects/vnet/sys/netpfil/pf/pf_ioctl.c (revision 294565) +++ projects/vnet/sys/netpfil/pf/pf_ioctl.c (revision 294566) @@ -1,3793 +1,3793 @@ /*- * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2002,2003 Henning Brauer * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. * * $OpenBSD: pf_ioctl.c,v 1.213 2009/02/15 21:46:12 mbalmer Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_bpf.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #ifdef ALTQ #include #endif static int pfattach(void); static struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t, u_int8_t, u_int8_t, u_int8_t); static void pf_mv_pool(struct pf_palist *, struct pf_palist *); static void pf_empty_pool(struct pf_palist *); static int pfioctl(struct cdev *, u_long, caddr_t, int, struct thread *); #ifdef ALTQ static int pf_begin_altq(u_int32_t *); static int pf_rollback_altq(u_int32_t); static int pf_commit_altq(u_int32_t); static int pf_enable_altq(struct pf_altq *); static int pf_disable_altq(struct pf_altq *); static u_int32_t pf_qname2qid(char *); static void pf_qid_unref(u_int32_t); #endif /* ALTQ */ static int pf_begin_rules(u_int32_t *, int, const char *); static int pf_rollback_rules(u_int32_t, int, char *); static int pf_setup_pfsync_matching(struct pf_ruleset *); static void pf_hash_rule(MD5_CTX *, struct pf_rule *); static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); static int pf_commit_rules(u_int32_t, int, char *); static int pf_addr_setup(struct pf_ruleset *, struct pf_addr_wrap *, sa_family_t); static void pf_addr_copyout(struct pf_addr_wrap *); VNET_DEFINE(struct pf_rule, pf_default_rule); #ifdef ALTQ static VNET_DEFINE(int, pf_altq_running); #define V_pf_altq_running VNET(pf_altq_running) #endif #define TAGID_MAX 50000 struct pf_tagname { TAILQ_ENTRY(pf_tagname) entries; char name[PF_TAG_NAME_SIZE]; uint16_t tag; int ref; }; TAILQ_HEAD(pf_tags, pf_tagname); #define V_pf_tags VNET(pf_tags) VNET_DEFINE(struct pf_tags, pf_tags); #define V_pf_qids VNET(pf_qids) VNET_DEFINE(struct pf_tags, pf_qids); static MALLOC_DEFINE(M_PFTAG, "pf_tag", "pf(4) tag names"); static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db"); static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules"); #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE) #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE #endif static u_int16_t tagname2tag(struct pf_tags *, char *); static u_int16_t pf_tagname2tag(char *); static void tag_unref(struct pf_tags *, u_int16_t); #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x struct cdev *pf_dev; /* * XXX - These are new and need to be checked when moving to a new version */ static void pf_clear_states(void); static int pf_clear_tables(void); static void pf_clear_srcnodes(struct pf_src_node *); static void pf_kill_srcnodes(struct pfioc_src_node_kill *); static void pf_tbladdr_copyout(struct pf_addr_wrap *); /* * Wrapper functions for pfil(9) hooks */ #ifdef INET static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp); static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp); #endif #ifdef INET6 static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp); static int
pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp); #endif static int hook_pf(void); static int dehook_pf(void); static int shutdown_pf(void); static int pf_load(void); static int pf_unload(void); static struct cdevsw pf_cdevsw = { .d_ioctl = pfioctl, .d_name = PF_NAME, .d_version = D_VERSION, }; static volatile VNET_DEFINE(int, pf_pfil_hooked); #define V_pf_pfil_hooked VNET(pf_pfil_hooked) VNET_DEFINE(int, pf_end_threads); struct rwlock pf_rules_lock; struct sx pf_ioctl_lock; /* pfsync */ pfsync_state_import_t *pfsync_state_import_ptr = NULL; pfsync_insert_state_t *pfsync_insert_state_ptr = NULL; pfsync_update_state_t *pfsync_update_state_ptr = NULL; pfsync_delete_state_t *pfsync_delete_state_ptr = NULL; pfsync_clear_states_t *pfsync_clear_states_ptr = NULL; pfsync_defer_t *pfsync_defer_ptr = NULL; /* pflog */ pflog_packet_t *pflog_packet_ptr = NULL; static int pfattach(void) { u_int32_t *my_timeout = V_pf_default_rule.timeout; int error; if (IS_DEFAULT_VNET(curvnet)) pf_mtag_initialize(); pf_initialize(); pfr_initialize(); pfi_initialize(); pf_normalize_init(); V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT; V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT; RB_INIT(&V_pf_anchors); pf_init_ruleset(&pf_main_ruleset); /* default rule should never be garbage collected */ V_pf_default_rule.entries.tqe_prev = &V_pf_default_rule.entries.tqe_next; #ifdef PF_DEFAULT_TO_DROP V_pf_default_rule.action = PF_DROP; #else V_pf_default_rule.action = PF_PASS; #endif V_pf_default_rule.nr = -1; V_pf_default_rule.rtableid = -1; V_pf_default_rule.states_cur = counter_u64_alloc(M_WAITOK); V_pf_default_rule.states_tot = counter_u64_alloc(M_WAITOK); V_pf_default_rule.src_nodes = counter_u64_alloc(M_WAITOK); /* initialize default timeouts */ my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL; my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL; my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL; my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL; my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL; my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL; my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL; my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL; my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL; my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL; my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL; my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL; my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL; my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL; my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL; my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL; my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL; my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL; my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START; my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END; bzero(&V_pf_status, sizeof(V_pf_status)); V_pf_status.debug = PF_DEBUG_URGENT; V_pf_pfil_hooked = 0; /* XXX do our best to avoid a conflict */ V_pf_status.hostid = arc4random(); for (int i = 0; i < PFRES_MAX; i++) V_pf_status.counters[i] = counter_u64_alloc(M_WAITOK); for (int i = 0; i < LCNT_MAX; i++) V_pf_status.lcounters[i] = counter_u64_alloc(M_WAITOK); for (int i = 0; i < FCNT_MAX; i++) V_pf_status.fcounters[i] = counter_u64_alloc(M_WAITOK); for (int i = 0; i < SCNT_MAX; i++) V_pf_status.scounters[i] = counter_u64_alloc(M_WAITOK); if ((error = kproc_create(pf_purge_thread, curvnet, NULL, 0, 0, "pf purge")) != 0) /* XXXGL: leaked all above. 
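* (the counters, zones and default-rule state allocated earlier in pfattach() are not released on this error path)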
*/ return (error); if ((error = swi_add(NULL, "pf send", pf_intr, curvnet, SWI_NET, INTR_MPSAFE, &V_pf_swi_cookie)) != 0) /* XXXGL: leaked all above. */ return (error); return (0); } static struct pf_pool * pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action, u_int32_t rule_number, u_int8_t r_last, u_int8_t active, u_int8_t check_ticket) { struct pf_ruleset *ruleset; struct pf_rule *rule; int rs_num; ruleset = pf_find_ruleset(anchor); if (ruleset == NULL) return (NULL); rs_num = pf_get_ruleset_number(rule_action); if (rs_num >= PF_RULESET_MAX) return (NULL); if (active) { if (check_ticket && ticket != ruleset->rules[rs_num].active.ticket) return (NULL); if (r_last) rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, pf_rulequeue); else rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); } else { if (check_ticket && ticket != ruleset->rules[rs_num].inactive.ticket) return (NULL); if (r_last) rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, pf_rulequeue); else rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr); } if (!r_last) { while ((rule != NULL) && (rule->nr != rule_number)) rule = TAILQ_NEXT(rule, entries); } if (rule == NULL) return (NULL); return (&rule->rpool); } static void pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb) { struct pf_pooladdr *mv_pool_pa; while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) { TAILQ_REMOVE(poola, mv_pool_pa, entries); TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries); } } static void pf_empty_pool(struct pf_palist *poola) { struct pf_pooladdr *pa; while ((pa = TAILQ_FIRST(poola)) != NULL) { switch (pa->addr.type) { case PF_ADDR_DYNIFTL: pfi_dynaddr_remove(pa->addr.p.dyn); break; case PF_ADDR_TABLE: /* XXX: this could be unfinished pooladdr on pabuf */ if (pa->addr.p.tbl != NULL) pfr_detach_table(pa->addr.p.tbl); break; } if (pa->kif) pfi_kif_unref(pa->kif); TAILQ_REMOVE(poola, pa, entries); free(pa, M_PFRULE); } } static void pf_unlink_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule) { PF_RULES_WASSERT(); TAILQ_REMOVE(rulequeue, rule, entries); PF_UNLNKDRULES_LOCK(); rule->rule_flag |= PFRULE_REFS; TAILQ_INSERT_TAIL(&V_pf_unlinked_rules, rule, entries); PF_UNLNKDRULES_UNLOCK(); } void pf_free_rule(struct pf_rule *rule) { PF_RULES_WASSERT(); if (rule->tag) tag_unref(&V_pf_tags, rule->tag); if (rule->match_tag) tag_unref(&V_pf_tags, rule->match_tag); #ifdef ALTQ if (rule->pqid != rule->qid) pf_qid_unref(rule->pqid); pf_qid_unref(rule->qid); #endif switch (rule->src.addr.type) { case PF_ADDR_DYNIFTL: pfi_dynaddr_remove(rule->src.addr.p.dyn); break; case PF_ADDR_TABLE: pfr_detach_table(rule->src.addr.p.tbl); break; } switch (rule->dst.addr.type) { case PF_ADDR_DYNIFTL: pfi_dynaddr_remove(rule->dst.addr.p.dyn); break; case PF_ADDR_TABLE: pfr_detach_table(rule->dst.addr.p.tbl); break; } if (rule->overload_tbl) pfr_detach_table(rule->overload_tbl); if (rule->kif) pfi_kif_unref(rule->kif); pf_anchor_remove(rule); pf_empty_pool(&rule->rpool.list); counter_u64_free(rule->states_cur); counter_u64_free(rule->states_tot); counter_u64_free(rule->src_nodes); free(rule, M_PFRULE); } static u_int16_t tagname2tag(struct pf_tags *head, char *tagname) { struct pf_tagname *tag, *p = NULL; u_int16_t new_tagid = 1; PF_RULES_WASSERT(); TAILQ_FOREACH(tag, head, entries) if (strcmp(tagname, tag->name) == 0) { tag->ref++; return (tag->tag); } /* * to avoid fragmentation, we do a linear search from the beginning * and take the first free slot we find. if there is none or the list * is empty, append a new entry at the end. 
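* For example, with tags 1, 2 and 4 allocated, the scan advances new_tagid to 3, stops at the entry holding tag 4 and inserts the new entry before it, keeping the list sorted.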
*/ /* new entry */ if (!TAILQ_EMPTY(head)) for (p = TAILQ_FIRST(head); p != NULL && p->tag == new_tagid; p = TAILQ_NEXT(p, entries)) new_tagid = p->tag + 1; if (new_tagid > TAGID_MAX) return (0); /* allocate and fill new struct pf_tagname */ tag = malloc(sizeof(*tag), M_PFTAG, M_NOWAIT|M_ZERO); if (tag == NULL) return (0); strlcpy(tag->name, tagname, sizeof(tag->name)); tag->tag = new_tagid; tag->ref++; if (p != NULL) /* insert new entry before p */ TAILQ_INSERT_BEFORE(p, tag, entries); else /* either list empty or no free slot in between */ TAILQ_INSERT_TAIL(head, tag, entries); return (tag->tag); } static void tag_unref(struct pf_tags *head, u_int16_t tag) { struct pf_tagname *p, *next; PF_RULES_WASSERT(); for (p = TAILQ_FIRST(head); p != NULL; p = next) { next = TAILQ_NEXT(p, entries); if (tag == p->tag) { if (--p->ref == 0) { TAILQ_REMOVE(head, p, entries); free(p, M_PFTAG); } break; } } } static u_int16_t pf_tagname2tag(char *tagname) { return (tagname2tag(&V_pf_tags, tagname)); } #ifdef ALTQ static u_int32_t pf_qname2qid(char *qname) { return ((u_int32_t)tagname2tag(&V_pf_qids, qname)); } static void pf_qid_unref(u_int32_t qid) { tag_unref(&V_pf_qids, (u_int16_t)qid); } static int pf_begin_altq(u_int32_t *ticket) { struct pf_altq *altq; int error = 0; PF_RULES_WASSERT(); /* Purge the old altq list */ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); if (altq->qname[0] == 0 && (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* detach and destroy the discipline */ error = altq_remove(altq); } else pf_qid_unref(altq->qid); free(altq, M_PFALTQ); } if (error) return (error); *ticket = ++V_ticket_altqs_inactive; V_altqs_inactive_open = 1; return (0); } static int pf_rollback_altq(u_int32_t ticket) { struct pf_altq *altq; int error = 0; PF_RULES_WASSERT(); if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) return (0); /* Purge the old altq list */ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); if (altq->qname[0] == 0 && (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* detach and destroy the discipline */ error = altq_remove(altq); } else pf_qid_unref(altq->qid); free(altq, M_PFALTQ); } V_altqs_inactive_open = 0; return (error); } static int pf_commit_altq(u_int32_t ticket) { struct pf_altqqueue *old_altqs; struct pf_altq *altq; int err, error = 0; PF_RULES_WASSERT(); if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) return (EBUSY); /* swap altqs, keep the old. 
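* (after the swap the inactive list holds the previously active queues; it is purged below once the new disciplines are attached)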
*/ old_altqs = V_pf_altqs_active; V_pf_altqs_active = V_pf_altqs_inactive; V_pf_altqs_inactive = old_altqs; V_ticket_altqs_active = V_ticket_altqs_inactive; /* Attach new disciplines */ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { if (altq->qname[0] == 0 && (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* attach the discipline */ error = altq_pfattach(altq); if (error == 0 && V_pf_altq_running) error = pf_enable_altq(altq); if (error != 0) return (error); } } /* Purge the old altq list */ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); if (altq->qname[0] == 0 && (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* detach and destroy the discipline */ if (V_pf_altq_running) error = pf_disable_altq(altq); err = altq_pfdetach(altq); if (err != 0 && error == 0) error = err; err = altq_remove(altq); if (err != 0 && error == 0) error = err; } else pf_qid_unref(altq->qid); free(altq, M_PFALTQ); } V_altqs_inactive_open = 0; return (error); } static int pf_enable_altq(struct pf_altq *altq) { struct ifnet *ifp; struct tb_profile tb; int error = 0; if ((ifp = ifunit(altq->ifname)) == NULL) return (EINVAL); if (ifp->if_snd.altq_type != ALTQT_NONE) error = altq_enable(&ifp->if_snd); /* set tokenbucket regulator */ if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { tb.rate = altq->ifbandwidth; tb.depth = altq->tbrsize; error = tbr_set(&ifp->if_snd, &tb); } return (error); } static int pf_disable_altq(struct pf_altq *altq) { struct ifnet *ifp; struct tb_profile tb; int error; if ((ifp = ifunit(altq->ifname)) == NULL) return (EINVAL); /* * If the discipline is no longer referenced, it has been * overridden by a new one; in that case just return. */ if (altq->altq_disc != ifp->if_snd.altq_disc) return (0); error = altq_disable(&ifp->if_snd); if (error == 0) { /* clear tokenbucket regulator */ tb.rate = 0; error = tbr_set(&ifp->if_snd, &tb); } return (error); } void pf_altq_ifnet_event(struct ifnet *ifp, int remove) { struct ifnet *ifp1; struct pf_altq *a1, *a2, *a3; u_int32_t ticket; int error = 0; /* Interrupt userland queue modifications */ if (V_altqs_inactive_open) pf_rollback_altq(V_ticket_altqs_inactive); /* Start new altq ruleset */ if (pf_begin_altq(&ticket)) return; /* Copy the current active set */ TAILQ_FOREACH(a1, V_pf_altqs_active, entries) { a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT); if (a2 == NULL) { error = ENOMEM; break; } bcopy(a1, a2, sizeof(struct pf_altq)); if (a2->qname[0] != 0) { if ((a2->qid = pf_qname2qid(a2->qname)) == 0) { error = EBUSY; free(a2, M_PFALTQ); break; } a2->altq_disc = NULL; TAILQ_FOREACH(a3, V_pf_altqs_inactive, entries) { if (strncmp(a3->ifname, a2->ifname, IFNAMSIZ) == 0 && a3->qname[0] == 0) { a2->altq_disc = a3->altq_disc; break; } } } /* Deactivate the interface in question */ a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED; if ((ifp1 = ifunit(a2->ifname)) == NULL || (remove && ifp1 == ifp)) { a2->local_flags |= PFALTQ_FLAG_IF_REMOVED; } else { error = altq_add(a2); if (ticket != V_ticket_altqs_inactive) error = EBUSY; if (error) { free(a2, M_PFALTQ); break; } } TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries); } if (error != 0) pf_rollback_altq(ticket); else pf_commit_altq(ticket); } #endif /* ALTQ */ static int pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor) { struct pf_ruleset *rs; struct pf_rule *rule; PF_RULES_WASSERT(); if (rs_num < 0 || rs_num >= PF_RULESET_MAX) return (EINVAL); rs = pf_find_or_create_ruleset(anchor); if (rs == NULL) return (EINVAL);
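/* Drain whatever is still sitting on the inactive list from an earlier transaction that was never committed, then open a new transaction by handing out a fresh ticket. */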
while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule); rs->rules[rs_num].inactive.rcount--; } *ticket = ++rs->rules[rs_num].inactive.ticket; rs->rules[rs_num].inactive.open = 1; return (0); } static int pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor) { struct pf_ruleset *rs; struct pf_rule *rule; PF_RULES_WASSERT(); if (rs_num < 0 || rs_num >= PF_RULESET_MAX) return (EINVAL); rs = pf_find_ruleset(anchor); if (rs == NULL || !rs->rules[rs_num].inactive.open || rs->rules[rs_num].inactive.ticket != ticket) return (0); while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule); rs->rules[rs_num].inactive.rcount--; } rs->rules[rs_num].inactive.open = 0; return (0); } #define PF_MD5_UPD(st, elm) \ MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm)) #define PF_MD5_UPD_STR(st, elm) \ MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm)) #define PF_MD5_UPD_HTONL(st, elm, stor) do { \ (stor) = htonl((st)->elm); \ MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\ } while (0) #define PF_MD5_UPD_HTONS(st, elm, stor) do { \ (stor) = htons((st)->elm); \ MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\ } while (0) static void pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) { PF_MD5_UPD(pfr, addr.type); switch (pfr->addr.type) { case PF_ADDR_DYNIFTL: PF_MD5_UPD(pfr, addr.v.ifname); PF_MD5_UPD(pfr, addr.iflags); break; case PF_ADDR_TABLE: PF_MD5_UPD(pfr, addr.v.tblname); break; case PF_ADDR_ADDRMASK: /* XXX ignore af? */ PF_MD5_UPD(pfr, addr.v.a.addr.addr32); PF_MD5_UPD(pfr, addr.v.a.mask.addr32); break; } PF_MD5_UPD(pfr, port[0]); PF_MD5_UPD(pfr, port[1]); PF_MD5_UPD(pfr, neg); PF_MD5_UPD(pfr, port_op); } static void pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule) { u_int16_t x; u_int32_t y; pf_hash_rule_addr(ctx, &rule->src); pf_hash_rule_addr(ctx, &rule->dst); PF_MD5_UPD_STR(rule, label); PF_MD5_UPD_STR(rule, ifname); PF_MD5_UPD_STR(rule, match_tagname); PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */ PF_MD5_UPD_HTONL(rule, os_fingerprint, y); PF_MD5_UPD_HTONL(rule, prob, y); PF_MD5_UPD_HTONL(rule, uid.uid[0], y); PF_MD5_UPD_HTONL(rule, uid.uid[1], y); PF_MD5_UPD(rule, uid.op); PF_MD5_UPD_HTONL(rule, gid.gid[0], y); PF_MD5_UPD_HTONL(rule, gid.gid[1], y); PF_MD5_UPD(rule, gid.op); PF_MD5_UPD_HTONL(rule, rule_flag, y); PF_MD5_UPD(rule, action); PF_MD5_UPD(rule, direction); PF_MD5_UPD(rule, af); PF_MD5_UPD(rule, quick); PF_MD5_UPD(rule, ifnot); PF_MD5_UPD(rule, match_tag_not); PF_MD5_UPD(rule, natpass); PF_MD5_UPD(rule, keep_state); PF_MD5_UPD(rule, proto); PF_MD5_UPD(rule, type); PF_MD5_UPD(rule, code); PF_MD5_UPD(rule, flags); PF_MD5_UPD(rule, flagset); PF_MD5_UPD(rule, allow_opts); PF_MD5_UPD(rule, rt); PF_MD5_UPD(rule, tos); } static int pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) { struct pf_ruleset *rs; struct pf_rule *rule, **old_array; struct pf_rulequeue *old_rules; int error; u_int32_t old_rcount; PF_RULES_WASSERT(); if (rs_num < 0 || rs_num >= PF_RULESET_MAX) return (EINVAL); rs = pf_find_ruleset(anchor); if (rs == NULL || !rs->rules[rs_num].inactive.open || ticket != rs->rules[rs_num].inactive.ticket) return (EBUSY); /* Calculate checksum for the main ruleset */ if (rs == &pf_main_ruleset) { error = pf_setup_pfsync_matching(rs); if (error != 0) return (error); } /* Swap rules, keep the old. 
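* The active and inactive rule queues form a double buffer: the pointer swap below publishes the staged list while the rules lock is held exclusively, and the previous active list is purged afterwards.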
*/ old_rules = rs->rules[rs_num].active.ptr; old_rcount = rs->rules[rs_num].active.rcount; old_array = rs->rules[rs_num].active.ptr_array; rs->rules[rs_num].active.ptr = rs->rules[rs_num].inactive.ptr; rs->rules[rs_num].active.ptr_array = rs->rules[rs_num].inactive.ptr_array; rs->rules[rs_num].active.rcount = rs->rules[rs_num].inactive.rcount; rs->rules[rs_num].inactive.ptr = old_rules; rs->rules[rs_num].inactive.ptr_array = old_array; rs->rules[rs_num].inactive.rcount = old_rcount; rs->rules[rs_num].active.ticket = rs->rules[rs_num].inactive.ticket; pf_calc_skip_steps(rs->rules[rs_num].active.ptr); /* Purge the old rule list. */ while ((rule = TAILQ_FIRST(old_rules)) != NULL) pf_unlink_rule(old_rules, rule); if (rs->rules[rs_num].inactive.ptr_array) free(rs->rules[rs_num].inactive.ptr_array, M_TEMP); rs->rules[rs_num].inactive.ptr_array = NULL; rs->rules[rs_num].inactive.rcount = 0; rs->rules[rs_num].inactive.open = 0; pf_remove_if_empty_ruleset(rs); return (0); } static int pf_setup_pfsync_matching(struct pf_ruleset *rs) { MD5_CTX ctx; struct pf_rule *rule; int rs_cnt; u_int8_t digest[PF_MD5_DIGEST_LENGTH]; MD5Init(&ctx); for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) { /* XXX PF_RULESET_SCRUB as well? */ if (rs_cnt == PF_RULESET_SCRUB) continue; if (rs->rules[rs_cnt].inactive.ptr_array) free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP); rs->rules[rs_cnt].inactive.ptr_array = NULL; if (rs->rules[rs_cnt].inactive.rcount) { rs->rules[rs_cnt].inactive.ptr_array = malloc(sizeof(caddr_t) * rs->rules[rs_cnt].inactive.rcount, M_TEMP, M_NOWAIT); if (!rs->rules[rs_cnt].inactive.ptr_array) return (ENOMEM); } TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, entries) { pf_hash_rule(&ctx, rule); (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule; } } MD5Final(digest, &ctx); memcpy(V_pf_status.pf_chksum, digest, sizeof(V_pf_status.pf_chksum)); return (0); } static int pf_addr_setup(struct pf_ruleset *ruleset, struct pf_addr_wrap *addr, sa_family_t af) { int error = 0; switch (addr->type) { case PF_ADDR_TABLE: addr->p.tbl = pfr_attach_table(ruleset, addr->v.tblname); if (addr->p.tbl == NULL) error = ENOMEM; break; case PF_ADDR_DYNIFTL: error = pfi_dynaddr_setup(addr, af); break; } return (error); } static void pf_addr_copyout(struct pf_addr_wrap *addr) { switch (addr->type) { case PF_ADDR_DYNIFTL: pfi_dynaddr_copyout(addr); break; case PF_ADDR_TABLE: pf_tbladdr_copyout(addr); break; } } static int pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { int error = 0; /* XXX keep in sync with switch() below */ if (securelevel_gt(td->td_ucred, 2)) switch (cmd) { case DIOCGETRULES: case DIOCGETRULE: case DIOCGETADDRS: case DIOCGETADDR: case DIOCGETSTATE: case DIOCSETSTATUSIF: case DIOCGETSTATUS: case DIOCCLRSTATUS: case DIOCNATLOOK: case DIOCSETDEBUG: case DIOCGETSTATES: case DIOCGETTIMEOUT: case DIOCCLRRULECTRS: case DIOCGETLIMIT: case DIOCGETALTQS: case DIOCGETALTQ: case DIOCGETQSTATS: case DIOCGETRULESETS: case DIOCGETRULESET: case DIOCRGETTABLES: case DIOCRGETTSTATS: case DIOCRCLRTSTATS: case DIOCRCLRADDRS: case DIOCRADDADDRS: case DIOCRDELADDRS: case DIOCRSETADDRS: case DIOCRGETADDRS: case DIOCRGETASTATS: case DIOCRCLRASTATS: case DIOCRTSTADDRS: case DIOCOSFPGET: case DIOCGETSRCNODES: case DIOCCLRSRCNODES: case DIOCIGETIFACES: case DIOCGIFSPEED: case DIOCSETIFFLAG: case DIOCCLRIFFLAG: break; case DIOCRCLRTABLES: case DIOCRADDTABLES: case DIOCRDELTABLES: case DIOCRSETTFLAGS: if (((struct pfioc_table *)addr)->pfrio_flags & PFR_FLAG_DUMMY) break; /* dummy 
operation ok */ return (EPERM); default: return (EPERM); } if (!(flags & FWRITE)) switch (cmd) { case DIOCGETRULES: case DIOCGETADDRS: case DIOCGETADDR: case DIOCGETSTATE: case DIOCGETSTATUS: case DIOCGETSTATES: case DIOCGETTIMEOUT: case DIOCGETLIMIT: case DIOCGETALTQS: case DIOCGETALTQ: case DIOCGETQSTATS: case DIOCGETRULESETS: case DIOCGETRULESET: case DIOCNATLOOK: case DIOCRGETTABLES: case DIOCRGETTSTATS: case DIOCRGETADDRS: case DIOCRGETASTATS: case DIOCRTSTADDRS: case DIOCOSFPGET: case DIOCGETSRCNODES: case DIOCIGETIFACES: case DIOCGIFSPEED: break; case DIOCRCLRTABLES: case DIOCRADDTABLES: case DIOCRDELTABLES: case DIOCRCLRTSTATS: case DIOCRCLRADDRS: case DIOCRADDADDRS: case DIOCRDELADDRS: case DIOCRSETADDRS: case DIOCRSETTFLAGS: if (((struct pfioc_table *)addr)->pfrio_flags & PFR_FLAG_DUMMY) { flags |= FWRITE; /* need write lock for dummy */ break; /* dummy operation ok */ } return (EACCES); case DIOCGETRULE: if (((struct pfioc_rule *)addr)->action == PF_GET_CLR_CNTR) return (EACCES); break; default: return (EACCES); } CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { case DIOCSTART: sx_xlock(&pf_ioctl_lock); if (V_pf_status.running) error = EEXIST; else { int cpu; error = hook_pf(); if (error) { DPFPRINTF(PF_DEBUG_MISC, ("pf: pfil registration failed\n")); break; } V_pf_status.running = 1; V_pf_status.since = time_second; CPU_FOREACH(cpu) V_pf_stateid[cpu] = time_second; DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); } break; case DIOCSTOP: sx_xlock(&pf_ioctl_lock); if (!V_pf_status.running) error = ENOENT; else { V_pf_status.running = 0; error = dehook_pf(); if (error) { V_pf_status.running = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: pfil unregistration failed\n")); } V_pf_status.since = time_second; DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); } break; case DIOCADDRULE: { struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *rule, *tail; struct pf_pooladdr *pa; struct pfi_kif *kif = NULL; int rs_num; if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { error = EINVAL; break; } #ifndef INET if (pr->rule.af == AF_INET) { error = EAFNOSUPPORT; break; } #endif /* INET */ #ifndef INET6 if (pr->rule.af == AF_INET6) { error = EAFNOSUPPORT; break; } #endif /* INET6 */ rule = malloc(sizeof(*rule), M_PFRULE, M_WAITOK); bcopy(&pr->rule, rule, sizeof(struct pf_rule)); if (rule->ifname[0]) kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); rule->states_cur = counter_u64_alloc(M_WAITOK); rule->states_tot = counter_u64_alloc(M_WAITOK); rule->src_nodes = counter_u64_alloc(M_WAITOK); rule->cuid = td->td_ucred->cr_ruid; rule->cpid = td->td_proc ? 
td->td_proc->p_pid : 0; TAILQ_INIT(&rule->rpool.list); #define ERROUT(x) { error = (x); goto DIOCADDRULE_error; } PF_RULES_WLOCK(); pr->anchor[sizeof(pr->anchor) - 1] = 0; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) ERROUT(EINVAL); rs_num = pf_get_ruleset_number(pr->rule.action); if (rs_num >= PF_RULESET_MAX) ERROUT(EINVAL); if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) { DPFPRINTF(PF_DEBUG_MISC, ("ticket: %d != [%d]%d\n", pr->ticket, rs_num, ruleset->rules[rs_num].inactive.ticket)); ERROUT(EBUSY); } if (pr->pool_ticket != V_ticket_pabuf) { DPFPRINTF(PF_DEBUG_MISC, ("pool_ticket: %d != %d\n", pr->pool_ticket, V_ticket_pabuf)); ERROUT(EBUSY); } tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, pf_rulequeue); if (tail) rule->nr = tail->nr + 1; else rule->nr = 0; if (rule->ifname[0]) { rule->kif = pfi_kif_attach(kif, rule->ifname); pfi_kif_ref(rule->kif); } else rule->kif = NULL; if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs) error = EBUSY; #ifdef ALTQ /* set queue IDs */ if (rule->qname[0] != 0) { if ((rule->qid = pf_qname2qid(rule->qname)) == 0) error = EBUSY; else if (rule->pqname[0] != 0) { if ((rule->pqid = pf_qname2qid(rule->pqname)) == 0) error = EBUSY; } else rule->pqid = rule->qid; } #endif if (rule->tagname[0]) if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) error = EBUSY; if (rule->match_tagname[0]) if ((rule->match_tag = pf_tagname2tag(rule->match_tagname)) == 0) error = EBUSY; if (rule->rt && !rule->direction) error = EINVAL; if (!rule->log) rule->logif = 0; if (rule->logif >= PFLOGIFS_MAX) error = EINVAL; if (pf_addr_setup(ruleset, &rule->src.addr, rule->af)) error = ENOMEM; if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af)) error = ENOMEM; if (pf_anchor_setup(rule, ruleset, pr->anchor_call)) error = EINVAL; TAILQ_FOREACH(pa, &V_pf_pabuf, entries) if (pa->addr.type == PF_ADDR_TABLE) { pa->addr.p.tbl = pfr_attach_table(ruleset, pa->addr.v.tblname); if (pa->addr.p.tbl == NULL) error = ENOMEM; } if (rule->overload_tblname[0]) { if ((rule->overload_tbl = pfr_attach_table(ruleset, rule->overload_tblname)) == NULL) error = EINVAL; else rule->overload_tbl->pfrkt_flags |= PFR_TFLAG_ACTIVE; } pf_mv_pool(&V_pf_pabuf, &rule->rpool.list); if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) || (rule->action == PF_BINAT)) && rule->anchor == NULL) || (rule->rt > PF_FASTROUTE)) && (TAILQ_FIRST(&rule->rpool.list) == NULL)) error = EINVAL; if (error) { pf_free_rule(rule); PF_RULES_WUNLOCK(); break; } rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list); rule->evaluations = rule->packets[0] = rule->packets[1] = rule->bytes[0] = rule->bytes[1] = 0; TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr, rule, entries); ruleset->rules[rs_num].inactive.rcount++; PF_RULES_WUNLOCK(); break; #undef ERROUT DIOCADDRULE_error: PF_RULES_WUNLOCK(); counter_u64_free(rule->states_cur); counter_u64_free(rule->states_tot); counter_u64_free(rule->src_nodes); free(rule, M_PFRULE); if (kif) free(kif, PFI_MTYPE); break; } case DIOCGETRULES: { struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *tail; int rs_num; PF_RULES_WLOCK(); pr->anchor[sizeof(pr->anchor) - 1] = 0; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { PF_RULES_WUNLOCK(); error = EINVAL; break; } rs_num = pf_get_ruleset_number(pr->rule.action); if (rs_num >= PF_RULESET_MAX) { PF_RULES_WUNLOCK(); error = EINVAL; break; } tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, pf_rulequeue); if (tail) pr->nr = tail->nr + 1; else pr->nr = 0; 
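/* Also hand out the ticket of the active ruleset; a subsequent DIOCGETRULE carrying a stale ticket is rejected with EBUSY. */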
pr->ticket = ruleset->rules[rs_num].active.ticket; PF_RULES_WUNLOCK(); break; } case DIOCGETRULE: { struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *rule; int rs_num, i; PF_RULES_WLOCK(); pr->anchor[sizeof(pr->anchor) - 1] = 0; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { PF_RULES_WUNLOCK(); error = EINVAL; break; } rs_num = pf_get_ruleset_number(pr->rule.action); if (rs_num >= PF_RULESET_MAX) { PF_RULES_WUNLOCK(); error = EINVAL; break; } if (pr->ticket != ruleset->rules[rs_num].active.ticket) { PF_RULES_WUNLOCK(); error = EBUSY; break; } rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); while ((rule != NULL) && (rule->nr != pr->nr)) rule = TAILQ_NEXT(rule, entries); if (rule == NULL) { PF_RULES_WUNLOCK(); error = EBUSY; break; } bcopy(rule, &pr->rule, sizeof(struct pf_rule)); pr->rule.u_states_cur = counter_u64_fetch(rule->states_cur); pr->rule.u_states_tot = counter_u64_fetch(rule->states_tot); pr->rule.u_src_nodes = counter_u64_fetch(rule->src_nodes); if (pf_anchor_copyout(ruleset, rule, pr)) { PF_RULES_WUNLOCK(); error = EBUSY; break; } pf_addr_copyout(&pr->rule.src.addr); pf_addr_copyout(&pr->rule.dst.addr); for (i = 0; i < PF_SKIP_COUNT; ++i) if (rule->skip[i].ptr == NULL) pr->rule.skip[i].nr = -1; else pr->rule.skip[i].nr = rule->skip[i].ptr->nr; if (pr->action == PF_GET_CLR_CNTR) { rule->evaluations = 0; rule->packets[0] = rule->packets[1] = 0; rule->bytes[0] = rule->bytes[1] = 0; counter_u64_zero(rule->states_tot); } PF_RULES_WUNLOCK(); break; } case DIOCCHANGERULE: { struct pfioc_rule *pcr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *oldrule = NULL, *newrule = NULL; struct pfi_kif *kif = NULL; struct pf_pooladdr *pa; u_int32_t nr = 0; int rs_num; if (pcr->action < PF_CHANGE_ADD_HEAD || pcr->action > PF_CHANGE_GET_TICKET) { error = EINVAL; break; } if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { error = EINVAL; break; } if (pcr->action != PF_CHANGE_REMOVE) { #ifndef INET if (pcr->rule.af == AF_INET) { error = EAFNOSUPPORT; break; } #endif /* INET */ #ifndef INET6 if (pcr->rule.af == AF_INET6) { error = EAFNOSUPPORT; break; } #endif /* INET6 */ newrule = malloc(sizeof(*newrule), M_PFRULE, M_WAITOK); bcopy(&pcr->rule, newrule, sizeof(struct pf_rule)); if (newrule->ifname[0]) kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); newrule->states_cur = counter_u64_alloc(M_WAITOK); newrule->states_tot = counter_u64_alloc(M_WAITOK); newrule->src_nodes = counter_u64_alloc(M_WAITOK); newrule->cuid = td->td_ucred->cr_ruid; newrule->cpid = td->td_proc ? 
td->td_proc->p_pid : 0; TAILQ_INIT(&newrule->rpool.list); } #define ERROUT(x) { error = (x); goto DIOCCHANGERULE_error; } PF_RULES_WLOCK(); if (!(pcr->action == PF_CHANGE_REMOVE || pcr->action == PF_CHANGE_GET_TICKET) && pcr->pool_ticket != V_ticket_pabuf) ERROUT(EBUSY); ruleset = pf_find_ruleset(pcr->anchor); if (ruleset == NULL) ERROUT(EINVAL); rs_num = pf_get_ruleset_number(pcr->rule.action); if (rs_num >= PF_RULESET_MAX) ERROUT(EINVAL); if (pcr->action == PF_CHANGE_GET_TICKET) { pcr->ticket = ++ruleset->rules[rs_num].active.ticket; ERROUT(0); } else if (pcr->ticket != ruleset->rules[rs_num].active.ticket) ERROUT(EINVAL); if (pcr->action != PF_CHANGE_REMOVE) { if (newrule->ifname[0]) { newrule->kif = pfi_kif_attach(kif, newrule->ifname); pfi_kif_ref(newrule->kif); } else newrule->kif = NULL; if (newrule->rtableid > 0 && newrule->rtableid >= rt_numfibs) error = EBUSY; #ifdef ALTQ /* set queue IDs */ if (newrule->qname[0] != 0) { if ((newrule->qid = pf_qname2qid(newrule->qname)) == 0) error = EBUSY; else if (newrule->pqname[0] != 0) { if ((newrule->pqid = pf_qname2qid(newrule->pqname)) == 0) error = EBUSY; } else newrule->pqid = newrule->qid; } #endif /* ALTQ */ if (newrule->tagname[0]) if ((newrule->tag = pf_tagname2tag(newrule->tagname)) == 0) error = EBUSY; if (newrule->match_tagname[0]) if ((newrule->match_tag = pf_tagname2tag( newrule->match_tagname)) == 0) error = EBUSY; if (newrule->rt && !newrule->direction) error = EINVAL; if (!newrule->log) newrule->logif = 0; if (newrule->logif >= PFLOGIFS_MAX) error = EINVAL; if (pf_addr_setup(ruleset, &newrule->src.addr, newrule->af)) error = ENOMEM; if (pf_addr_setup(ruleset, &newrule->dst.addr, newrule->af)) error = ENOMEM; if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call)) error = EINVAL; TAILQ_FOREACH(pa, &V_pf_pabuf, entries) if (pa->addr.type == PF_ADDR_TABLE) { pa->addr.p.tbl = pfr_attach_table(ruleset, pa->addr.v.tblname); if (pa->addr.p.tbl == NULL) error = ENOMEM; } if (newrule->overload_tblname[0]) { if ((newrule->overload_tbl = pfr_attach_table( ruleset, newrule->overload_tblname)) == NULL) error = EINVAL; else newrule->overload_tbl->pfrkt_flags |= PFR_TFLAG_ACTIVE; } pf_mv_pool(&V_pf_pabuf, &newrule->rpool.list); if (((((newrule->action == PF_NAT) || (newrule->action == PF_RDR) || (newrule->action == PF_BINAT) || (newrule->rt > PF_FASTROUTE)) && !newrule->anchor)) && (TAILQ_FIRST(&newrule->rpool.list) == NULL)) error = EINVAL; if (error) { pf_free_rule(newrule); PF_RULES_WUNLOCK(); break; } newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list); newrule->evaluations = 0; newrule->packets[0] = newrule->packets[1] = 0; newrule->bytes[0] = newrule->bytes[1] = 0; } pf_empty_pool(&V_pf_pabuf); if (pcr->action == PF_CHANGE_ADD_HEAD) oldrule = TAILQ_FIRST( ruleset->rules[rs_num].active.ptr); else if (pcr->action == PF_CHANGE_ADD_TAIL) oldrule = TAILQ_LAST( ruleset->rules[rs_num].active.ptr, pf_rulequeue); else { oldrule = TAILQ_FIRST( ruleset->rules[rs_num].active.ptr); while ((oldrule != NULL) && (oldrule->nr != pcr->nr)) oldrule = TAILQ_NEXT(oldrule, entries); if (oldrule == NULL) { if (newrule != NULL) pf_free_rule(newrule); PF_RULES_WUNLOCK(); error = EINVAL; break; } } if (pcr->action == PF_CHANGE_REMOVE) { pf_unlink_rule(ruleset->rules[rs_num].active.ptr, oldrule); ruleset->rules[rs_num].active.rcount--; } else { if (oldrule == NULL) TAILQ_INSERT_TAIL( ruleset->rules[rs_num].active.ptr, newrule, entries); else if (pcr->action == PF_CHANGE_ADD_HEAD || pcr->action == PF_CHANGE_ADD_BEFORE) TAILQ_INSERT_BEFORE(oldrule, newrule, 
entries); else TAILQ_INSERT_AFTER( ruleset->rules[rs_num].active.ptr, oldrule, newrule, entries); ruleset->rules[rs_num].active.rcount++; } nr = 0; TAILQ_FOREACH(oldrule, ruleset->rules[rs_num].active.ptr, entries) oldrule->nr = nr++; ruleset->rules[rs_num].active.ticket++; pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr); pf_remove_if_empty_ruleset(ruleset); PF_RULES_WUNLOCK(); break; #undef ERROUT DIOCCHANGERULE_error: PF_RULES_WUNLOCK(); if (newrule != NULL) { counter_u64_free(newrule->states_cur); counter_u64_free(newrule->states_tot); counter_u64_free(newrule->src_nodes); free(newrule, M_PFRULE); } if (kif != NULL) free(kif, PFI_MTYPE); break; } case DIOCCLRSTATES: { struct pf_state *s; struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; u_int i, killed = 0; for (i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; relock_DIOCCLRSTATES: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname, s->kif->pfik_name)) { /* * Don't send out individual * delete messages. */ s->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(s, PF_ENTER_LOCKED); killed++; goto relock_DIOCCLRSTATES; } PF_HASHROW_UNLOCK(ih); } psk->psk_killed = killed; if (pfsync_clear_states_ptr != NULL) pfsync_clear_states_ptr(V_pf_status.hostid, psk->psk_ifname); break; } case DIOCKILLSTATES: { struct pf_state *s; struct pf_state_key *sk; struct pf_addr *srcaddr, *dstaddr; u_int16_t srcport, dstport; struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; u_int i, killed = 0; if (psk->psk_pfcmp.id) { if (psk->psk_pfcmp.creatorid == 0) psk->psk_pfcmp.creatorid = V_pf_status.hostid; if ((s = pf_find_state_byid(psk->psk_pfcmp.id, psk->psk_pfcmp.creatorid))) { pf_unlink_state(s, PF_ENTER_LOCKED); psk->psk_killed = 1; } break; } for (i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; relock_DIOCKILLSTATES: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { sk = s->key[PF_SK_WIRE]; if (s->direction == PF_OUT) { srcaddr = &sk->addr[1]; dstaddr = &sk->addr[0]; srcport = sk->port[1]; dstport = sk->port[0]; } else { srcaddr = &sk->addr[0]; dstaddr = &sk->addr[1]; srcport = sk->port[0]; dstport = sk->port[1]; } if ((!psk->psk_af || sk->af == psk->psk_af) && (!psk->psk_proto || psk->psk_proto == sk->proto) && PF_MATCHA(psk->psk_src.neg, &psk->psk_src.addr.v.a.addr, &psk->psk_src.addr.v.a.mask, srcaddr, sk->af) && PF_MATCHA(psk->psk_dst.neg, &psk->psk_dst.addr.v.a.addr, &psk->psk_dst.addr.v.a.mask, dstaddr, sk->af) && (psk->psk_src.port_op == 0 || pf_match_port(psk->psk_src.port_op, psk->psk_src.port[0], psk->psk_src.port[1], srcport)) && (psk->psk_dst.port_op == 0 || pf_match_port(psk->psk_dst.port_op, psk->psk_dst.port[0], psk->psk_dst.port[1], dstport)) && (!psk->psk_label[0] || (s->rule.ptr->label[0] && !strcmp(psk->psk_label, s->rule.ptr->label))) && (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname, s->kif->pfik_name))) { pf_unlink_state(s, PF_ENTER_LOCKED); killed++; goto relock_DIOCKILLSTATES; } } PF_HASHROW_UNLOCK(ih); } psk->psk_killed = killed; break; } case DIOCADDSTATE: { struct pfioc_state *ps = (struct pfioc_state *)addr; struct pfsync_state *sp = &ps->state; if (sp->timeout >= PFTM_MAX) { error = EINVAL; break; } if (pfsync_state_import_ptr != NULL) { PF_RULES_RLOCK(); error = pfsync_state_import_ptr(sp, PFSYNC_SI_IOCTL); PF_RULES_RUNLOCK(); } else error = EOPNOTSUPP; break; } case DIOCGETSTATE: { struct pfioc_state *ps = (struct pfioc_state *)addr; struct pf_state *s; s = 
pf_find_state_byid(ps->state.id, ps->state.creatorid); if (s == NULL) { error = ENOENT; break; } pfsync_state_export(&ps->state, s); PF_STATE_UNLOCK(s); break; } case DIOCGETSTATES: { struct pfioc_states *ps = (struct pfioc_states *)addr; struct pf_state *s; struct pfsync_state *pstore, *p; int i, nr; if (ps->ps_len == 0) { nr = uma_zone_get_cur(V_pf_state_z); ps->ps_len = sizeof(struct pfsync_state) * nr; break; } p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK); nr = 0; for (i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (s->timeout == PFTM_UNLINKED) continue; if ((nr+1) * sizeof(*p) > ps->ps_len) { PF_HASHROW_UNLOCK(ih); goto DIOCGETSTATES_full; } pfsync_state_export(p, s); p++; nr++; } PF_HASHROW_UNLOCK(ih); } DIOCGETSTATES_full: error = copyout(pstore, ps->ps_states, sizeof(struct pfsync_state) * nr); if (error) { free(pstore, M_TEMP); break; } ps->ps_len = sizeof(struct pfsync_state) * nr; free(pstore, M_TEMP); break; } case DIOCGETSTATUS: { struct pf_status *s = (struct pf_status *)addr; PF_RULES_RLOCK(); s->running = V_pf_status.running; s->since = V_pf_status.since; s->debug = V_pf_status.debug; s->hostid = V_pf_status.hostid; s->states = V_pf_status.states; s->src_nodes = V_pf_status.src_nodes; for (int i = 0; i < PFRES_MAX; i++) s->counters[i] = counter_u64_fetch(V_pf_status.counters[i]); for (int i = 0; i < LCNT_MAX; i++) s->lcounters[i] = counter_u64_fetch(V_pf_status.lcounters[i]); for (int i = 0; i < FCNT_MAX; i++) s->fcounters[i] = counter_u64_fetch(V_pf_status.fcounters[i]); for (int i = 0; i < SCNT_MAX; i++) s->scounters[i] = counter_u64_fetch(V_pf_status.scounters[i]); bcopy(V_pf_status.ifname, s->ifname, IFNAMSIZ); bcopy(V_pf_status.pf_chksum, s->pf_chksum, PF_MD5_DIGEST_LENGTH); pfi_update_status(s->ifname, s); PF_RULES_RUNLOCK(); break; } case DIOCSETSTATUSIF: { struct pfioc_if *pi = (struct pfioc_if *)addr; if (pi->ifname[0] == 0) { bzero(V_pf_status.ifname, IFNAMSIZ); break; } PF_RULES_WLOCK(); strlcpy(V_pf_status.ifname, pi->ifname, IFNAMSIZ); PF_RULES_WUNLOCK(); break; } case DIOCCLRSTATUS: { PF_RULES_WLOCK(); for (int i = 0; i < PFRES_MAX; i++) counter_u64_zero(V_pf_status.counters[i]); for (int i = 0; i < FCNT_MAX; i++) counter_u64_zero(V_pf_status.fcounters[i]); for (int i = 0; i < SCNT_MAX; i++) counter_u64_zero(V_pf_status.scounters[i]); V_pf_status.since = time_second; if (*V_pf_status.ifname) pfi_update_status(V_pf_status.ifname, NULL); PF_RULES_WUNLOCK(); break; } case DIOCNATLOOK: { struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr; struct pf_state_key *sk; struct pf_state *state; struct pf_state_key_cmp key; int m = 0, direction = pnl->direction; int sidx, didx; /* NATLOOK src and dst are reversed, so reverse sidx/didx */ sidx = (direction == PF_IN) ? 1 : 0; didx = (direction == PF_IN) ? 
0 : 1; if (!pnl->proto || PF_AZERO(&pnl->saddr, pnl->af) || PF_AZERO(&pnl->daddr, pnl->af) || ((pnl->proto == IPPROTO_TCP || pnl->proto == IPPROTO_UDP) && (!pnl->dport || !pnl->sport))) error = EINVAL; else { bzero(&key, sizeof(key)); key.af = pnl->af; key.proto = pnl->proto; PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af); key.port[sidx] = pnl->sport; PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af); key.port[didx] = pnl->dport; state = pf_find_state_all(&key, direction, &m); if (m > 1) error = E2BIG; /* more than one state */ else if (state != NULL) { /* XXXGL: not locked read */ sk = state->key[sidx]; PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af); pnl->rsport = sk->port[sidx]; PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af); pnl->rdport = sk->port[didx]; } else error = ENOENT; } break; } case DIOCSETTIMEOUT: { struct pfioc_tm *pt = (struct pfioc_tm *)addr; int old; if (pt->timeout < 0 || pt->timeout >= PFTM_MAX || pt->seconds < 0) { error = EINVAL; break; } PF_RULES_WLOCK(); old = V_pf_default_rule.timeout[pt->timeout]; if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0) pt->seconds = 1; V_pf_default_rule.timeout[pt->timeout] = pt->seconds; if (pt->timeout == PFTM_INTERVAL && pt->seconds < old) wakeup(pf_purge_thread); pt->seconds = old; PF_RULES_WUNLOCK(); break; } case DIOCGETTIMEOUT: { struct pfioc_tm *pt = (struct pfioc_tm *)addr; if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) { error = EINVAL; break; } PF_RULES_RLOCK(); pt->seconds = V_pf_default_rule.timeout[pt->timeout]; PF_RULES_RUNLOCK(); break; } case DIOCGETLIMIT: { struct pfioc_limit *pl = (struct pfioc_limit *)addr; if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) { error = EINVAL; break; } PF_RULES_RLOCK(); pl->limit = V_pf_limits[pl->index].limit; PF_RULES_RUNLOCK(); break; } case DIOCSETLIMIT: { struct pfioc_limit *pl = (struct pfioc_limit *)addr; int old_limit; PF_RULES_WLOCK(); if (pl->index < 0 || pl->index >= PF_LIMIT_MAX || V_pf_limits[pl->index].zone == NULL) { PF_RULES_WUNLOCK(); error = EINVAL; break; } uma_zone_set_max(V_pf_limits[pl->index].zone, pl->limit); old_limit = V_pf_limits[pl->index].limit; V_pf_limits[pl->index].limit = pl->limit; pl->limit = old_limit; PF_RULES_WUNLOCK(); break; } case DIOCSETDEBUG: { u_int32_t *level = (u_int32_t *)addr; PF_RULES_WLOCK(); V_pf_status.debug = *level; PF_RULES_WUNLOCK(); break; } case DIOCCLRRULECTRS: { /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */ struct pf_ruleset *ruleset = &pf_main_ruleset; struct pf_rule *rule; PF_RULES_WLOCK(); TAILQ_FOREACH(rule, ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) { rule->evaluations = 0; rule->packets[0] = rule->packets[1] = 0; rule->bytes[0] = rule->bytes[1] = 0; } PF_RULES_WUNLOCK(); break; } case DIOCGIFSPEED: { struct pf_ifspeed *psp = (struct pf_ifspeed *)addr; struct pf_ifspeed ps; struct ifnet *ifp; if (psp->ifname[0] != 0) { /* Can we completely trust user-land? 
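* strlcpy() below at least bounds the copy into the local buffer, though it still scans the source for a terminating NUL, which is presumably what this question is about.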
*/ strlcpy(ps.ifname, psp->ifname, IFNAMSIZ); ifp = ifunit(ps.ifname); if (ifp != NULL) psp->baudrate = ifp->if_baudrate; else error = EINVAL; } else error = EINVAL; break; } #ifdef ALTQ case DIOCSTARTALTQ: { struct pf_altq *altq; PF_RULES_WLOCK(); /* enable all altq interfaces on active list */ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { if (altq->qname[0] == 0 && (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { error = pf_enable_altq(altq); if (error != 0) break; } } if (error == 0) V_pf_altq_running = 1; PF_RULES_WUNLOCK(); DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); break; } case DIOCSTOPALTQ: { struct pf_altq *altq; PF_RULES_WLOCK(); /* disable all altq interfaces on active list */ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { if (altq->qname[0] == 0 && (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { error = pf_disable_altq(altq); if (error != 0) break; } } if (error == 0) V_pf_altq_running = 0; PF_RULES_WUNLOCK(); DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); break; } case DIOCADDALTQ: { struct pfioc_altq *pa = (struct pfioc_altq *)addr; struct pf_altq *altq, *a; struct ifnet *ifp; altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK); bcopy(&pa->altq, altq, sizeof(struct pf_altq)); altq->local_flags = 0; PF_RULES_WLOCK(); if (pa->ticket != V_ticket_altqs_inactive) { PF_RULES_WUNLOCK(); free(altq, M_PFALTQ); error = EBUSY; break; } /* * if this is for a queue, find the discipline and * copy the necessary fields */ if (altq->qname[0] != 0) { if ((altq->qid = pf_qname2qid(altq->qname)) == 0) { PF_RULES_WUNLOCK(); error = EBUSY; free(altq, M_PFALTQ); break; } altq->altq_disc = NULL; TAILQ_FOREACH(a, V_pf_altqs_inactive, entries) { if (strncmp(a->ifname, altq->ifname, IFNAMSIZ) == 0 && a->qname[0] == 0) { altq->altq_disc = a->altq_disc; break; } } } if ((ifp = ifunit(altq->ifname)) == NULL) altq->local_flags |= PFALTQ_FLAG_IF_REMOVED; else error = altq_add(altq); if (error) { PF_RULES_WUNLOCK(); free(altq, M_PFALTQ); break; } TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries); bcopy(altq, &pa->altq, sizeof(struct pf_altq)); PF_RULES_WUNLOCK(); break; } case DIOCGETALTQS: { struct pfioc_altq *pa = (struct pfioc_altq *)addr; struct pf_altq *altq; PF_RULES_RLOCK(); pa->nr = 0; TAILQ_FOREACH(altq, V_pf_altqs_active, entries) pa->nr++; pa->ticket = V_ticket_altqs_active; PF_RULES_RUNLOCK(); break; } case DIOCGETALTQ: { struct pfioc_altq *pa = (struct pfioc_altq *)addr; struct pf_altq *altq; u_int32_t nr; PF_RULES_RLOCK(); if (pa->ticket != V_ticket_altqs_active) { PF_RULES_RUNLOCK(); error = EBUSY; break; } nr = 0; altq = TAILQ_FIRST(V_pf_altqs_active); while ((altq != NULL) && (nr < pa->nr)) { altq = TAILQ_NEXT(altq, entries); nr++; } if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; break; } bcopy(altq, &pa->altq, sizeof(struct pf_altq)); PF_RULES_RUNLOCK(); break; } case DIOCCHANGEALTQ: /* CHANGEALTQ not supported yet! 
*/ error = ENODEV; break; case DIOCGETQSTATS: { struct pfioc_qstats *pq = (struct pfioc_qstats *)addr; struct pf_altq *altq; u_int32_t nr; int nbytes; PF_RULES_RLOCK(); if (pq->ticket != V_ticket_altqs_active) { PF_RULES_RUNLOCK(); error = EBUSY; break; } nbytes = pq->nbytes; nr = 0; altq = TAILQ_FIRST(V_pf_altqs_active); while ((altq != NULL) && (nr < pq->nr)) { altq = TAILQ_NEXT(altq, entries); nr++; } if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; break; } if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) { PF_RULES_RUNLOCK(); error = ENXIO; break; } PF_RULES_RUNLOCK(); error = altq_getqstats(altq, pq->buf, &nbytes); if (error == 0) { pq->scheduler = altq->scheduler; pq->nbytes = nbytes; } break; } #endif /* ALTQ */ case DIOCBEGINADDRS: { struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; PF_RULES_WLOCK(); pf_empty_pool(&V_pf_pabuf); pp->ticket = ++V_ticket_pabuf; PF_RULES_WUNLOCK(); break; } case DIOCADDADDR: { struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; struct pf_pooladdr *pa; struct pfi_kif *kif = NULL; #ifndef INET if (pp->af == AF_INET) { error = EAFNOSUPPORT; break; } #endif /* INET */ #ifndef INET6 if (pp->af == AF_INET6) { error = EAFNOSUPPORT; break; } #endif /* INET6 */ if (pp->addr.addr.type != PF_ADDR_ADDRMASK && pp->addr.addr.type != PF_ADDR_DYNIFTL && pp->addr.addr.type != PF_ADDR_TABLE) { error = EINVAL; break; } pa = malloc(sizeof(*pa), M_PFRULE, M_WAITOK); bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr)); if (pa->ifname[0]) kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); PF_RULES_WLOCK(); if (pp->ticket != V_ticket_pabuf) { PF_RULES_WUNLOCK(); if (pa->ifname[0]) free(kif, PFI_MTYPE); free(pa, M_PFRULE); error = EBUSY; break; } if (pa->ifname[0]) { pa->kif = pfi_kif_attach(kif, pa->ifname); pfi_kif_ref(pa->kif); } else pa->kif = NULL; if (pa->addr.type == PF_ADDR_DYNIFTL && ((error = pfi_dynaddr_setup(&pa->addr, pp->af)) != 0)) { if (pa->ifname[0]) pfi_kif_unref(pa->kif); PF_RULES_WUNLOCK(); free(pa, M_PFRULE); break; } TAILQ_INSERT_TAIL(&V_pf_pabuf, pa, entries); PF_RULES_WUNLOCK(); break; } case DIOCGETADDRS: { struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; struct pf_pool *pool; struct pf_pooladdr *pa; PF_RULES_RLOCK(); pp->nr = 0; pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, pp->r_num, 0, 1, 0); if (pool == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; break; } TAILQ_FOREACH(pa, &pool->list, entries) pp->nr++; PF_RULES_RUNLOCK(); break; } case DIOCGETADDR: { struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; struct pf_pool *pool; struct pf_pooladdr *pa; u_int32_t nr = 0; PF_RULES_RLOCK(); pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, pp->r_num, 0, 1, 1); if (pool == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; break; } pa = TAILQ_FIRST(&pool->list); while ((pa != NULL) && (nr < pp->nr)) { pa = TAILQ_NEXT(pa, entries); nr++; } if (pa == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; break; } bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr)); pf_addr_copyout(&pp->addr.addr); PF_RULES_RUNLOCK(); break; } case DIOCCHANGEADDR: { struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr; struct pf_pool *pool; struct pf_pooladdr *oldpa = NULL, *newpa = NULL; struct pf_ruleset *ruleset; struct pfi_kif *kif = NULL; if (pca->action < PF_CHANGE_ADD_HEAD || pca->action > PF_CHANGE_REMOVE) { error = EINVAL; break; } if (pca->addr.addr.type != PF_ADDR_ADDRMASK && pca->addr.addr.type != PF_ADDR_DYNIFTL && pca->addr.addr.type != PF_ADDR_TABLE) { error = EINVAL; break; } if (pca->action != PF_CHANGE_REMOVE) { 
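/* Anything but a removal needs a replacement pool address: reject address families this kernel was built without before allocating and copying it in. */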
#ifndef INET if (pca->af == AF_INET) { error = EAFNOSUPPORT; break; } #endif /* INET */ #ifndef INET6 if (pca->af == AF_INET6) { error = EAFNOSUPPORT; break; } #endif /* INET6 */ newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK); bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr)); if (newpa->ifname[0]) kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); newpa->kif = NULL; } #define ERROUT(x) { error = (x); goto DIOCCHANGEADDR_error; } PF_RULES_WLOCK(); ruleset = pf_find_ruleset(pca->anchor); if (ruleset == NULL) ERROUT(EBUSY); pool = pf_get_pool(pca->anchor, pca->ticket, pca->r_action, pca->r_num, pca->r_last, 1, 1); if (pool == NULL) ERROUT(EBUSY); if (pca->action != PF_CHANGE_REMOVE) { if (newpa->ifname[0]) { newpa->kif = pfi_kif_attach(kif, newpa->ifname); pfi_kif_ref(newpa->kif); kif = NULL; } switch (newpa->addr.type) { case PF_ADDR_DYNIFTL: error = pfi_dynaddr_setup(&newpa->addr, pca->af); break; case PF_ADDR_TABLE: newpa->addr.p.tbl = pfr_attach_table(ruleset, newpa->addr.v.tblname); if (newpa->addr.p.tbl == NULL) error = ENOMEM; break; } if (error) goto DIOCCHANGEADDR_error; } switch (pca->action) { case PF_CHANGE_ADD_HEAD: oldpa = TAILQ_FIRST(&pool->list); break; case PF_CHANGE_ADD_TAIL: oldpa = TAILQ_LAST(&pool->list, pf_palist); break; default: oldpa = TAILQ_FIRST(&pool->list); for (int i = 0; oldpa && i < pca->nr; i++) oldpa = TAILQ_NEXT(oldpa, entries); if (oldpa == NULL) ERROUT(EINVAL); } if (pca->action == PF_CHANGE_REMOVE) { TAILQ_REMOVE(&pool->list, oldpa, entries); switch (oldpa->addr.type) { case PF_ADDR_DYNIFTL: pfi_dynaddr_remove(oldpa->addr.p.dyn); break; case PF_ADDR_TABLE: pfr_detach_table(oldpa->addr.p.tbl); break; } if (oldpa->kif) pfi_kif_unref(oldpa->kif); free(oldpa, M_PFRULE); } else { if (oldpa == NULL) TAILQ_INSERT_TAIL(&pool->list, newpa, entries); else if (pca->action == PF_CHANGE_ADD_HEAD || pca->action == PF_CHANGE_ADD_BEFORE) TAILQ_INSERT_BEFORE(oldpa, newpa, entries); else TAILQ_INSERT_AFTER(&pool->list, oldpa, newpa, entries); } pool->cur = TAILQ_FIRST(&pool->list); PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, pca->af); PF_RULES_WUNLOCK(); break; #undef ERROUT DIOCCHANGEADDR_error: /* newpa is NULL when a PF_CHANGE_REMOVE request errors out early. */ if (newpa != NULL) { if (newpa->kif) pfi_kif_unref(newpa->kif); free(newpa, M_PFRULE); } PF_RULES_WUNLOCK(); if (kif != NULL) free(kif, PFI_MTYPE); break; } case DIOCGETRULESETS: { struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; struct pf_ruleset *ruleset; struct pf_anchor *anchor; PF_RULES_RLOCK(); pr->path[sizeof(pr->path) - 1] = 0; if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { PF_RULES_RUNLOCK(); error = ENOENT; break; } pr->nr = 0; if (ruleset->anchor == NULL) { /* XXX kludge for pf_main_ruleset */ RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors) if (anchor->parent == NULL) pr->nr++; } else { RB_FOREACH(anchor, pf_anchor_node, &ruleset->anchor->children) pr->nr++; } PF_RULES_RUNLOCK(); break; } case DIOCGETRULESET: { struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; struct pf_ruleset *ruleset; struct pf_anchor *anchor; u_int32_t nr = 0; PF_RULES_RLOCK(); pr->path[sizeof(pr->path) - 1] = 0; if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { PF_RULES_RUNLOCK(); error = ENOENT; break; } pr->name[0] = 0; if (ruleset->anchor == NULL) { /* XXX kludge for pf_main_ruleset */ RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors) if (anchor->parent == NULL && nr++ == pr->nr) { strlcpy(pr->name, anchor->name, sizeof(pr->name)); break; } } else { RB_FOREACH(anchor, pf_anchor_node, &ruleset->anchor->children) if (nr++ == pr->nr) { strlcpy(pr->name,
anchor->name, sizeof(pr->name)); break; } } if (!pr->name[0]) error = EBUSY; PF_RULES_RUNLOCK(); break; } case DIOCRCLRTABLES: { struct pfioc_table *io = (struct pfioc_table *)addr; if (io->pfrio_esize != 0) { error = ENODEV; break; } PF_RULES_WLOCK(); error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); break; } case DIOCRADDTABLES: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_table *pfrts; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_table); pfrts = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_add_tables(pfrts, io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); free(pfrts, M_TEMP); break; } case DIOCRDELTABLES: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_table *pfrts; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_table); pfrts = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_del_tables(pfrts, io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); free(pfrts, M_TEMP); break; } case DIOCRGETTABLES: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_table *pfrts; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_table); pfrts = malloc(totlen, M_TEMP, M_WAITOK); PF_RULES_RLOCK(); error = pfr_get_tables(&io->pfrio_table, pfrts, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_RUNLOCK(); if (error == 0) error = copyout(pfrts, io->pfrio_buffer, totlen); free(pfrts, M_TEMP); break; } case DIOCRGETTSTATS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_tstats *pfrtstats; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_tstats)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_tstats); pfrtstats = malloc(totlen, M_TEMP, M_WAITOK); PF_RULES_WLOCK(); error = pfr_get_tstats(&io->pfrio_table, pfrtstats, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); if (error == 0) error = copyout(pfrtstats, io->pfrio_buffer, totlen); free(pfrtstats, M_TEMP); break; } case DIOCRCLRTSTATS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_table *pfrts; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_table); pfrts = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_clr_tstats(pfrts, io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); free(pfrts, M_TEMP); break; } case DIOCRSETTFLAGS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_table *pfrts; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_table); pfrts = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_set_tflags(pfrts, io->pfrio_size, 
io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); free(pfrts, M_TEMP); break; } case DIOCRCLRADDRS: { struct pfioc_table *io = (struct pfioc_table *)addr; if (io->pfrio_esize != 0) { error = ENODEV; break; } PF_RULES_WLOCK(); error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); break; } case DIOCRADDADDRS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_add_addrs(&io->pfrio_table, pfras, io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) error = copyout(pfras, io->pfrio_buffer, totlen); free(pfras, M_TEMP); break; } case DIOCRDELADDRS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_del_addrs(&io->pfrio_table, pfras, io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) error = copyout(pfras, io->pfrio_buffer, totlen); free(pfras, M_TEMP); break; } case DIOCRSETADDRS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_set_addrs(&io->pfrio_table, pfras, io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags | PFR_FLAG_USERIOCTL, 0); PF_RULES_WUNLOCK(); if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) error = copyout(pfras, io->pfrio_buffer, totlen); free(pfras, M_TEMP); break; } case DIOCRGETADDRS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); PF_RULES_RLOCK(); error = pfr_get_addrs(&io->pfrio_table, pfras, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_RUNLOCK(); if (error == 0) error = copyout(pfras, io->pfrio_buffer, totlen); free(pfras, M_TEMP); break; } case DIOCRGETASTATS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_astats *pfrastats; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_astats)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_astats); pfrastats = malloc(totlen, M_TEMP, M_WAITOK); PF_RULES_RLOCK(); error = pfr_get_astats(&io->pfrio_table, pfrastats, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_RUNLOCK(); if (error == 0) error = copyout(pfrastats, io->pfrio_buffer, totlen); free(pfrastats, M_TEMP); break; 
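/*
 * Editor's note (sketch, not part of this change): the DIOCR* table
 * ioctls above share one calling convention.  Userland sets pfrio_esize
 * to the element size it was compiled against, pfrio_size to the element
 * count, and pfrio_buffer to its array; the kernel bounces the array via
 * copyin()/copyout() under the rules lock.  A hedged userland sketch for
 * the DIOCRGETASTATS case above, assuming the conventional /dev/pf fd
 * and an illustrative table name:
 *
 *	struct pfioc_table io;
 *	struct pfr_astats astats[128];
 *
 *	memset(&io, 0, sizeof(io));
 *	strlcpy(io.pfrio_table.pfrt_name, "mytable",
 *	    sizeof(io.pfrio_table.pfrt_name));
 *	io.pfrio_esize = sizeof(struct pfr_astats);
 *	io.pfrio_size = 128;
 *	io.pfrio_buffer = astats;
 *	if (ioctl(dev, DIOCRGETASTATS, &io) == 0)
 *		printf("%u addresses\n", io.pfrio_size);
 */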
} case DIOCRCLRASTATS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_clr_astats(&io->pfrio_table, pfras, io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) error = copyout(pfras, io->pfrio_buffer, totlen); free(pfras, M_TEMP); break; } case DIOCRTSTADDRS: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); break; } PF_RULES_RLOCK(); error = pfr_tst_addrs(&io->pfrio_table, pfras, io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_RUNLOCK(); if (error == 0) error = copyout(pfras, io->pfrio_buffer, totlen); free(pfras, M_TEMP); break; } case DIOCRINADEFINE: { struct pfioc_table *io = (struct pfioc_table *)addr; struct pfr_addr *pfras; size_t totlen; if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; break; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); break; } PF_RULES_WLOCK(); error = pfr_ina_define(&io->pfrio_table, pfras, io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); free(pfras, M_TEMP); break; } case DIOCOSFPADD: { struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; PF_RULES_WLOCK(); error = pf_osfp_add(io); PF_RULES_WUNLOCK(); break; } case DIOCOSFPGET: { struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; PF_RULES_RLOCK(); error = pf_osfp_get(io); PF_RULES_RUNLOCK(); break; } case DIOCXBEGIN: { struct pfioc_trans *io = (struct pfioc_trans *)addr; struct pfioc_trans_e *ioes, *ioe; size_t totlen; int i; if (io->esize != sizeof(*ioe)) { error = ENODEV; break; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); break; } PF_RULES_WLOCK(); for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { switch (ioe->rs_num) { #ifdef ALTQ case PF_RULESET_ALTQ: if (ioe->anchor[0]) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EINVAL; goto fail; } if ((error = pf_begin_altq(&ioe->ticket))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; } break; #endif /* ALTQ */ case PF_RULESET_TABLE: { struct pfr_table table; bzero(&table, sizeof(table)); strlcpy(table.pfrt_anchor, ioe->anchor, sizeof(table.pfrt_anchor)); if ((error = pfr_ina_begin(&table, &ioe->ticket, NULL, 0))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; } break; } default: if ((error = pf_begin_rules(&ioe->ticket, ioe->rs_num, ioe->anchor))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; } break; } } PF_RULES_WUNLOCK(); error = copyout(ioes, io->array, totlen); free(ioes, M_TEMP); break; } case DIOCXROLLBACK: { struct pfioc_trans *io = (struct pfioc_trans *)addr; struct pfioc_trans_e *ioe, *ioes; size_t totlen; int i; if (io->esize != 
sizeof(*ioe)) { error = ENODEV; break; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); break; } PF_RULES_WLOCK(); for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { switch (ioe->rs_num) { #ifdef ALTQ case PF_RULESET_ALTQ: if (ioe->anchor[0]) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EINVAL; goto fail; } if ((error = pf_rollback_altq(ioe->ticket))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; /* really bad */ } break; #endif /* ALTQ */ case PF_RULESET_TABLE: { struct pfr_table table; bzero(&table, sizeof(table)); strlcpy(table.pfrt_anchor, ioe->anchor, sizeof(table.pfrt_anchor)); if ((error = pfr_ina_rollback(&table, ioe->ticket, NULL, 0))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; /* really bad */ } break; } default: if ((error = pf_rollback_rules(ioe->ticket, ioe->rs_num, ioe->anchor))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; /* really bad */ } break; } } PF_RULES_WUNLOCK(); free(ioes, M_TEMP); break; } case DIOCXCOMMIT: { struct pfioc_trans *io = (struct pfioc_trans *)addr; struct pfioc_trans_e *ioe, *ioes; struct pf_ruleset *rs; size_t totlen; int i; if (io->esize != sizeof(*ioe)) { error = ENODEV; break; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = malloc(totlen, M_TEMP, M_WAITOK); error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); break; } PF_RULES_WLOCK(); /* First, make sure everything will succeed. */ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { switch (ioe->rs_num) { #ifdef ALTQ case PF_RULESET_ALTQ: if (ioe->anchor[0]) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EINVAL; goto fail; } if (!V_altqs_inactive_open || ioe->ticket != V_ticket_altqs_inactive) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EBUSY; goto fail; } break; #endif /* ALTQ */ case PF_RULESET_TABLE: rs = pf_find_ruleset(ioe->anchor); if (rs == NULL || !rs->topen || ioe->ticket != rs->tticket) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EBUSY; goto fail; } break; default: if (ioe->rs_num < 0 || ioe->rs_num >= PF_RULESET_MAX) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EINVAL; goto fail; } rs = pf_find_ruleset(ioe->anchor); if (rs == NULL || !rs->rules[ioe->rs_num].inactive.open || rs->rules[ioe->rs_num].inactive.ticket != ioe->ticket) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); error = EBUSY; goto fail; } break; } } /* Now do the commit - no errors should happen here.
*/ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { switch (ioe->rs_num) { #ifdef ALTQ case PF_RULESET_ALTQ: if ((error = pf_commit_altq(ioe->ticket))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; /* really bad */ } break; #endif /* ALTQ */ case PF_RULESET_TABLE: { struct pfr_table table; bzero(&table, sizeof(table)); strlcpy(table.pfrt_anchor, ioe->anchor, sizeof(table.pfrt_anchor)); if ((error = pfr_ina_commit(&table, ioe->ticket, NULL, NULL, 0))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; /* really bad */ } break; } default: if ((error = pf_commit_rules(ioe->ticket, ioe->rs_num, ioe->anchor))) { PF_RULES_WUNLOCK(); free(ioes, M_TEMP); goto fail; /* really bad */ } break; } } PF_RULES_WUNLOCK(); free(ioes, M_TEMP); break; } case DIOCGETSRCNODES: { struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr; struct pf_srchash *sh; struct pf_src_node *n, *p, *pstore; uint32_t i, nr = 0; if (psn->psn_len == 0) { for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH(n, &sh->nodes, entry) nr++; PF_HASHROW_UNLOCK(sh); } psn->psn_len = sizeof(struct pf_src_node) * nr; break; } p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH(n, &sh->nodes, entry) { int secs = time_uptime, diff; if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len) break; bcopy(n, p, sizeof(struct pf_src_node)); if (n->rule.ptr != NULL) p->rule.nr = n->rule.ptr->nr; p->creation = secs - p->creation; if (p->expire > secs) p->expire -= secs; else p->expire = 0; /* Adjust the connection rate estimate. */ diff = secs - n->conn_rate.last; if (diff >= n->conn_rate.seconds) p->conn_rate.count = 0; else p->conn_rate.count -= n->conn_rate.count * diff / n->conn_rate.seconds; p++; nr++; } PF_HASHROW_UNLOCK(sh); } error = copyout(pstore, psn->psn_src_nodes, sizeof(struct pf_src_node) * nr); if (error) { free(pstore, M_TEMP); break; } psn->psn_len = sizeof(struct pf_src_node) * nr; free(pstore, M_TEMP); break; } case DIOCCLRSRCNODES: { pf_clear_srcnodes(NULL); pf_purge_expired_src_nodes(); break; } case DIOCKILLSRCNODES: pf_kill_srcnodes((struct pfioc_src_node_kill *)addr); break; case DIOCSETHOSTID: { u_int32_t *hostid = (u_int32_t *)addr; PF_RULES_WLOCK(); if (*hostid == 0) V_pf_status.hostid = arc4random(); else V_pf_status.hostid = *hostid; PF_RULES_WUNLOCK(); break; } case DIOCOSFPFLUSH: PF_RULES_WLOCK(); pf_osfp_flush(); PF_RULES_WUNLOCK(); break; case DIOCIGETIFACES: { struct pfioc_iface *io = (struct pfioc_iface *)addr; struct pfi_kif *ifstore; size_t bufsiz; if (io->pfiio_esize != sizeof(struct pfi_kif)) { error = ENODEV; break; } bufsiz = io->pfiio_size * sizeof(struct pfi_kif); ifstore = malloc(bufsiz, M_TEMP, M_WAITOK); PF_RULES_RLOCK(); pfi_get_ifaces(io->pfiio_name, ifstore, &io->pfiio_size); PF_RULES_RUNLOCK(); error = copyout(ifstore, io->pfiio_buffer, bufsiz); free(ifstore, M_TEMP); break; } case DIOCSETIFFLAG: { struct pfioc_iface *io = (struct pfioc_iface *)addr; PF_RULES_WLOCK(); error = pfi_set_flags(io->pfiio_name, io->pfiio_flags); PF_RULES_WUNLOCK(); break; } case DIOCCLRIFFLAG: { struct pfioc_iface *io = (struct pfioc_iface *)addr; PF_RULES_WLOCK(); error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags); PF_RULES_WUNLOCK(); break; } default: error = ENODEV; break; } fail: if (sx_xlocked(&pf_ioctl_lock)) sx_xunlock(&pf_ioctl_lock); CURVNET_RESTORE(); return (error); } void pfsync_state_export(struct pfsync_state *sp, struct pf_state *st) { bzero(sp, 
sizeof(struct pfsync_state)); /* copy from state key */ sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0]; sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1]; sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0]; sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1]; sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0]; sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1]; sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0]; sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1]; sp->proto = st->key[PF_SK_WIRE]->proto; sp->af = st->key[PF_SK_WIRE]->af; /* copy from state */ strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname)); bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr)); sp->creation = htonl(time_uptime - st->creation); sp->expire = pf_state_expires(st); if (sp->expire <= time_uptime) sp->expire = htonl(0); else sp->expire = htonl(sp->expire - time_uptime); sp->direction = st->direction; sp->log = st->log; sp->timeout = st->timeout; sp->state_flags = st->state_flags; if (st->src_node) sp->sync_flags |= PFSYNC_FLAG_SRCNODE; if (st->nat_src_node) sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE; sp->id = st->id; sp->creatorid = st->creatorid; pf_state_peer_hton(&st->src, &sp->src); pf_state_peer_hton(&st->dst, &sp->dst); if (st->rule.ptr == NULL) sp->rule = htonl(-1); else sp->rule = htonl(st->rule.ptr->nr); if (st->anchor.ptr == NULL) sp->anchor = htonl(-1); else sp->anchor = htonl(st->anchor.ptr->nr); if (st->nat_rule.ptr == NULL) sp->nat_rule = htonl(-1); else sp->nat_rule = htonl(st->nat_rule.ptr->nr); pf_state_counter_hton(st->packets[0], sp->packets[0]); pf_state_counter_hton(st->packets[1], sp->packets[1]); pf_state_counter_hton(st->bytes[0], sp->bytes[0]); pf_state_counter_hton(st->bytes[1], sp->bytes[1]); } static void pf_tbladdr_copyout(struct pf_addr_wrap *aw) { struct pfr_ktable *kt; KASSERT(aw->type == PF_ADDR_TABLE, ("%s: type %u", __func__, aw->type)); kt = aw->p.tbl; if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) kt = kt->pfrkt_root; aw->p.tbl = NULL; aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ? kt->pfrkt_cnt : -1; } /* * XXX - Check for version mismatch!!! */ static void pf_clear_states(void) { struct pf_state *s; u_int i; for (i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { s->timeout = PFTM_PURGE; /* Don't send out individual delete messages. */ s->sync_state = PFSTATE_NOSYNC; pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } PF_HASHROW_UNLOCK(ih); } } static int pf_clear_tables(void) { struct pfioc_table io; int error; bzero(&io, sizeof(io)); error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel, io.pfrio_flags); return (error); } static void pf_clear_srcnodes(struct pf_src_node *n) { struct pf_state *s; int i; for (i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (n == NULL || n == s->src_node) s->src_node = NULL; if (n == NULL || n == s->nat_src_node) s->nat_src_node = NULL; } PF_HASHROW_UNLOCK(ih); } if (n == NULL) { struct pf_srchash *sh; for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH(n, &sh->nodes, entry) { n->expire = 1; n->states = 0; } PF_HASHROW_UNLOCK(sh); } } else { /* XXX: hash slot should already be locked here. */ n->expire = 1; n->states = 0; } }
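/*
 * Editor's note (sketch, not part of this change): pf_clear_srcnodes()
 * does not free anything itself; it detaches states and then marks every
 * node with expire = 1 (a time_uptime value that is always in the past)
 * and states = 0.  DIOCCLRSRCNODES relies on this by calling
 * pf_purge_expired_src_nodes() right afterwards, which is assumed to
 * apply a test along the lines of:
 *
 *	if (sn->states == 0 && sn->expire <= time_uptime)
 *		... unlink and free the node ...
 */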
static void pf_kill_srcnodes(struct pfioc_src_node_kill *psnk) { struct pf_src_node_list kill; LIST_INIT(&kill); for (int i = 0; i <= pf_srchashmask; i++) { struct pf_srchash *sh = &V_pf_srchash[i]; struct pf_src_node *sn, *tmp; PF_HASHROW_LOCK(sh); LIST_FOREACH_SAFE(sn, &sh->nodes, entry, tmp) if (PF_MATCHA(psnk->psnk_src.neg, &psnk->psnk_src.addr.v.a.addr, &psnk->psnk_src.addr.v.a.mask, &sn->addr, sn->af) && PF_MATCHA(psnk->psnk_dst.neg, &psnk->psnk_dst.addr.v.a.addr, &psnk->psnk_dst.addr.v.a.mask, &sn->raddr, sn->af)) { pf_unlink_src_node(sn); LIST_INSERT_HEAD(&kill, sn, entry); sn->expire = 1; } PF_HASHROW_UNLOCK(sh); } for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state *s; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (s->src_node && s->src_node->expire == 1) s->src_node = NULL; if (s->nat_src_node && s->nat_src_node->expire == 1) s->nat_src_node = NULL; } PF_HASHROW_UNLOCK(ih); } psnk->psnk_killed = pf_free_src_nodes(&kill); }
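/*
 * Editor's note (hypothetical usage, not part of this change):
 * DIOCKILLSRCNODES matches source nodes by source/destination
 * address+mask pairs and reports the number it freed in psnk_killed.
 * A minimal userland sketch that kills every node (a zero address with
 * a zero mask matches anything via PF_MATCHA above):
 *
 *	struct pfioc_src_node_kill psnk;
 *
 *	memset(&psnk, 0, sizeof(psnk));
 *	if (ioctl(dev, DIOCKILLSRCNODES, &psnk) == 0)
 *		printf("killed %u nodes\n", psnk.psnk_killed);
 */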
/* * XXX - Check for version mismatch!!! */ /* * Duplicate pfctl -Fa operation to get rid of as much as we can. */ static int shutdown_pf(void) { int error = 0; u_int32_t t[5]; char nn = '\0'; V_pf_status.running = 0; counter_u64_free(V_pf_default_rule.states_cur); counter_u64_free(V_pf_default_rule.states_tot); counter_u64_free(V_pf_default_rule.src_nodes); for (int i = 0; i < PFRES_MAX; i++) counter_u64_free(V_pf_status.counters[i]); for (int i = 0; i < LCNT_MAX; i++) counter_u64_free(V_pf_status.lcounters[i]); for (int i = 0; i < FCNT_MAX; i++) counter_u64_free(V_pf_status.fcounters[i]); for (int i = 0; i < SCNT_MAX; i++) counter_u64_free(V_pf_status.scounters[i]); do { if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn)) != 0) { DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n")); break; } if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn)) != 0) { DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n")); break; /* XXX: rollback? */ } if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn)) != 0) { DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n")); break; /* XXX: rollback? */ } if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn)) != 0) { DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n")); break; /* XXX: rollback? */ } if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn)) != 0) { DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n")); break; /* XXX: rollback? */ } /* XXX: these should always succeed here */ pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn); pf_commit_rules(t[1], PF_RULESET_FILTER, &nn); pf_commit_rules(t[2], PF_RULESET_NAT, &nn); pf_commit_rules(t[3], PF_RULESET_BINAT, &nn); pf_commit_rules(t[4], PF_RULESET_RDR, &nn); if ((error = pf_clear_tables()) != 0) break; #ifdef ALTQ if ((error = pf_begin_altq(&t[0])) != 0) { DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n")); break; } pf_commit_altq(t[0]); #endif pf_clear_states(); pf_clear_srcnodes(NULL); /* status does not use malloc'ed memory, so no cleanup is needed */ /* fingerprints and interfaces have their own cleanup code */ } while (0); return (error); } #ifdef INET static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { int chk; chk = pf_test(PF_IN, ifp, m, inp); if (chk && *m) { m_freem(*m); *m = NULL; } return (chk); } static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { int chk; chk = pf_test(PF_OUT, ifp, m, inp); if (chk && *m) { m_freem(*m); *m = NULL; } return (chk); } #endif #ifdef INET6 static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { int chk; /* * In the case of loopback traffic, IPv6 uses the real interface in * order to support scoped addresses. To support stateful filtering * we have to change this to lo0, as is the case for IPv4. */ CURVNET_SET(ifp->if_vnet); chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, inp); CURVNET_RESTORE(); if (chk && *m) { m_freem(*m); *m = NULL; } return (chk); } static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { int chk; CURVNET_SET(ifp->if_vnet); chk = pf_test6(PF_OUT, ifp, m, inp); CURVNET_RESTORE(); if (chk && *m) { m_freem(*m); *m = NULL; } return (chk); } #endif /* INET6 */ static int hook_pf(void) { #ifdef INET struct pfil_head *pfh_inet; #endif #ifdef INET6 struct pfil_head *pfh_inet6; #endif if (V_pf_pfil_hooked) return (0); #ifdef INET pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); if (pfh_inet == NULL) return (ESRCH); /* XXX */ pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); #endif #ifdef INET6 pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); if (pfh_inet6 == NULL) { #ifdef INET pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); #endif return (ESRCH); /* XXX */ } pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); #endif V_pf_pfil_hooked = 1; return (0); } static int dehook_pf(void) { #ifdef INET struct pfil_head *pfh_inet; #endif #ifdef INET6 struct pfil_head *pfh_inet6; #endif if (V_pf_pfil_hooked == 0) return (0); #ifdef INET pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); if (pfh_inet == NULL) return (ESRCH); /* XXX */ pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); #endif #ifdef INET6 pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); if (pfh_inet6 == NULL) return (ESRCH); /* XXX */ pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); #endif V_pf_pfil_hooked = 0; return (0); }
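/*
 * Editor's note (illustrative only): pf_check_in() and friends follow
 * the pfil(9) hook contract visible above: return zero to let the
 * packet continue, or free the mbuf, NULL out the caller's pointer, and
 * return nonzero to drop it.  A minimal sketch of a hook obeying that
 * contract (example_check and its drop predicate are hypothetical):
 *
 *	static int
 *	example_check(void *arg, struct mbuf **m, struct ifnet *ifp,
 *	    int dir, struct inpcb *inp)
 *	{
 *		if (example_should_drop(*m)) {
 *			m_freem(*m);
 *			*m = NULL;
 *			return (EACCES);
 *		}
 *		return (0);
 *	}
 */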
static int pf_load(void) { int error; VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_pf_pfil_hooked = 0; V_pf_end_threads = 0; TAILQ_INIT(&V_pf_tags); TAILQ_INIT(&V_pf_qids); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_init(&pf_rules_lock, "pf rulesets"); sx_init(&pf_ioctl_lock, "pf ioctl"); pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME); if ((error = pfattach()) != 0) return (error); return (0); } static int pf_unload(void) { int error = 0; V_pf_status.running = 0; swi_remove(V_pf_swi_cookie); error = dehook_pf(); if (error) { /* * Should not happen! * XXX Due to error code ESRCH, kldunload will show * a message like 'No such process'. */ printf("%s: pfil unregistration failed\n", __FUNCTION__); return error; } PF_RULES_WLOCK(); shutdown_pf(); V_pf_end_threads = 1; while (V_pf_end_threads < 2) { wakeup_one(pf_purge_thread); rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftmo", 0); } PF_RULES_WUNLOCK(); pf_normalize_cleanup(); pfi_cleanup(); pfr_cleanup(); pf_osfp_flush(); pf_cleanup(); if (IS_DEFAULT_VNET(curvnet)) pf_mtag_cleanup(); destroy_dev(pf_dev); rw_destroy(&pf_rules_lock); sx_destroy(&pf_ioctl_lock); return (error); } static int pf_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: error = pf_load(); break; case MOD_QUIESCE: /* * Module should not be unloaded due to race conditions. */ error = EBUSY; break; case MOD_UNLOAD: error = pf_unload(); break; default: error = EINVAL; break; } return (error); } static moduledata_t pf_mod = { "pf", pf_modevent, 0 }; -DECLARE_MODULE(pf, pf_mod, SI_SUB_PSEUDO, SI_ORDER_FIRST); +DECLARE_MODULE(pf, pf_mod, SI_SUB_FW, SI_ORDER_FIRST); MODULE_VERSION(pf, PF_MODVER); Index: projects/vnet/sys/sys/kernel.h =================================================================== --- projects/vnet/sys/sys/kernel.h (revision 294565) +++ projects/vnet/sys/sys/kernel.h (revision 294566) @@ -1,366 +1,369 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernel.h 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _SYS_KERNEL_H_ #define _SYS_KERNEL_H_ #include <sys/linker_set.h> #ifdef _KERNEL /* for intrhook below */ #include <sys/queue.h> /* Global variables for the kernel. */ /* 1.1 */ extern char kernelname[MAXPATHLEN]; extern int tick; /* usec per tick (1000000 / hz) */ extern int hz; /* system clock's frequency */ extern int psratio; /* ratio: prof / stat */ extern int stathz; /* statistics clock's frequency */ extern int profhz; /* profiling clock's frequency */ extern int profprocs; /* number of processes being profiled */ extern volatile int ticks; #endif /* _KERNEL */ /* * Enumerated types for known system startup interfaces. * * Startup occurs in ascending numeric order; the list entries are * sorted prior to attempting startup to guarantee order. Items * of the same level are arbitrated for order based on the 'order' * element. * * These numbers are arbitrary and are chosen ONLY for ordering; the * enumeration values are explicit rather than implicit to provide * for binary compatibility with inserted elements. * * The SI_SUB_LAST value must have the highest lexical value. */ enum sysinit_sub_id { SI_SUB_DUMMY = 0x0000000, /* not executed; for linker*/ SI_SUB_DONE = 0x0000001, /* processed*/ SI_SUB_TUNABLES = 0x0700000, /* establish tunable values */ SI_SUB_COPYRIGHT = 0x0800001, /* first use of console*/ SI_SUB_VM = 0x1000000, /* virtual memory system init*/ SI_SUB_KMEM = 0x1800000, /* kernel memory*/ SI_SUB_HYPERVISOR = 0x1A40000, /* * Hypervisor detection and * virtualization support * setup. */ SI_SUB_WITNESS = 0x1A80000, /* witness initialization */ SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */ SI_SUB_LOCK = 0x1B00000, /* various locks */ SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */ SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ SI_SUB_RACCT = 0x2110000, /* resource accounting */ SI_SUB_KDTRACE = 0x2140000, /* Kernel dtrace hooks */ SI_SUB_RANDOM = 0x2160000, /* random number generator */ SI_SUB_MAC = 0x2180000, /* TrustedBSD MAC subsystem */ SI_SUB_MAC_POLICY = 0x21C0000, /* TrustedBSD MAC policies */ SI_SUB_MAC_LATE = 0x21D0000, /* TrustedBSD MAC subsystem */ SI_SUB_VNET = 0x21E0000, /* vnet 0 */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_VM_CONF = 0x2300000, /* config VM, set limits*/ SI_SUB_DDB_SERVICES = 0x2380000, /* capture, scripting, etc.
*/ SI_SUB_RUN_QUEUE = 0x2400000, /* set up run queue*/ SI_SUB_KTRACE = 0x2480000, /* ktrace */ SI_SUB_OPENSOLARIS = 0x2490000, /* OpenSolaris compatibility */ SI_SUB_AUDIT = 0x24C0000, /* audit */ SI_SUB_CREATE_INIT = 0x2500000, /* create init process*/ SI_SUB_SCHED_IDLE = 0x2600000, /* required idle procs */ SI_SUB_MBUF = 0x2700000, /* mbuf subsystem */ SI_SUB_INTR = 0x2800000, /* interrupt threads */ SI_SUB_SOFTINTR = 0x2800001, /* start soft interrupt thread */ SI_SUB_DEVFS = 0x2F00000, /* devfs ready for devices */ SI_SUB_INIT_IF = 0x3000000, /* prep for net interfaces */ SI_SUB_NETGRAPH = 0x3010000, /* Let Netgraph initialize */ SI_SUB_DTRACE = 0x3020000, /* DTrace subsystem */ SI_SUB_DTRACE_PROVIDER = 0x3048000, /* DTrace providers */ SI_SUB_DTRACE_ANON = 0x308C000, /* DTrace anon enabling */ SI_SUB_DRIVERS = 0x3100000, /* Let Drivers initialize */ SI_SUB_CONFIGURE = 0x3800000, /* Configure devices */ SI_SUB_VFS = 0x4000000, /* virtual filesystem*/ SI_SUB_CLOCKS = 0x4800000, /* real time and stat clocks*/ SI_SUB_SYSV_SHM = 0x6400000, /* System V shared memory*/ SI_SUB_SYSV_SEM = 0x6800000, /* System V semaphores*/ SI_SUB_SYSV_MSG = 0x6C00000, /* System V message queues*/ SI_SUB_P1003_1B = 0x6E00000, /* P1003.1B realtime */ SI_SUB_PSEUDO = 0x7000000, /* pseudo devices*/ SI_SUB_EXEC = 0x7400000, /* execve() handlers */ SI_SUB_PROTO_BEGIN = 0x8000000, /* VNET initialization */ + SI_SUB_PROTO_PFIL = 0x8100000, /* Initialize pfil before FWs */ SI_SUB_PROTO_IF = 0x8400000, /* interfaces*/ SI_SUB_PROTO_DOMAININIT = 0x8600000, /* domain registration system */ + SI_SUB_PROTO_MC = 0x8700000, /* Multicast */ SI_SUB_PROTO_DOMAIN = 0x8800000, /* domains (address families?)*/ - SI_SUB_PROTO_IFATTACHDOMAIN = 0x8800001, /* domain dependent data init*/ + SI_SUB_FW = 0x8806000, /* Firewalls */ + SI_SUB_PROTO_IFATTACHDOMAIN = 0x8808000,/* domain dependent data init */ SI_SUB_PROTO_END = 0x8ffffff, /* VNET helper functions */ SI_SUB_KPROF = 0x9000000, /* kernel profiling*/ SI_SUB_KICK_SCHEDULER = 0xa000000, /* start the timeout events*/ SI_SUB_INT_CONFIG_HOOKS = 0xa800000, /* Interrupts enabled config */ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_SYSCALLS = 0xd800000, /* register system calls */ SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RACCTD = 0xf100000, /* start racctd*/ SI_SUB_LAST = 0xfffffff /* final initialization */ }; /* * Some enumerated orders; "ANY" sorts last. */ enum sysinit_elem_order { SI_ORDER_FIRST = 0x0000000, /* first*/ SI_ORDER_SECOND = 0x0000001, /* second*/ SI_ORDER_THIRD = 0x0000002, /* third*/ SI_ORDER_FOURTH = 0x0000003, /* fourth*/ SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ SI_ORDER_ANY = 0xfffffff /* last*/ };
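/*
 * Editor's note (sketch, not part of this change): startup entries sort
 * by subsystem first and by order within a subsystem, so the constants
 * added above arrange for pfil initialization at SI_SUB_PROTO_PFIL
 * (0x8100000) to run before any firewall initializes at SI_SUB_FW
 * (0x8806000); the pf DECLARE_MODULE() change in this revision relies on
 * exactly that.  A hypothetical firewall module would hook in as:
 *
 *	static void
 *	example_fw_init(const void *unused)
 *	{
 *		... pfil heads are already registered here ...
 *	}
 *	C_SYSINIT(examplefw, SI_SUB_FW, SI_ORDER_FIRST,
 *	    example_fw_init, NULL);
 */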
/* * A system initialization call instance * * At the moment there is one instance of sysinit. We probably do not * want two, which is why this code is if'd out, but we definitely want * to discern SYSINIT's which take non-constant data pointers and * SYSINIT's which take constant data pointers. * * The C_* macros take functions expecting const void * arguments * while the non-C_* macros take functions expecting just void * arguments. * * With -Wcast-qual on, the compiler issues warnings: * - if we pass non-const data or functions taking non-const data * to a C_* macro. * * - if we pass const data to the normal macros * * However, no warning is issued if we pass a function taking const data * through a normal non-const macro. This is ok because the function is * saying it won't modify the data so we don't care whether the data is * modifiable or not. */ typedef void (*sysinit_nfunc_t)(void *); typedef void (*sysinit_cfunc_t)(const void *); struct sysinit { enum sysinit_sub_id subsystem; /* subsystem identifier*/ enum sysinit_elem_order order; /* init order within subsystem*/ sysinit_cfunc_t func; /* function */ const void *udata; /* multiplexer/argument */ }; /* * Default: no special processing * * The C_ version of SYSINIT is for data pointers to const * data ( and functions taking data pointers to const data ). * At the moment it is no different from SYSINIT and thus * still results in warnings. * * The casts are necessary to have the compiler produce the * correct warnings when -Wcast-qual is used. * */ #define C_SYSINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_init = { \ subsystem, \ order, \ func, \ (ident) \ }; \ DATA_SET(sysinit_set,uniquifier ## _sys_init) #define SYSINIT(uniquifier, subsystem, order, func, ident) \ C_SYSINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)(ident)) /* * Called on module unload: no special processing */ #define C_SYSUNINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_uninit = { \ subsystem, \ order, \ func, \ (ident) \ }; \ DATA_SET(sysuninit_set,uniquifier ## _sys_uninit) #define SYSUNINIT(uniquifier, subsystem, order, func, ident) \ C_SYSUNINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)(ident)) void sysinit_add(struct sysinit **set, struct sysinit **set_end); /* * Infrastructure for tunable 'constants'. Value may be specified at compile * time or kernel load time. Rules relating tunables together can be placed * in a SYSINIT function at SI_SUB_TUNABLES with SI_ORDER_ANY. * * WARNING: developers should never use the reserved suffixes specified in * loader.conf(5) for any tunables or conflicts will result. */ /* * int * please avoid using for new tunables!
*/ extern void tunable_int_init(void *); struct tunable_int { const char *path; int *var; }; #define TUNABLE_INT(path, var) \ static struct tunable_int __CONCAT(__tunable_int_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_int_init, \ &__CONCAT(__tunable_int_, __LINE__)) #define TUNABLE_INT_FETCH(path, var) getenv_int((path), (var)) /* * long */ extern void tunable_long_init(void *); struct tunable_long { const char *path; long *var; }; #define TUNABLE_LONG(path, var) \ static struct tunable_long __CONCAT(__tunable_long_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_long_init,\ &__CONCAT(__tunable_long_, __LINE__)) #define TUNABLE_LONG_FETCH(path, var) getenv_long((path), (var)) /* * unsigned long */ extern void tunable_ulong_init(void *); struct tunable_ulong { const char *path; unsigned long *var; }; #define TUNABLE_ULONG(path, var) \ static struct tunable_ulong __CONCAT(__tunable_ulong_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_ulong_init, \ &__CONCAT(__tunable_ulong_, __LINE__)) #define TUNABLE_ULONG_FETCH(path, var) getenv_ulong((path), (var)) /* * quad */ extern void tunable_quad_init(void *); struct tunable_quad { const char *path; quad_t *var; }; #define TUNABLE_QUAD(path, var) \ static struct tunable_quad __CONCAT(__tunable_quad_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_quad_init, \ &__CONCAT(__tunable_quad_, __LINE__)) #define TUNABLE_QUAD_FETCH(path, var) getenv_quad((path), (var)) extern void tunable_str_init(void *); struct tunable_str { const char *path; char *var; int size; }; #define TUNABLE_STR(path, var, size) \ static struct tunable_str __CONCAT(__tunable_str_, __LINE__) = { \ (path), \ (var), \ (size), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_str_init, \ &__CONCAT(__tunable_str_, __LINE__)) #define TUNABLE_STR_FETCH(path, var, size) \ getenv_string((path), (var), (size)) struct intr_config_hook { TAILQ_ENTRY(intr_config_hook) ich_links; void (*ich_func)(void *arg); void *ich_arg; }; int config_intrhook_establish(struct intr_config_hook *hook); void config_intrhook_disestablish(struct intr_config_hook *hook); #endif /* !_SYS_KERNEL_H_*/
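Editorial addendum: a minimal sketch of how the tunable and SYSINIT machinery declared above is typically combined, following the comment that rules relating tunables together can run at SI_SUB_TUNABLES with SI_ORDER_ANY. The variable, function, and tunable path names are illustrative only and not part of this change:

	static int example_limit = 16;			/* compile-time default */
	TUNABLE_INT("example.limit", &example_limit);	/* loader.conf override */

	/* Runs after all tunables are fetched; clamps a bad loader value. */
	static void
	example_limit_check(void *unused)
	{

		if (example_limit < 1)
			example_limit = 1;
	}
	SYSINIT(example_limit_fixup, SI_SUB_TUNABLES, SI_ORDER_ANY,
	    example_limit_check, NULL);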