Index: head/sys/kern/kern_poll.c
===================================================================
--- head/sys/kern/kern_poll.c	(revision 185347)
+++ head/sys/kern/kern_poll.c	(revision 185348)
@@ -1,596 +1,597 @@
/*-
 * Copyright (c) 2001-2002 Luigi Rizzo
 *
 * Supported by: the Xorp Project (www.xorp.org)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_device_polling.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/socket.h>			/* needed by net/if.h		*/
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <net/if.h>			/* for IFF_* flags		*/
#include <net/netisr.h>			/* for NETISR_POLL		*/
#include <net/vnet.h>

#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/kthread.h>

static void netisr_poll(void);		/* the two netisr handlers	*/
static void netisr_pollmore(void);
static int poll_switch(SYSCTL_HANDLER_ARGS);

void hardclock_device_poll(void);	/* hook from hardclock		*/
void ether_poll(int);			/* polling in idle loop		*/

static struct mtx	poll_mtx;

/*
 * Polling support for [network] device drivers.
 *
 * Drivers which support this feature can register with the
 * polling code.
 *
 * If registration is successful, the driver must disable interrupts,
 * and further I/O is performed through the handler, which is invoked
 * (at least once per clock tick) with 3 arguments: the "arg" passed at
 * register time (a struct ifnet pointer), a command, and a "count" limit.
 *
 * The command can be one of the following:
 *  POLL_ONLY: quick move of "count" packets from input/output queues.
 *  POLL_AND_CHECK_STATUS: as above, plus check status registers or do
 *	other more expensive operations. This command is issued periodically
 *	but less frequently than POLL_ONLY.
 *
 * The count limit specifies how much work the handler can do during the
 * call -- typically this is the number of packets to be received, or
 * transmitted, etc. (drivers are free to interpret this number, as long
 * as the max time spent in the function grows roughly linearly with the
 * count).
 *
 * Polling is enabled and disabled via setting IFCAP_POLLING flag on
 * the interface. The driver ioctl handler should register interface
 * with polling and disable interrupts, if registration was successful.
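 *
 * A minimal sketch of that ioctl path. Everything named "foo" here is
 * hypothetical (the driver, its foo_poll() handler, the FOO_LOCK()/
 * foo_intr_*() helpers and the mask/sc/ifr locals), not part of this API:
 *
 *	case SIOCSIFCAP:
 *		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 *		if (mask & IFCAP_POLLING) {
 *			if (ifr->ifr_reqcap & IFCAP_POLLING) {
 *				error = ether_poll_register(foo_poll, ifp);
 *				if (error == 0) {
 *					FOO_LOCK(sc);
 *					foo_intr_disable(sc);
 *					ifp->if_capenable |= IFCAP_POLLING;
 *					FOO_UNLOCK(sc);
 *				}
 *			} else {
 *				error = ether_poll_deregister(ifp);
 *				FOO_LOCK(sc);
 *				foo_intr_enable(sc);
 *				ifp->if_capenable &= ~IFCAP_POLLING;
 *				FOO_UNLOCK(sc);
 *			}
 *		}
 *		break;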
 *
 * A second variable controls the sharing of CPU between polling/kernel
 * network processing, and other activities (typically userlevel tasks):
 * kern.polling.user_frac (between 0 and 100, default 50) sets the share
 * of CPU allocated to user tasks. CPU is allocated proportionally to the
 * shares, by dynamically adjusting the "count" (poll_burst).
 *
 * Other parameters should be left to their default values.
 * The following constraints hold
 *
 *	1 <= poll_each_burst <= poll_burst <= poll_burst_max
 *	0 <= poll_each_burst
 *	MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
 */

#define MIN_POLL_BURST_MAX	10
#define MAX_POLL_BURST_MAX	1000

static uint32_t poll_burst = 5;
static uint32_t poll_burst_max = 150;	/* good for 100Mbit net and HZ=1000 */
static uint32_t poll_each_burst = 5;

SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
	"Device polling parameters");

SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RD,
	&poll_burst, 0, "Current polling burst size");

static int
poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS)
{
	uint32_t val = poll_burst_max;
	int error;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);
	if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX)
		return (EINVAL);

	mtx_lock(&poll_mtx);
	poll_burst_max = val;
	if (poll_burst > poll_burst_max)
		poll_burst = poll_burst_max;
	if (poll_each_burst > poll_burst_max)
		poll_each_burst = MIN_POLL_BURST_MAX;
	mtx_unlock(&poll_mtx);

	return (0);
}
SYSCTL_PROC(_kern_polling, OID_AUTO, burst_max, CTLTYPE_UINT | CTLFLAG_RW,
	0, sizeof(uint32_t), poll_burst_max_sysctl, "I",
	"Max Polling burst size");

static int
poll_each_burst_sysctl(SYSCTL_HANDLER_ARGS)
{
	uint32_t val = poll_each_burst;
	int error;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);
	if (val < 1)
		return (EINVAL);

	mtx_lock(&poll_mtx);
	if (val > poll_burst_max) {
		mtx_unlock(&poll_mtx);
		return (EINVAL);
	}
	poll_each_burst = val;
	mtx_unlock(&poll_mtx);

	return (0);
}
SYSCTL_PROC(_kern_polling, OID_AUTO, each_burst, CTLTYPE_UINT | CTLFLAG_RW,
	0, sizeof(uint32_t), poll_each_burst_sysctl, "I",
	"Max size of each burst");

static uint32_t poll_in_idle_loop=0;	/* do we poll in idle loop ?
*/ SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW, &poll_in_idle_loop, 0, "Enable device polling in idle loop"); static uint32_t user_frac = 50; static int user_frac_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = user_frac; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 0 || val > 99) return (EINVAL); mtx_lock(&poll_mtx); user_frac = val; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, user_frac, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), user_frac_sysctl, "I", "Desired user fraction of cpu time"); static uint32_t reg_frac_count = 0; static uint32_t reg_frac = 20 ; static int reg_frac_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = reg_frac; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 1 || val > hz) return (EINVAL); mtx_lock(&poll_mtx); reg_frac = val; if (reg_frac_count >= reg_frac) reg_frac_count = 0; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, reg_frac, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), reg_frac_sysctl, "I", "Every this many cycles check registers"); static uint32_t short_ticks; SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RD, &short_ticks, 0, "Hardclock ticks shorter than they should be"); static uint32_t lost_polls; SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RD, &lost_polls, 0, "How many times we would have lost a poll tick"); static uint32_t pending_polls; SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RD, &pending_polls, 0, "Do we need to poll again"); static int residual_burst = 0; SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RD, &residual_burst, 0, "# of residual cycles in burst"); static uint32_t poll_handlers; /* next free entry in pr[]. */ SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD, &poll_handlers, 0, "Number of registered poll handlers"); static int polling = 0; SYSCTL_PROC(_kern_polling, OID_AUTO, enable, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), poll_switch, "I", "Switch polling for all interfaces"); static uint32_t phase; SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RD, &phase, 0, "Polling phase"); static uint32_t suspect; SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RD, &suspect, 0, "suspect event"); static uint32_t stalled; SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RD, &stalled, 0, "potential stalls"); static uint32_t idlepoll_sleeping; /* idlepoll is sleeping */ SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD, &idlepoll_sleeping, 0, "idlepoll is sleeping"); #define POLL_LIST_LEN 128 struct pollrec { poll_handler_t *handler; struct ifnet *ifp; }; static struct pollrec pr[POLL_LIST_LEN]; static void init_device_poll(void) { mtx_init(&poll_mtx, "polling", NULL, MTX_DEF); netisr_register(NETISR_POLL, (netisr_t *)netisr_poll, NULL, 0); netisr_register(NETISR_POLLMORE, (netisr_t *)netisr_pollmore, NULL, 0); } SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, NULL); /* * Hook from hardclock. Tries to schedule a netisr, but keeps track * of lost ticks due to the previous handler taking too long. * Normally, this should not happen, because polling handler should * run for a short time. However, in some cases (e.g. when there are * changes in link status etc.) the drivers take a very long time * (even in the order of milliseconds) to reset and reconfigure the * device, causing apparent lost polls. 
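 *
 * Those conditions are observable at runtime through the read-only
 * kern.polling.lost_polls, kern.polling.suspect and kern.polling.stalled
 * counters defined above.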
* * The first part of the code is just for debugging purposes, and tries * to count how often hardclock ticks are shorter than they should, * meaning either stray interrupts or delayed events. */ void hardclock_device_poll(void) { static struct timeval prev_t, t; int delta; if (poll_handlers == 0) return; microuptime(&t); delta = (t.tv_usec - prev_t.tv_usec) + (t.tv_sec - prev_t.tv_sec)*1000000; if (delta * hz < 500000) short_ticks++; else prev_t = t; if (pending_polls > 100) { /* * Too much, assume it has stalled (not always true * see comment above). */ stalled++; pending_polls = 0; phase = 0; } if (phase <= 2) { if (phase != 0) suspect++; phase = 1; schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE); phase = 2; } if (pending_polls++ > 0) lost_polls++; } /* * ether_poll is called from the idle loop. */ void ether_poll(int count) { int i; mtx_lock(&poll_mtx); if (count > poll_each_burst) count = poll_each_burst; for (i = 0 ; i < poll_handlers ; i++) pr[i].handler(pr[i].ifp, POLL_ONLY, count); mtx_unlock(&poll_mtx); } /* * netisr_pollmore is called after other netisr's, possibly scheduling * another NETISR_POLL call, or adapting the burst size for the next cycle. * * It is very bad to fetch large bursts of packets from a single card at once, * because the burst could take a long time to be completely processed, or * could saturate the intermediate queue (ipintrq or similar) leading to * losses or unfairness. To reduce the problem, and also to account better for * time spent in network-related processing, we split the burst in smaller * chunks of fixed size, giving control to the other netisr's between chunks. * This helps in improving the fairness, reducing livelock (because we * emulate more closely the "process to completion" that we have with * fastforwarding) and accounting for the work performed in low level * handling and forwarding. */ static struct timeval poll_start_t; void netisr_pollmore() { struct timeval t; int kern_load; mtx_lock(&poll_mtx); phase = 5; if (residual_burst > 0) { schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE); mtx_unlock(&poll_mtx); /* will run immediately on return, followed by netisrs */ return; } /* here we can account time spent in netisr's in this tick */ microuptime(&t); kern_load = (t.tv_usec - poll_start_t.tv_usec) + (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */ kern_load = (kern_load * hz) / 10000; /* 0..100 */ if (kern_load > (100 - user_frac)) { /* try decrease ticks */ if (poll_burst > 1) poll_burst--; } else { if (poll_burst < poll_burst_max) poll_burst++; } pending_polls--; if (pending_polls == 0) /* we are done */ phase = 0; else { /* * Last cycle was long and caused us to miss one or more * hardclock ticks. Restart processing again, but slightly * reduce the burst size to prevent that this happens again. */ poll_burst -= (poll_burst / 8); if (poll_burst < 1) poll_burst = 1; schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE); phase = 6; } mtx_unlock(&poll_mtx); } /* * netisr_poll is scheduled by schednetisr when appropriate, typically once * per tick. */ static void netisr_poll(void) { int i, cycles; enum poll_cmd arg = POLL_ONLY; mtx_lock(&poll_mtx); phase = 3; if (residual_burst == 0) { /* first call in this tick */ microuptime(&poll_start_t); if (++reg_frac_count == reg_frac) { arg = POLL_AND_CHECK_STATUS; reg_frac_count = 0; } residual_burst = poll_burst; } cycles = (residual_burst < poll_each_burst) ? 
residual_burst : poll_each_burst; residual_burst -= cycles; for (i = 0 ; i < poll_handlers ; i++) pr[i].handler(pr[i].ifp, arg, cycles); phase = 4; mtx_unlock(&poll_mtx); } /* * Try to register routine for polling. Returns 0 if successful * (and polling should be enabled), error code otherwise. * A device is not supposed to register itself multiple times. * * This is called from within the *_ioctl() functions. */ int ether_poll_register(poll_handler_t *h, struct ifnet *ifp) { int i; KASSERT(h != NULL, ("%s: handler is NULL", __func__)); KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); mtx_lock(&poll_mtx); if (poll_handlers >= POLL_LIST_LEN) { /* * List full, cannot register more entries. * This should never happen; if it does, it is probably a * broken driver trying to register multiple times. Checking * this at runtime is expensive, and won't solve the problem * anyways, so just report a few times and then give up. */ static int verbose = 10 ; if (verbose >0) { log(LOG_ERR, "poll handlers list full, " "maybe a broken driver ?\n"); verbose--; } mtx_unlock(&poll_mtx); return (ENOMEM); /* no polling for you */ } for (i = 0 ; i < poll_handlers ; i++) if (pr[i].ifp == ifp && pr[i].handler != NULL) { mtx_unlock(&poll_mtx); log(LOG_DEBUG, "ether_poll_register: %s: handler" " already registered\n", ifp->if_xname); return (EEXIST); } pr[poll_handlers].handler = h; pr[poll_handlers].ifp = ifp; poll_handlers++; mtx_unlock(&poll_mtx); if (idlepoll_sleeping) wakeup(&idlepoll_sleeping); return (0); } /* * Remove interface from the polling list. Called from *_ioctl(), too. */ int ether_poll_deregister(struct ifnet *ifp) { int i; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); mtx_lock(&poll_mtx); for (i = 0 ; i < poll_handlers ; i++) if (pr[i].ifp == ifp) /* found it */ break; if (i == poll_handlers) { log(LOG_DEBUG, "ether_poll_deregister: %s: not found!\n", ifp->if_xname); mtx_unlock(&poll_mtx); return (ENOENT); } poll_handlers--; if (i < poll_handlers) { /* Last entry replaces this one. */ pr[i].handler = pr[poll_handlers].handler; pr[i].ifp = pr[poll_handlers].ifp; } mtx_unlock(&poll_mtx); return (0); } /* * Legacy interface for turning polling on all interfaces at one time. */ static int poll_switch(SYSCTL_HANDLER_ARGS) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int error; int val = polling; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val == polling) return (0); if (val < 0 || val > 1) return (EINVAL); polling = val; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_capabilities & IFCAP_POLLING) { struct ifreq ifr; if (val == 1) ifr.ifr_reqcap = ifp->if_capenable | IFCAP_POLLING; else ifr.ifr_reqcap = ifp->if_capenable & ~IFCAP_POLLING; IFF_LOCKGIANT(ifp); /* LOR here */ (void) (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr); IFF_UNLOCKGIANT(ifp); } } IFNET_RUNLOCK(); log(LOG_ERR, "kern.polling.enable is deprecated. 
Use ifconfig(8)"); return (0); } static void poll_idle(void) { struct thread *td = curthread; struct rtprio rtp; rtp.prio = RTP_PRIO_MAX; /* lowest priority */ rtp.type = RTP_PRIO_IDLE; PROC_SLOCK(td->td_proc); rtp_to_pri(&rtp, td); PROC_SUNLOCK(td->td_proc); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; ether_poll(poll_each_burst); thread_lock(td); mi_switch(SW_VOL, NULL); thread_unlock(td); } else { idlepoll_sleeping = 1; tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3); } } } static struct proc *idlepoll; static struct kproc_desc idlepoll_kp = { "idlepoll", poll_idle, &idlepoll }; SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp); Index: head/sys/kern/kern_sysctl.c =================================================================== --- head/sys/kern/kern_sysctl.c (revision 185347) +++ head/sys/kern/kern_sysctl.c (revision 185348) @@ -1,1652 +1,1655 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * Locking - this locks the sysctl tree in memory. 
*/ static struct sx sysctllock; #define SYSCTL_LOCK() sx_xlock(&sysctllock) #define SYSCTL_UNLOCK() sx_xunlock(&sysctllock) #define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock") static int sysctl_root(SYSCTL_HANDLER_ARGS); struct sysctl_oid_list sysctl__children; /* root list */ static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SLIST_FOREACH(oidp, list, oid_link) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; /* * First check if another oid with the same name already * exists in the parent's list. */ p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { printf("can't re-use a leaf (%s)!\n", p->oid_name); return; } } /* * If this oid has a number OID_AUTO, give it a number which * is greater than any current oid. * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accomodate e.g. net.inet.raw as a static sysctl node. */ if (oidp->oid_number == OID_AUTO) { static int newoid = CTL_AUTO_START; oidp->oid_number = newoid++; if (newoid == 0x7fffffff) panic("out of oids"); } #if 0 else if (oidp->oid_number >= CTL_AUTO_START) { /* do not panic; this happens when unregistering sysctl sets */ printf("static sysctl oid too high: %d", oidp->oid_number); } #endif /* * Insert the oid into the parent's list in order. */ q = NULL; SLIST_FOREACH(p, parent, oid_link) { if (oidp->oid_number < p->oid_number) break; q = p; } if (q) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; error = ENOENT; if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) printf("%s: failed to unregister sysctl\n", __func__); } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occured. 
* e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) return(EBUSY); /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if(e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); free(e, M_SYSCTLOID); return (0); } else return (ENOENT); } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p; int error; if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("can't remove non-dynamic nodes!\n"); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) { if (!recurse) return (ENOTEMPTY); error = sysctl_remove_oid(p, del, recurse); if (error) return (error); } if (del) free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID); } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { if (oidp->oid_descr) free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID); free((void *)(uintptr_t)(const void *)oidp->oid_name, M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, int arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) { struct sysctl_oid *oidp; ssize_t len; char *newname; /* You have to hook up somewhere.. 
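 *
 * A sketch of typical use, with hypothetical "foo" names; most callers
 * go through wrapper macros such as SYSCTL_ADD_INT() rather than calling
 * sysctl_add_oid() directly:
 *
 *	sysctl_ctx_init(&foo_ctx);
 *	SYSCTL_ADD_INT(&foo_ctx, SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
 *	    "foo", CTLFLAG_RW, &foo_value, 0, "example tunable");
 *	...
 *	sysctl_ctx_free(&foo_ctx);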
*/ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); return (oidp); } else { printf("can't re-use a leaf (%s)!\n", name); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_NEXT(oidp, oid_link) = NULL; oidp->oid_number = number; oidp->oid_refcnt = 1; len = strlen(name); newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK); bcopy(name, newname, len + 1); newname[len] = '\0'; oidp->oid_name = newname; oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; if ((kind & CTLTYPE) == CTLTYPE_NODE) { /* Allocate space for children */ SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list), M_SYSCTLOID, M_WAITOK)); SLIST_INIT(SYSCTL_CHILDREN(oidp)); } else { oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; } oidp->oid_fmt = fmt; if (descr) { int len = strlen(descr) + 1; oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK); if (oidp->oid_descr) strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr); } /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { ssize_t len; char *newname; void *oldname; oldname = (void *)(uintptr_t)(const void *)oidp->oid_name; len = strlen(name); newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK); bcopy(name, newname, len + 1); newname[len] = '\0'; oidp->oid_name = newname; free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; if (oid->oid_parent == parent) return (0); oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) return (EEXIST); sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; SYSCTL_INIT(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0); /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * * {0,0} printf the entire MIB-tree. * {0,1,...} return the name of the "..." OID. * {0,2,...} return the next OID. * {0,3} return the OID of the name in "new" * {0,4,...} return the kind & format info for the "..." OID. * {0,5,...} return the description the "..." OID. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SLIST_FOREACH(oidp, l, oid_link) { for (k=0; koid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( oidp->oid_arg1, i+2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_QUAD: printf(" Quad\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); return (ENOENT); } SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; char buf[10]; while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) return (error); namelen--; name++; continue; } lsp2 = 0; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) return (error); namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = (struct sysctl_oid_list *)oid->oid_arg1; break; } lsp = lsp2; } return (SYSCTL_OUT(req, "", 1)); } static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; *len = level; SLIST_FOREACH(oidp, lsp, oid_link) { *next = oidp->oid_number; *oidpp = oidp; if (oidp->oid_kind & CTLFLAG_SKIP) continue; if (!namelen) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) /* We really should call the handler here...*/ return (0); lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, len, level+1, oidpp)) return (0); goto emptynode; } if (oidp->oid_number < *name) continue; if (oidp->oid_number > *name) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) return (0); lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); goto next; } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) continue; if (oidp->oid_handler) continue; lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); next: namelen = 1; emptynode: *len = level; } return (1); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; int newoid[CTL_MAXNAME]; i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); if (i) return (ENOENT); error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); static int name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) { int i; struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; 
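	/*
	 * Translate a dotted name (e.g. "kern.maxproc") into a numeric OID
	 * by matching one component per tree level, descending through
	 * CTLTYPE_NODE entries until the whole name is consumed.
	 */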
char *p; if (!*name) return (ENOENT); p = name + strlen(name) - 1 ; if (*p == '.') *p = '\0'; *len = 0; for (p = name; *p && *p != '.'; p++) ; i = *p; if (i == '.') *p = '\0'; oidp = SLIST_FIRST(lsp); while (oidp && *len < CTL_MAXNAME) { if (strcmp(name, oidp->oid_name)) { oidp = SLIST_NEXT(oidp, oid_link); continue; } *oid++ = oidp->oid_number; (*len)++; if (!i) { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = (struct sysctl_oid_list *)oidp->oid_arg1; oidp = SLIST_FIRST(lsp); name = p+1; for (p = name; *p && *p != '.'; p++) ; i = *p; if (i == '.') *p = '\0'; } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len; struct sysctl_oid *op = 0; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; error = name2oid(p, oid, &len, &op); free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error; error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) return (error); if (!oid->oid_fmt) return (ENOENT); error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) return (error); error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); return (error); } static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error; error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) return (error); if (!oid->oid_descr) return (ENOENT); error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); return (error); } static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, ""); /* * Default "handler" functions. */ /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on on sysctl_handle_int() convert milliseconds into ticks. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)oidp->oid_arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)oidp->oid_arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. arg1 points to it. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. 
*/ if (!arg1) return (EINVAL); tmplong = *(long *)arg1; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } else #endif error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. arg1 points to it. */ int sysctl_handle_quad(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (!arg1) return (EINVAL); tmpout = *(uint64_t *)arg1; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { int error=0; char *tmparg; size_t outlen; /* * Attempt to get a coherent snapshot by copying to a * temporary kernel buffer. */ retry: outlen = strlen((char *)arg1)+1; tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { free(tmparg, M_SYSCTLTMP); goto retry; } error = SYSCTL_OUT(req, tmparg, outlen); free(tmparg, M_SYSCTLTMP); if (error || !req->newptr) return (error); if ((req->newlen - req->newidx) >= arg2) { error = EINVAL; } else { arg2 = (req->newlen - req->newidx); error = SYSCTL_IN(req, arg1, arg2); ((char *)arg1)[arg2] = '\0'; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Transfer functions to/from kernel space. 
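 *
 * A sketch of typical in-kernel use via kernel_sysctlbyname() (defined
 * below); the variable names are illustrative and error handling is
 * elided:
 *
 *	int val;
 *	size_t len = sizeof(val);
 *
 *	error = kernel_sysctlbyname(curthread, "kern.maxproc", &val, &len,
 *	    NULL, 0, NULL, 0);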
* XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_LOCKED; SYSCTL_LOCK(); error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); SYSCTL_UNLOCK(); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = 0; /* sysctl internal magic */ oid[1] = 3; /* name2oid */ oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { int error = 0; size_t i, len, origidx; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; error = copyout(p, (char *)req->oldptr + origidx, i); } if (error) return (error); if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t i, wiredlen; char *cp, dummy; wiredlen = (len > 0 && len < req->oldlen) ? 
len : req->oldlen; ret = 0; if (req->lock == REQ_LOCKED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } /* * Touch all the wired pages to avoid PTE modified * bit emulation traps on Alpha while holding locks * in the sysctl handler. */ for (i = (wiredlen + PAGE_SIZE - 1) / PAGE_SIZE, cp = req->oldptr; i > 0; i--, cp += PAGE_SIZE) { copyin(cp, &dummy, 1); copyout(&dummy, cp, 1); } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid *oid; int indx; oid = SLIST_FIRST(&sysctl__children); indx = 0; while (oid && indx < CTL_MAXNAME) { if (oid->oid_number == name[indx]) { indx++; if (oid->oid_kind & CTLFLAG_NOLOCK) req->lock = REQ_UNLOCKED; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; return (0); } oid = SLIST_FIRST( (struct sysctl_oid_list *)oid->oid_arg1); } else if (indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; return (0); } else { return (ENOTDIR); } } else { oid = SLIST_NEXT(oid, oid_link); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error, indx, lvl; error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) return (error); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) return (EISDIR); } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) return (EPERM); KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) return (error); } /* Is this sysctl writable by only privileged users? 
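 * CTLFLAG_ANYBODY skips the check entirely; CTLFLAG_PRISON requires only
 * the weaker PRIV_SYSCTL_WRITEJAIL privilege, so a jailed root may write.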
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { if (oid->oid_kind & CTLFLAG_PRISON) error = priv_check(req->td, PRIV_SYSCTL_WRITEJAIL); else error = priv_check(req->td, PRIV_SYSCTL_WRITE); if (error) return (error); } if (!oid->oid_handler) return (EINVAL); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) return (error); #endif error = oid->oid_handler(oid, arg1, arg2, req); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int __sysctl(struct thread *td, struct sysctl_args *uap) { int error, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); mtx_lock(&Giant); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) goto done2; if (uap->oldlenp) { int i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) error = i; } done2: mtx_unlock(&Giant); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; if (old) { if (!useracc(old, req.oldlen, VM_PROT_WRITE)) return (EFAULT); req.oldptr= old; } if (new != NULL) { if (!useracc(new, newlen, VM_PROT_READ)) return (EFAULT); req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_LOCKED; SYSCTL_LOCK(); + CURVNET_SET(TD_TO_VNET(curthread)); do { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); } while (error == EAGAIN); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); + CURVNET_RESTORE(); SYSCTL_UNLOCK(); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } #ifdef COMPAT_43 #include #include #define KINFO_PROC (0<<8) #define KINFO_RT (1<<8) #define KINFO_VNODE (2<<8) #define KINFO_FILE (3<<8) #define KINFO_METER (4<<8) #define KINFO_LOADAVG (5<<8) #define KINFO_CLOCKRATE (6<<8) /* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ #define KINFO_BSDI_SYSINFO (101<<8) /* * XXX this is bloat, but I hope it's better here than on the potentially * limited kernel stack... 
-Peter */ static struct { int bsdi_machine; /* "i386" on BSD/386 */ /* ^^^ this is an offset to the string, relative to the struct start */ char *pad0; long pad1; long pad2; long pad3; u_long pad4; u_long pad5; u_long pad6; int bsdi_ostype; /* "BSD/386" on BSD/386 */ int bsdi_osrelease; /* "1.1" on BSD/386 */ long pad7; long pad8; char *pad9; long pad10; long pad11; int pad12; long pad13; quad_t pad14; long pad15; struct timeval pad16; /* we dont set this, because BSDI's uname used gethostname() instead */ int bsdi_hostname; /* hostname on BSD/386 */ /* the actual string data is appended here */ } bsdi_si; /* * this data is appended to the end of the bsdi_si structure during copyout. * The "char *" offsets are relative to the base of the bsdi_si struct. * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings * should not exceed the length of the buffer here... (or else!! :-) */ static char bsdi_strings[80]; /* It had better be less than this! */ #ifndef _SYS_SYSPROTO_H_ struct getkerninfo_args { int op; char *where; size_t *size; int arg; }; #endif int ogetkerninfo(struct thread *td, struct getkerninfo_args *uap) { int error, name[6]; size_t size; u_int needed = 0; mtx_lock(&Giant); switch (uap->op & 0xff00) { case KINFO_RT: name[0] = CTL_NET; name[1] = PF_ROUTE; name[2] = 0; name[3] = (uap->op & 0xff0000) >> 16; name[4] = uap->op & 0xff; name[5] = uap->arg; error = userland_sysctl(td, name, 6, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_VNODE: name[0] = CTL_KERN; name[1] = KERN_VNODE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_PROC: name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = uap->op & 0xff; name[3] = uap->arg; error = userland_sysctl(td, name, 4, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_FILE: name[0] = CTL_KERN; name[1] = KERN_FILE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_METER: name[0] = CTL_VM; name[1] = VM_TOTAL; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_LOADAVG: name[0] = CTL_VM; name[1] = VM_LOADAVG; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_CLOCKRATE: name[0] = CTL_KERN; name[1] = KERN_CLOCKRATE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_BSDI_SYSINFO: { /* * this is pretty crude, but it's just enough for uname() * from BSDI's 1.x libc to work. * * *size gives the size of the buffer before the call, and * the amount of data copied after a successful call. * If successful, the return value is the amount of data * available, which can be larger than *size. * * BSDI's 2.x product apparently fails with ENOMEM if *size * is too small. */ u_int left; char *s; bzero((char *)&bsdi_si, sizeof(bsdi_si)); bzero(bsdi_strings, sizeof(bsdi_strings)); s = bsdi_strings; bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, ostype); s += strlen(s) + 1; bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, osrelease); s += strlen(s) + 1; bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, machine); s += strlen(s) + 1; needed = sizeof(bsdi_si) + (s - bsdi_strings); if ((uap->where == NULL) || (uap->size == NULL)) { /* process is asking how much buffer to supply.. 
*/ size = needed; error = 0; break; } if ((error = copyin(uap->size, &size, sizeof(size))) != 0) break; /* if too much buffer supplied, trim it down */ if (size > needed) size = needed; /* how much of the buffer is remaining */ left = size; if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) break; /* is there any point in continuing? */ if (left > sizeof(bsdi_si)) { left -= sizeof(bsdi_si); error = copyout(&bsdi_strings, uap->where + sizeof(bsdi_si), left); } break; } default: error = EOPNOTSUPP; break; } if (error == 0) { td->td_retval[0] = needed ? needed : size; if (uap->size) { error = copyout(&size, uap->size, sizeof(size)); } } mtx_unlock(&Giant); return (error); } #endif /* COMPAT_43 */ Index: head/sys/net/bpf.c =================================================================== --- head/sys/net/bpf.c (revision 185347) +++ head/sys/net/bpf.c (revision 185348) @@ -1,2151 +1,2158 @@ /*- * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_mac.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). 
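 *
 * For example, an Ethernet driver typically registers a single DLT at
 * attach time, while a wireless driver may add a second one for radiotap
 * headers (a sketch; sc_drvbpf and sc_tx_th are hypothetical softc
 * fields):
 *
 *	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 *	bpfattach2(ifp, DLT_IEEE802_11_RADIO,
 *	    sizeof(struct ieee80211_frame) + sizeof(sc->sc_tx_th),
 *	    &sc->sc_drvbpf);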
*/ static LIST_HEAD(, bpf_if) bpf_iflist; static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { 1, NULL, filt_bpfdetach, filt_bpfread }; /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, et. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. 
*/ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. 
*/ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len - hlen > ifp->if_mtu) return (EMSGSIZE); if ((unsigned)len > MJUM16BYTES) return (EIO); if (len <= MHLEN) MGETHDR(m, M_WAIT, MT_DATA); else if (len <= MCLBYTES) m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR); else m = m_getjcl(M_WAIT, MT_DATA, M_PKTHDR, #if (MJUMPAGESIZE > MCLBYTES) len <= MJUMPAGESIZE ? MJUMPAGESIZE : #endif (len <= MJUM9BYTES ? MJUM9BYTES : MJUM16BYTES)); m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; *mp = m; if (m->m_len < hlen) { error = EPERM; goto bad; } error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(m->m_data, sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach file to the bpf interface, i.e. make d listen on bp. */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { /* * Point d at bp, and add d to the interface's list of listeners. * Finally, point the driver's bpf cookie at the interface so * it will divert packets to bpf. */ BPFIF_LOCK(bp); d->bd_bif = bp; LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); bpf_bpfd_cnt++; BPFIF_UNLOCK(bp); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { int error; struct bpf_if *bp; struct ifnet *ifp; bp = d->bd_bif; BPFIF_LOCK(bp); BPFD_LOCK(d); ifp = d->bd_bif->bif_ifp; /* * Remove d from the interface's descriptor list. */ LIST_REMOVE(d, bd_next); bpf_bpfd_cnt--; d->bd_bif = NULL; BPFD_UNLOCK(d); BPFIF_UNLOCK(bp); /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. */ if (d->bd_promisc) { d->bd_promisc = 0; + CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); + CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. 
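The CURVNET_SET()/CURVNET_RESTORE() bracket added around ifpromisc() above is the standard virtual-network-stack idiom introduced by this revision: select the vnet the interface belongs to before calling code that touches per-vnet (V_-prefixed) state, and restore the caller's vnet afterwards. Schematically:

        CURVNET_SET(ifp->if_vnet);      /* make ifp's vnet the current one */
        error = ifpromisc(ifp, 0);      /* may reference V_ globals */
        CURVNET_RESTORE();              /* back to the previous vnet */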
*/ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); mtx_lock(&bpf_mtx); if (d->bd_bif) bpf_detachd(d); mtx_unlock(&bpf_mtx); selwakeuppri(&d->bd_sel, PRINET); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ knlist_destroy(&d->bd_sel.si_note); bpf_freed(d); free(d, M_BPF); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. */ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init(&d->bd_callout, CALLOUT_MPSAFE); knlist_init(&d->bd_sel.si_note, &d->bd_mtx, NULL, NULL, NULL); return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int timed_out; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if ((d->bd_immediate || timed_out) && d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. * Rotate the buffers and return what's here. */ ROTATE_BUFFERS(d); break; } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (ioflag & O_NONBLOCK) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_mtx, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. 
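ROTATE_BUFFERS() referenced above shuffles the classic three-buffer scheme; approximately (see the macro definition for the exact text):

        d->bd_hbuf = d->bd_sbuf;        /* store buffer becomes hold buffer */
        d->bd_hlen = d->bd_slen;
        d->bd_sbuf = d->bd_fbuf;        /* free buffer becomes store buffer */
        d->bd_slen = 0;
        d->bd_fbuf = NULL;              /* free slot is now empty */

plus a bpf_bufheld() call so the buffer model is told a buffer has moved into the hold position.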
* * XXXRW: More synchronization needed here: what if a second thread * issues a read on the same fd at the same time? Don't want this * getting invalidated. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } BPFD_UNLOCK(d); } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; struct sockaddr dst; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); d->bd_pid = curthread->td_proc->p_pid; d->bd_wcount++; if (d->bd_bif == NULL) { d->bd_wdcount++; return (ENXIO); } ifp = d->bd_bif->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { d->bd_wdcount++; return (ENETDOWN); } if (uio->uio_resid == 0) { d->bd_wdcount++; return (0); } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); if (error) { d->bd_wdcount++; return (error); } d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_DONTWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ #ifdef MAC BPFD_LOCK(d); + CURVNET_SET(ifp->if_vnet); mac_bpfdesc_create_mbuf(d, m); + CURVNET_RESTORE(); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); BPFD_UNLOCK(d); #endif error = (*ifp->if_output)(ifp, m, &dst, NULL); if (error) d->bd_wdcount++; if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the * receive and drop counts. */ static void reset_d(struct bpf_d *d) { mtx_assert(&d->bd_mtx, MA_OWNED); if (d->bd_hbuf) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; bpf_buf_reclaimed(d); } d->bd_slen = 0; d->bd_hlen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; d->bd_wcount = 0; d->bd_wfcount = 0; d->bd_wdcount = 0; d->bd_zcopy = 0; } /* * FIONREAD Check for read packet available. * SIOCGIFADDR Get interface address - convenient hook to driver. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. 
* BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); d->bd_pid = td->td_proc->p_pid; if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: case BIOCGETIF: case BIOCGRTIMEOUT: case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } + CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } case SIOCGIFADDR: { struct ifnet *ifp; if (d->bd_bif == NULL) error = EINVAL; else { ifp = d->bd_bif->bif_ifp; error = (*ifp->if_ioctl)(ifp, cmd, addr); } break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: *(u_int *)addr = d->bd_bufsize; break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. */ case BIOCGDLT: if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; break; /* * Get a list of supported data link types. */ case BIOCGDLTLIST: if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); break; /* * Set data link type. */ case BIOCSDLT: if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); break; /* * Get interface name. */ case BIOCGETIF: if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } break; /* * Set interface. */ case BIOCSETIF: error = bpf_setif(d, (struct ifreq *)addr); break; /* * Set read timeout. */ case BIOCSRTIMEOUT: { struct timeval *tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. 
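Userspace drives the timeout set just above in the obvious way (sketch; fd is an open bpf descriptor):

        struct timeval tv = { 0, 250000 };      /* 250 ms read timeout */

        if (ioctl(fd, BIOCSRTIMEOUT, (caddr_t)&tv) < 0)
                err(1, "BIOCSRTIMEOUT");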
*/ case BIOCGRTIMEOUT: { struct timeval *tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: d->bd_immediate = *(u_int *)addr; break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: *(u_int *)addr = d->bd_hdrcmplt; break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; break; /* * Get packet direction flag */ case BIOCGDIRECTION: *(u_int *)addr = d->bd_direction; break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: d->bd_direction = direction; break; default: error = EINVAL; } } break; case BIOCFEEDBACK: d->bd_feedback = *(u_int *)addr; break; case BIOCLOCK: d->bd_locked = 1; break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ d->bd_async = *(int *)addr; break; case FIOSETOWN: error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: *(int *)addr = fgetown(&d->bd_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else d->bd_sig = sig; break; } case BIOCGRSIG: *(u_int *)addr = d->bd_sig; break; case BIOCGETBUFMODE: *(u_int *)addr = d->bd_bufmode; break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: return (bpf_ioctl_getzmax(td, d, (size_t *)addr)); case BIOCSETZBUF: return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr)); case BIOCROTZBUF: return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr)); } + CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. 
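A userspace caller exercising this path might install a filter that accepts only IPv4 ethernet frames (sketch; macros and constants from <net/bpf.h> and <net/ethernet.h>):

        struct bpf_insn insns[] = {
                BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),     /* load ethertype */
                BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ETHERTYPE_IP, 0, 1),
                BPF_STMT(BPF_RET+BPF_K, (u_int)-1),     /* accept whole packet */
                BPF_STMT(BPF_RET+BPF_K, 0),             /* reject */
        };
        struct bpf_program prog = { 4, insns };

        if (ioctl(fd, BIOCSETF, (caddr_t)&prog) < 0)
                err(1, "BIOCSETF");

bpf_setf() copies the instructions in with copyin(), runs bpf_validate() over them, and only then swaps the program in under BPFD_LOCK().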
*/ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { struct bpf_insn *fcode, *old; u_int wfilter, flen, size; #ifdef BPF_JITTER bpf_jit_filter *ofunc; #endif if (cmd == BIOCSETWF) { old = d->bd_wfilter; wfilter = 1; #ifdef BPF_JITTER ofunc = NULL; #endif } else { wfilter = 0; old = d->bd_rfilter; #ifdef BPF_JITTER ofunc = d->bd_bfilter; #endif } if (fp->bf_insns == NULL) { if (fp->bf_len != 0) return (EINVAL); BPFD_LOCK(d); if (wfilter) d->bd_wfilter = NULL; else { d->bd_rfilter = NULL; #ifdef BPF_JITTER d->bd_bfilter = NULL; #endif if (cmd == BIOCSETF) reset_d(d); } BPFD_UNLOCK(d); if (old != NULL) free((caddr_t)old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif return (0); } flen = fp->bf_len; if (flen > bpf_maxinsns) return (EINVAL); size = flen * sizeof(*fp->bf_insns); fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK); if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 && bpf_validate(fcode, (int)flen)) { BPFD_LOCK(d); if (wfilter) d->bd_wfilter = fcode; else { d->bd_rfilter = fcode; #ifdef BPF_JITTER d->bd_bfilter = bpf_jitter(fcode, flen); #endif if (cmd == BIOCSETF) reset_d(d); } BPFD_UNLOCK(d); if (old != NULL) free((caddr_t)old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif return (0); } free((caddr_t)fcode, M_BPF); return (EINVAL); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * Behavior here depends on the buffering model. If we're using * kernel memory buffers, then we can allocate them here. If we're * using zero-copy, then the user process must have registered * buffers by the time we get here. If not, return an error. * * XXXRW: There are locking issues here with multi-threaded use: what * if two threads try to set the interface at once? */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: if (d->bd_sbuf == NULL) bpf_buffer_alloc(d); KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL")); break; case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) { if (d->bd_bif) /* * Detach if attached to something else. */ bpf_detachd(d); bpf_attachd(d, bp); } BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); d->bd_pid = td->td_proc->p_pid; if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. 
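When a timeout is configured, poll()/select() arm bd_callout here rather than sleeping forever: bpf_timed_out() later flips bd_state to BPF_TIMED_OUT, and bpf_ready() treats a timed-out descriptor with a non-empty store buffer as readable, so the selecting thread is woken even though the hold buffer never filled.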
*/ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0 || kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; if (d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct bpf_d *d; u_int slen; int gottime; struct timeval tv; gottime = 0; BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { BPFD_LOCK(d); ++d->bd_rcount; /* * NB: We don't call BPF_CHECK_DIRECTION() here since there is no * way for the caller to indicate to us whether this packet * is inbound or outbound. In the bpf_mtap() routines, we use * the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL) slen = (*(d->bd_bfilter->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct bpf_d *d; u_int pktlen, slen; int gottime; struct timeval tv; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } gottime = 0; pktlen = m_length(m, NULL); BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; BPFD_LOCK(d); ++d->bd_rcount; #ifdef BPF_JITTER /* XXX We cannot handle multiple mbufs.
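The native code emitted by bpf_jitter() reads the packet as one contiguous byte array, so it is only used below when m->m_next == NULL; longer chains fall back to the bpf_filter() interpreter, which is passed buflen == 0 to signal that its buffer argument is really an mbuf chain it must walk itself.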
*/ if (bpf_jitter_enable != 0 && d->bd_bfilter != NULL && m->m_next == NULL) slen = (*(d->bd_bfilter->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; struct timeval tv; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } gottime = 0; pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; BPFD_LOCK(d); ++d->bd_rcount; slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } #undef BPF_CHECK_DIRECTION /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *tv) { struct bpf_hdr hdr; int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; int do_wakeup = 0; BPFD_LOCK_ASSERT(d); /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. 
*/ bpf_buffull(d); ++d->bd_dcount; return; } ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has already * expired during a select call. A packet arrived, so the * reader should be woken up. */ do_wakeup = 1; /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); hdr.bh_tstamp = *tv; hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = totlen - hdrlen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ bpf_free(d); if (d->bd_rfilter) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER bpf_destroy_jit_filter(d->bd_bfilter); #endif } if (d->bd_wfilter) free((caddr_t)d->bd_wfilter, M_BPF); mtx_destroy(&d->bd_mtx); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO); if (bp == NULL) panic("bpfattach"); LIST_INIT(&bp->bif_dlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF); KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); *driverp = bp; mtx_lock(&bpf_mtx); LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); mtx_unlock(&bpf_mtx); /* * Compute the length of the bpf header. This is not necessarily * equal to SIZEOF_BPF_HDR because we want to insert spacing such * that the network layer header begins on a longword boundary (for * performance reasons and to alleviate alignment restrictions). */ bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; if (bootverbose) if_printf(ifp, "bpf attached\n"); } /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface, and leaving bd_bif NULL. Notify each * descriptor as it's detached so that any sleepers wake up and get * ENXIO. 
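Incidentally, the padding baked into bif_hdrlen above is what allows a consumer to walk a captured buffer with the classic loop from bpf(4) (userspace sketch; handle() is a hypothetical callback, cc the byte count returned by read()):

        char *p = buf;

        while (p < buf + cc) {
                struct bpf_hdr *bh = (struct bpf_hdr *)p;

                handle((u_char *)p + bh->bh_hdrlen, bh->bh_caplen);
                p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
        }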
*/ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp; struct bpf_d *d; /* Locate BPF interface information */ mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (ifp == bp->bif_ifp) break; } /* Interface wasn't attached */ if ((bp == NULL) || (bp->bif_ifp == NULL)) { mtx_unlock(&bpf_mtx); printf("bpfdetach: %s was not attached\n", ifp->if_xname); return; } LIST_REMOVE(bp, bif_next); mtx_unlock(&bpf_mtx); while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } mtx_destroy(&bp->bif_mtx); free(bp, M_BPF); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { int n, error; struct ifnet *ifp; struct bpf_if *bp; ifp = d->bd_bif->bif_ifp; n = 0; error = 0; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { if (n >= bfl->bfl_len) { mtx_unlock(&bpf_mtx); return (ENOMEM); } error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); } n++; } mtx_unlock(&bpf_mtx); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } mtx_unlock(&bpf_mtx); if (bp != NULL) { opromisc = d->bd_promisc; bpf_detachd(d); bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "bpf_setdlt: ifpromisc failed (%d)\n", error); else d->bd_promisc = 1; } } return (bp == NULL ? EINVAL : 0); } static void bpf_drvinit(void *unused) { struct cdev *dev; mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { bzero(d, sizeof(*d)); BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = bd->bd_rcount; d->bd_dcount = bd->bd_dcount; d->bd_fcount = bd->bd_fcount; d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = bd->bd_wcount; d->bd_wdcount = bd->bd_wdcount; d->bd_wfcount = bd->bd_wfcount; d->bd_zcopy = bd->bd_zcopy; d->bd_bufmode = bd->bd_bufmode; } static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { struct xbpf_d *xbdbuf, *xbd; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. 
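Separately, bpf_getdltlist() above follows the usual two-pass contract: with bfl_list == NULL only the count is reported; otherwise up to bfl_len entries are copied out, and ENOMEM is returned if the caller's array is too small. Userspace sketch:

        struct bpf_dltlist dl;
        u_int dlts[16];

        dl.bfl_len = 16;
        dl.bfl_list = dlts;
        if (ioctl(fd, BIOCGDLTLIST, (caddr_t)&dl) < 0)
                err(1, "BIOCGDLTLIST");
        /* dl.bfl_len now holds the number of DLTs available */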
*/ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); mtx_lock(&bpf_mtx); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { mtx_unlock(&bpf_mtx); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_LOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } BPFIF_UNLOCK(bp); } mtx_unlock(&bpf_mtx); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ static struct bpf_if bp_null; void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = &bp_null; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 185347) +++ head/sys/net/if.c (revision 185348) @@ -1,2879 +1,2878 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) /*XXX*/ #include #include #ifdef INET6 #include #include #endif #endif #ifdef INET #include #endif #ifdef DEV_CARP #include #endif #include SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); void (*bstp_linkstate_p)(struct ifnet *ifp, int state); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_grow(void); static void if_init(void *); static void if_qflush(struct ifnet *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static void if_slowtimo(void *); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_rtdel(struct radix_node *, void *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void if_start_deferred(void *context, int pending); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif #ifdef VIMAGE_GLOBALS struct ifnethead ifnet; /* depend on static init XXX */ struct ifgrouphead ifg_head; int if_index; static int if_indexlim; /* Table of ifnet/cdev by index. Locked with ifnet_lock. 
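The V_ prefix used throughout selects the per-vnet instance of these globals; with virtualization compiled out the compat macros collapse to the plain symbols, roughly (illustrative, not the literal header text):

        #define INIT_VNET_NET(vnet)     /* nothing */
        #define V_if_index              if_index
        #define V_ifindex_table         ifindex_table
        #define V_ifnet                 ifnet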
*/ static struct ifindex_entry *ifindex_table; static struct knlist ifklist; #endif int ifqmaxlen = IFQ_MAXLEN; struct mtx ifnet_lock; static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static void filt_netdetach(struct knote *kn); static int filt_netdev(struct knote *kn, long hint); static struct filterops netdev_filtops = { 1, NULL, filt_netdetach, filt_netdev }; /* * System initialization */ SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); SYSINIT(interface_check, SI_SUB_PROTO_IF, SI_ORDER_FIRST, if_slowtimo, NULL); MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex(u_short idx) { INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); ifp = V_ifindex_table[idx].ife_ifnet; IFNET_RUNLOCK(); return (ifp); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { INIT_VNET_NET(curvnet); IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { - INIT_VNET_NET(curvnet); struct ifaddr *ifa; IFNET_RLOCK(); ifa = ifnet_byindex(idx)->if_addr; IFNET_RUNLOCK(); return (ifa); } struct cdev * ifdev_byindex(u_short idx) { INIT_VNET_NET(curvnet); struct cdev *cdev; IFNET_RLOCK(); cdev = V_ifindex_table[idx].ife_dev; IFNET_RUNLOCK(); return (cdev); } static void ifdev_setbyindex(u_short idx, struct cdev *cdev) { INIT_VNET_NET(curvnet); IFNET_WLOCK(); V_ifindex_table[idx].ife_dev = cdev; IFNET_WUNLOCK(); } static d_open_t netopen; static d_close_t netclose; static d_ioctl_t netioctl; static d_kqfilter_t netkqfilter; static struct cdevsw net_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = netopen, .d_close = netclose, .d_ioctl = netioctl, .d_name = "net", .d_kqfilter = netkqfilter, }; static int netopen(struct cdev *dev, int flag, int mode, struct thread *td) { return (0); } static int netclose(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int netioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifnet *ifp; int error, idx; /* only support interface specific ioctls */ if (IOCGROUP(cmd) != 'i') return (EOPNOTSUPP); idx = dev2unit(dev); if (idx == 0) { /* * special network device, not interface. */ if (cmd == SIOCGIFCONF) return (ifconf(cmd, data)); /* XXX remove cmd */ #ifdef __amd64__ if (cmd == SIOCGIFCONF32) return (ifconf(cmd, data)); /* XXX remove cmd */ #endif return (EOPNOTSUPP); } ifp = ifnet_byindex(idx); if (ifp == NULL) return (ENXIO); error = ifhwioctl(cmd, ifp, data, td); if (error == ENOIOCTL) error = EOPNOTSUPP; return (error); } static int netkqfilter(struct cdev *dev, struct knote *kn) { INIT_VNET_NET(curvnet); struct knlist *klist; struct ifnet *ifp; int idx; switch (kn->kn_filter) { case EVFILT_NETDEV: kn->kn_fop = &netdev_filtops; break; default: return (EINVAL); } idx = dev2unit(dev); if (idx == 0) { klist = &V_ifklist; } else { ifp = ifnet_byindex(idx); if (ifp == NULL) return (1); klist = &ifp->if_klist; } kn->kn_hook = (caddr_t)klist; knlist_add(klist, kn, 0); return (0); } static void filt_netdetach(struct knote *kn) { struct knlist *klist = (struct knlist *)kn->kn_hook; knlist_remove(klist, kn, 0); } static int filt_netdev(struct knote *kn, long hint) { struct knlist *klist = (struct knlist *)kn->kn_hook; /* * Currently NOTE_EXIT is abused to indicate device detach. 
*/ if (hint == NOTE_EXIT) { kn->kn_data = NOTE_LINKINV; kn->kn_flags |= (EV_EOF | EV_ONESHOT); knlist_remove_inevent(klist, kn); return (1); } if (hint != 0) kn->kn_data = hint; /* current status */ if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ /* ARGSUSED*/ static void if_init(void *dummy __unused) { INIT_VNET_NET(curvnet); V_if_index = 0; V_ifindex_table = NULL; V_if_indexlim = 8; IFNET_LOCK_INIT(); TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); if_grow(); /* create initial table */ ifdev_setbyindex(0, make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "network")); if_clone_init(); } static void if_grow(void) { INIT_VNET_NET(curvnet); u_int n; struct ifindex_entry *e; V_if_indexlim <<= 1; n = V_if_indexlim * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet* if_alloc(u_char type) { INIT_VNET_NET(curvnet); struct ifnet *ifp; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); /* * Try to find an empty slot below if_index. If we fail, take * the next slot. * * XXX: should be locked! */ for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { if (ifnet_byindex(ifp->if_index) == NULL) break; } /* Catch if_index overflow. */ if (ifp->if_index < 1) { free(ifp, M_IFNET); return (NULL); } if (ifp->if_index > V_if_index) V_if_index = ifp->if_index; if (V_if_index >= V_if_indexlim) if_grow(); ifp->if_type = type; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); return (NULL); } } IFNET_WLOCK(); ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); IF_ADDR_LOCK_INIT(ifp); return (ifp); } /* * Free the struct ifnet, the associated index, and the layer 2 common * structure if needed. All the work is done in if_free_type(). * * Do not add code to this function! Add it to if_free_type(). */ void if_free(struct ifnet *ifp) { if_free_type(ifp, ifp->if_type); } /* * Do the actual work of freeing a struct ifnet, associated index, and * layer 2 common structure. This version should only be called by * intefaces that switch their type after calling if_alloc(). */ void if_free_type(struct ifnet *ifp, u_char type) { INIT_VNET_NET(curvnet); /* ifp->if_vnet can be NULL here ! 
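(Plausibly because an ifnet that has been if_alloc()'d but never if_attach()'d has not yet been assigned to a vnet, so nothing in this function may derive curvnet from the ifp.)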
*/ if (ifp != ifnet_byindex(ifp->if_index)) { if_printf(ifp, "%s: value was not if_alloced, skipping\n", __func__); return; } IFNET_WLOCK(); ifnet_setbyindex(ifp->if_index, NULL); /* XXX: should be locked with if_findindex() */ while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) V_if_index--; IFNET_WUNLOCK(); if (if_com_free[type] != NULL) if_com_free[type](ifp->if_l2com, type); IF_ADDR_LOCK_DESTROY(ifp); free(ifp, M_IFNET); }; void ifq_attach(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_detach(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We do more initialization here than is probably a good idea. * Some of this should probably move to if_alloc(). * - We should probably do more sanity checking. For instance we don't * do anything to ensure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { INIT_VNET_NET(curvnet); unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); ifp->if_afdata_initialized = 0; TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_prefixhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); if_addgroup(ifp, IFG_ALL); knlist_init(&ifp->if_klist, NULL, NULL, NULL, NULL); getmicrotime(&ifp->if_lastchange); ifp->if_data.ifi_epoch = time_uptime; ifp->if_data.ifi_datalen = sizeof(struct if_data); ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; #ifdef MAC mac_ifnet_init(ifp); mac_ifnet_create(ifp); #endif ifdev_setbyindex(ifp->if_index, make_dev(&net_cdevsw, ifp->if_index, UID_ROOT, GID_WHEEL, 0600, "%s/%s", net_cdevsw.d_name, ifp->if_xname)); make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", net_cdevsw.d_name, ifp->if_index); ifq_attach(&ifp->if_snd, ifp); /* * create a Link Level name for this device */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possible name so we can do * a rename in place later.
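The allocation below packs the link-level address and its mask behind the ifaddr in a single malloc block, each sockaddr_dl slot socksize bytes wide (layout sketch):

        +---------------+----------------------+------------------------+
        | struct ifaddr | sockaddr_dl (addr)   | sockaddr_dl (netmask)  |
        +---------------+----------------------+------------------------+
        ifa             ifa + 1                (caddr_t)sdl + socksize

Rounding socksize up to cover IFNAMSIZ characters is what makes the in-place rename possible.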
*/ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO); IFA_LOCK_INIT(ifa); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; ifa->ifa_refcnt = 1; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); ifp->if_broadcastaddr = NULL; /* reliably crash if used uninitialized */ IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); if (ifp->if_watchdog != NULL) if_printf(ifp, "WARNING: using obsoleted if_watchdog interface\n"); if (ifp->if_flags & IFF_NEEDSGIANT) if_printf(ifp, "WARNING: using obsoleted IFF_NEEDSGIANT flag\n"); } static void if_attachdomain(void *dummy) { INIT_VNET_NET(curvnet); struct ifnet *ifp; int s; s = splnet(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); splx(s); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; int s; s = splnet(); /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ if (IF_AFDATA_TRYLOCK(ifp) == 0) { splx(s); return; } if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); splx(s); printf("if_attachdomain called more than once on %s\n", ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } splx(s); } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IFAFREE(ifa); } } /* * Remove any multicast network addresses from an interface. 
*/ void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_LOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_UNLOCK(ifp); } /* * Detach an interface, removing it from the * list of "active" interfaces. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { INIT_VNET_NET(ifp->if_vnet); struct ifaddr *ifa; struct radix_node_head *rnh; int s, i, j; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } IFNET_WUNLOCK(); if (!found) return; /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ s = splnet(); if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and complain. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; destroy_dev(ifdev_byindex(ifp->if_index)); ifdev_setbyindex(ifp->if_index, NULL); /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IFAFREE(ifa); } /* * Delete all remaining routes using this interface * Unfortunately the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { for (j = 0; j < rt_numfibs; j++) { if ((rnh = V_rt_tables[j][i]) == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); RADIX_NODE_HEAD_UNLOCK(rnh); } } /* Announce that the interface is gone.
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); IF_AFDATA_LOCK(ifp); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } IF_AFDATA_UNLOCK(ifp); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ KNOTE_UNLOCKED(&ifp->if_klist, NOTE_EXIT); knlist_clear(&ifp->if_klist, 0); knlist_destroy(&ifp->if_klist); ifq_detach(&ifp->if_snd); IF_AFDATA_DESTROY(ifp); splx(s); } /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); EVENTHANDLER_INVOKE(group_attach_event, ifg); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_LOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_UNLOCK(ifp); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_LOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_UNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Stores all groups from an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_LOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_UNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_LOCK(ifp); 
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_UNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_UNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_UNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { INIT_VNET_NET(curvnet); struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rn pointer to node in the routing table * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated * */ static int if_rtdel(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; int err; if (rt->rt_ifp == ifp) { /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, (struct rtentry **) NULL, rt->rt_fibnum); if (err) { log(LOG_WARNING, "if_rtdel: error %d\n", err); } } return (0); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_equal(a1, a2) \ (bcmp((a1), (a2), ((a1))->sa_len) == 0) #define sa_dl_equal(a1, a2) \ ((((struct sockaddr_dl *)(a1))->sdl_len == \ ((struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ LLADDR((struct sockaddr_dl *)(a2)), \ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) goto done; /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) goto done; } ifa = NULL; done: IFNET_RUNLOCK(); return (ifa); } /* * Locate an interface based on the broadcast address. 
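*/

/*
 * if_getgroup() above implements a two-pass sizing protocol: called
 * with ifgr_len == 0 it only reports the space required, otherwise it
 * copies out one ifg_req per group.  A hedged userland sketch of that
 * contract; print_groups() is a made-up name.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

static void
print_groups(const char *ifname)
{
	struct ifgroupreq ifgr;
	u_int i;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");
	memset(&ifgr, 0, sizeof(ifgr));
	strlcpy(ifgr.ifgr_name, ifname, sizeof(ifgr.ifgr_name));
	/* Pass 1: ifgr_len == 0 asks only for the required size. */
	if (ioctl(s, SIOCGIFGROUP, &ifgr) == -1)
		err(1, "SIOCGIFGROUP (size)");
	if ((ifgr.ifgr_groups = calloc(1, ifgr.ifgr_len)) == NULL)
		err(1, "calloc");
	/* Pass 2: the kernel copyout()s one ifg_req per group. */
	if (ioctl(s, SIOCGIFGROUP, &ifgr) == -1)
		err(1, "SIOCGIFGROUP");
	for (i = 0; i < ifgr.ifgr_len / sizeof(struct ifg_req); i++)
		printf("%s\n", ifgr.ifgr_groups[i].ifgrq_group);
	free(ifgr.ifgr_groups);
	close(s);
}

/*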
*/ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) goto done; } ifa = NULL; done: IFNET_RUNLOCK(); return (ifa); } /* * Locate the point-to-point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) goto done; } } ifa = NULL; done: IFNET_RUNLOCK(); return (ifa); } /* * Find an interface on a specific network. If several match, the * most specific one found is returned. */ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = (struct ifaddr *) 0; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan through each interface, looking for ones that have * addresses in this address family. */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { /* * if we have a special address handler, * then use it instead of the generic one. */ if (ifa->ifa_claim_addr) { if ((*ifa->ifa_claim_addr)(ifa, addr)) goto done; continue; } /* * Scan all the bits in the ifa's address. * If a bit disagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one) then remember the new one * before continuing to search * for an even better one. */ if (ifa_maybe == 0 || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) ifa_maybe = ifa; } } } ifa = ifa_maybe; done: IFNET_RUNLOCK(); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address.
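*/

/*
 * Both ifa_ifwithnet() above and ifaof_ifpforaddr() below rely on the
 * same byte-at-a-time masked comparison.  A standalone distillation of
 * that loop, assuming all three sockaddrs share one layout:
 */
#include <sys/socket.h>

/*
 * Non-zero when 'addr' lies inside 'net'/'mask': bits that differ are
 * ANDed with the netmask, so host bits never disqualify a match.
 */
static int
masked_match(const struct sockaddr *addr, const struct sockaddr *net,
    const struct sockaddr *mask)
{
	const char *cp = addr->sa_data;
	const char *cp2 = net->sa_data;
	const char *cp3 = mask->sa_data;
	const char *cplim = (const char *)mask + mask->sa_len;

	while (cp3 < cplim)
		if ((*cp++ ^ *cp2++) & *cp3++)
			return (0);	/* differs inside the mask */
	return (1);
}

/*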
*/ struct ifaddr * ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = 0; u_int af = addr->sa_family; if (af >= AF_MAX) return (0); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == 0) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } #include /* * Default action when installing a route with a Link Level gateway. * Look up an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; RT_LOCK_ASSERT(rt); if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { IFAREF(ifa); /* XXX */ oifa = rt->rt_ifa; rt->rt_ifa = ifa; IFAFREE(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } /* * Mark an interface down and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); #ifdef DEV_CARP if (ifp->if_carp) carp_carpdev_state(ifp->if_carp); #endif rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); #ifdef DEV_CARP if (ifp->if_carp) carp_carpdev_state(ifp->if_carp); #endif rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *, int); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ /* * Handle a change in the interface link state. To avoid LORs * between the driver lock and upper layer locks, as well as possible * recursions, we post the event to a taskqueue, and all the work * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed.
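 * (Drivers may post the same state on every status interrupt, so
 * this early return keeps redundant calls cheap.)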
*/ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; int link; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (link_state == LINK_STATE_UP) link = NOTE_LINKUP; else if (link_state == LINK_STATE_DOWN) link = NOTE_LINKDOWN; else link = NOTE_LINKINV; KNOTE_UNLOCKED(&ifp->if_klist, link); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp, link); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && IFP2AC(ifp)->ac_netgraph != NULL) (*ng_ether_link_state_p)(ifp, link_state); #ifdef DEV_CARP if (ifp->if_carp) carp_carpdev_state(ifp->if_carp); #endif if (ifp->if_bridge) { KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!")); (*bstp_linkstate_p)(ifp, link_state); } if (ifp->if_lagg) { KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!")); (*lagg_linkstate_p)(ifp, link_state); } devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ static void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_act; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Handle interface watchdog timer routines. Called * from softclock, we decrement timers (if set) and * call the appropriate interface routine on expiration. * * XXXRW: Note that because timeouts run with Giant, if_watchdog() is called * holding Giant. If we switch to an MPSAFE callout, we likely need to grab * Giant before entering if_watchdog() on an IFF_NEEDSGIANT interface. */ static void if_slowtimo(void *arg) { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; int s = splimp(); IFNET_RLOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_NET(vnet_iter); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_timer == 0 || --ifp->if_timer) continue; if (ifp->if_watchdog) (*ifp->if_watchdog)(ifp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IFNET_RUNLOCK(); splx(s); timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); } /* * Map interface name to * interface structure pointer. */ struct ifnet * ifunit(const char *name) { INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK(); return (ifp); } /* * Hardware specific interface ioctls.
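*/

/*
 * if_slowtimo() above counts if_timer down once per IFNET_SLOWHZ
 * period and fires if_watchdog() when it reaches zero.  A hedged
 * sketch of how a driver using this obsoleted interface (see the
 * warning printed at attach time) would arm it; the foo_* names are
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <net/if.h>
#include <net/if_var.h>

static void
foo_start(struct ifnet *ifp)
{
	/* ...queue frames to the hardware, then arm the watchdog... */
	ifp->if_timer = 5;	/* expect a tx interrupt within ~5 s */
}

static void
foo_txintr(struct ifnet *ifp)
{
	ifp->if_timer = 0;	/* completion arrived in time; disarm */
}

static void
foo_watchdog(struct ifnet *ifp)
{
	/* if_slowtimo() decremented if_timer to 0: the tx path hung. */
	if_printf(ifp, "watchdog timeout -- resetting\n");
	ifp->if_oerrors++;
	/* foo_reset(ifp); -- assumed to exist elsewhere */
}

/*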
*/ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; struct ifstat *ifs; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: ifr->ifr_phys = ifp->if_physical; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_SMART) { /* Smart drivers twiddle their own routes */ } else if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { int s = splimp(); if_down(ifp); splx(s); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { int s = splimp(); if_up(ifp); splx(s); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { IFF_LOCKGIANT(ifp); (void) (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; IFA_LOCK(ifa); sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). 
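 * For example, renaming "em0" (sdl_nlen 3) to "wan0" (sdl_nlen 4)
 * slides a 6-byte Ethernet lladdr from sdl_data + 3 to sdl_data + 4;
 * the regions may overlap, which bcopy() handles.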
*/ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IFA_UNLOCK(ifa); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); break; case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. 
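 * In practice that means a second SIOCADDMULTI for a group the
 * interface has already joined fails below with EADDRINUSE instead
 * of stacking up references that userland could never drop.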
*/ IF_ADDR_LOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_UNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSLIFPHYADDR: case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: ifs = (struct ifstat *)data; ifs->ascii[0] = '\0'; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGLIFPHYADDR: case SIOCGIFMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; switch (cmd) { case SIOCGIFCONF: case OSIOCGIFCONF: #ifdef __amd64__ case SIOCGIFCONF32: #endif return (ifconf(cmd, data)); } ifr = (struct ifreq *)data; switch (cmd) { case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error) return (error); return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL)); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error) return (error); return if_clone_destroy(ifr->ifr_name); case SIOCIFGCLONERS: return (if_clone_list((struct if_clonereq *)data)); case SIOCGIFGMEMB: return (if_getgroupmembers((struct ifgroupreq *)data)); } ifp = ifunit(ifr->ifr_name); if (ifp == 0) return (ENXIO); error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) return (error); oif_flags = ifp->if_flags; if (so->so_proto == 0) return (EOPNOTSUPP); #ifndef COMPAT_43 error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); #else { int ocmd = cmd; switch (cmd) { case SIOCSIFDSTADDR: case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: #if BYTE_ORDER != BIG_ENDIAN if (ifr->ifr_addr.sa_family == 0 && ifr->ifr_addr.sa_len < 16) { ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len; ifr->ifr_addr.sa_len = 16; } #else if (ifr->ifr_addr.sa_len == 0) ifr->ifr_addr.sa_len = 16; #endif break; case OSIOCGIFADDR: cmd = SIOCGIFADDR; break; case OSIOCGIFDSTADDR: cmd = SIOCGIFDSTADDR; break; case OSIOCGIFBRDADDR: cmd = SIOCGIFBRDADDR; break; case OSIOCGIFNETMASK: cmd = SIOCGIFNETMASK; } error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); switch (ocmd) { case OSIOCGIFADDR: case OSIOCGIFDSTADDR: case OSIOCGIFBRDADDR: case OSIOCGIFNETMASK: *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family; } } #endif /* COMPAT_43 */ if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 DELAY(100);/* XXX: temporary workaround for fxp issue*/ if (ifp->if_flags & IFF_UP) { int s = splimp(); in6_if_up(ifp); splx(s); } #endif } return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. 
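 * Example: with two bpf(4) consumers calling ifpromisc(ifp, 1), the
 * first switch (0 -> 1) sets IFF_PROMISC and pushes SIOCSIFFLAGS to
 * the driver; the second merely bumps the count and returns.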
*/ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); IFF_UNLOCKGIANT(ifp); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { INIT_VNET_NET(curvnet); struct ifconf *ifc = (struct ifconf *)data; #ifdef __amd64__ struct ifconf32 *ifc32 = (struct ifconf32 *)data; struct ifconf ifc_swab; #endif struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; #ifdef __amd64__ if (cmd == SIOCGIFCONF32) { ifc_swab.ifc_len = ifc32->ifc_len; ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf; ifc = &ifc_swab; } #endif /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); /* could sleep XXX */ TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. 
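 * ('ifr' is a stack-resident struct ifreq whose bytes are appended to
 * the sbuf and later copied out, so unwritten padding would otherwise
 * leak kernel stack contents to userland.)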
*/ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (jailed(curthread->td_ucred) && prison_if(curthread->td_ucred, sa)) continue; addrs++; #ifdef COMPAT_43 if (cmd == OSIOCGIFCONF) { struct osockaddr *osa = (struct osockaddr *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else #endif if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (!sbuf_overflowed(sb)) valid_len = sbuf_len(sb); } if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (!sbuf_overflowed(sb)) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; #ifdef __amd64__ if (cmd == SIOCGIFCONF32) ifc32->ifc_len = valid_len; #endif sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
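*/

/*
 * if_allocmulti() above deep-copies both caller-supplied sockaddrs.
 * The pattern, distilled into a sketch; sa_dup() is a made-up helper
 * name, and M_IFMADDR is this file's malloc(9) type.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/socket.h>

static struct sockaddr *
sa_dup(struct sockaddr *sa, int mflags)
{
	struct sockaddr *dup;

	/* sa_len is the true object size, not sizeof(struct sockaddr). */
	dup = malloc(sa->sa_len, M_IFMADDR, mflags);
	if (dup != NULL)
		bcopy(sa, dup, sa->sa_len);
	return (dup);
}

/*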
*/ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); KASSERT(ifma->ifma_protospec == NULL, ("if_freemulti: protospec not NULL")); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_LOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_UNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. If 'llsa' was * returned, we will need to free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_UNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. 
*/ if (ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); IFF_UNLOCKGIANT(ifp); } if (llsa != NULL) free(llsa, M_IFMADDR); return (0); free_llsa_out: if (llsa != NULL) free(llsa, M_IFMADDR); unlock_out: IF_ADDR_UNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; INIT_VNET_NET(ifp->if_vnet); IFNET_RLOCK(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_LOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_UNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); IFF_UNLOCKGIANT(ifp); } return (0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. Callers should * hold IFF_LOCKGIANT() to avoid a LOR in case the hardware needs to be * reconfigured. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { #ifdef DIAGNOSTIC INIT_VNET_NET(curvnet); #endif struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_LOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_UNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); IFF_UNLOCKGIANT(ifp); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_LOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. 
*/ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. */ int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; ifa = ifp->if_addr; if (ifa == NULL) return (EINVAL); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) return (EINVAL); if (len != sdl->sdl_alen) /* don't allow length to change */ return (EINVAL); switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { IFF_LOCKGIANT(ifp); ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); IFF_UNLOCKGIANT(ifp); } #ifdef INET /* * Also send gratuitous ARPs to notify other nodes about * the address change. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } #endif } return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) { va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /* * When an interface is marked IFF_NEEDSGIANT, its if_start() routine cannot * be called without Giant. However, we often can't acquire the Giant lock * at those points; instead, we run it via a task queue that holds Giant via * if_start_deferred. 
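*/

/*
 * The deferral depends on if_starttask having been wired up at attach
 * time, roughly as sketched below; this is an assumption about code
 * outside this hunk (see if_attach()), not part of the change.
 */
#include <sys/param.h>
#include <sys/taskqueue.h>
#include <net/if.h>
#include <net/if_var.h>

static void if_start_deferred(void *context, int pending);

static void
example_needsgiant_setup(struct ifnet *ifp)
{
	/* Assumed attach-time wiring for IFF_NEEDSGIANT interfaces. */
	TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp);
}

/*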
* * XXXRW: We need to make sure that the ifnet isn't fully detached until any * outstanding if_start_deferred() tasks that will run after the free. This * probably means waiting in if_detach(). */ void if_start(struct ifnet *ifp) { if (ifp->if_flags & IFF_NEEDSGIANT) { if (mtx_owned(&Giant)) (*(ifp)->if_start)(ifp); else taskqueue_enqueue(taskqueue_swi_giant, &ifp->if_starttask); } else (*(ifp)->if_start)(ifp); } static void if_start_deferred(void *context, int pending) { struct ifnet *ifp; GIANT_REQUIRED; ifp = context; (ifp->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { _IF_DROP(ifq); IF_UNLOCK(ifq); m_freem(m); return (0); } if (ifp != NULL) { ifp->if_obytes += m->m_pkthdr.len + adjust; if (m->m_flags & (M_BCAST|M_MCAST)) ifp->if_omcasts++; active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) if_start(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } Index: head/sys/net/if_loop.c =================================================================== --- head/sys/net/if_loop.c (revision 185347) +++ head/sys/net/if_loop.c (revision 185348) @@ -1,374 +1,378 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. */ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef IPX #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #ifdef NETATALK #include #include #endif #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif int loioctl(struct ifnet *, u_long, caddr_t); static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); #ifdef VIMAGE_GLOBALS struct ifnet *loif; /* Used externally */ #endif IFC_SIMPLE_DECLARE(lo, 1); static void lo_clone_destroy(struct ifnet *ifp) { +#ifdef INVARIANTS + INIT_VNET_NET(ifp->if_vnet); +#endif /* XXX: destroying lo0 will lead to panics. */ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); bpfdetach(ifp); if_detach(ifp); if_free(ifp); } static int lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) { INIT_VNET_NET(curvnet); struct ifnet *ifp; ifp = if_alloc(IFT_LOOP); if (ifp == NULL) return (ENOSPC); if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); if (V_loif == NULL) V_loif = ifp; return (0); } static int loop_modevent(module_t mod, int type, void *data) { + INIT_VNET_NET(curvnet); switch (type) { case MOD_LOAD: V_loif = NULL; if_clone_attach(&lo_cloner); break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return (EINVAL); default: return (EOPNOTSUPP); } return (0); } static moduledata_t loop_mod = { "loop", loop_modevent, 0 }; DECLARE_MODULE(loop, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { u_int32_t af; M_ASSERTPKTHDR(m); /* check if we have the packet header */ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; /* BPF writes need to be handled specially. 
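 * A write(2) on a bpf device reaches here with an AF_UNSPEC sockaddr
 * whose sa_data carries the real address family, recovered below.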
*/ if (dst->sa_family == AF_UNSPEC) { bcopy(dst->sa_data, &af, sizeof(af)); dst->sa_family = af; } #if 1 /* XXX */ switch (dst->sa_family) { case AF_INET: case AF_INET6: case AF_IPX: case AF_APPLETALK: break; default: printf("looutput: af=%d unexpected\n", dst->sa_family); m_freem(m); return (EAFNOSUPPORT); } #endif return (if_simloop(ifp, m, dst->sa_family, 0)); } /* * if_simloop() * * This function is to support software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. * * This function expects the packet to include the media header of length hlen. */ int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) { INIT_VNET_NET(ifp->if_vnet); int isr; M_ASSERTPKTHDR(m); m_tag_delete_nonpersistent(m); m->m_pkthdr.rcvif = ifp; /* * Let BPF see incoming packet in the following manner: * - Emulated packet loopback for a simplex interface * (net/if_ethersubr.c) * -> passes it to ifp's BPF * - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c) * -> not passes it to any BPF * - Normal packet loopback from myself to myself (net/if_loop.c) * -> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0) */ if (hlen > 0) { if (bpf_peers_present(ifp->if_bpf)) { bpf_mtap(ifp->if_bpf, m); } } else { if (bpf_peers_present(V_loif->if_bpf)) { if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. */ bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #ifndef __NO_STRICT_ALIGNMENT /* * Some archs do not like unaligned data, so * we move data down in the first mbuf. */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); m->m_data -= (mtod(m,vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; isr = NETISR_IPV6; break; #endif #ifdef IPX case AF_IPX: isr = NETISR_IPX; break; #endif #ifdef NETATALK case AF_APPLETALK: isr = NETISR_ATALK2; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; netisr_queue(isr, m); /* mbuf is free'd on failure. */ return (0); } /* ARGSUSED */ static void lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { RT_LOCK_ASSERT(rt); rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; } /* * Process an ioctl request. */ /* ARGSUSED */ int loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifa = (struct ifaddr *)data; ifa->ifa_rtrequest = lortrequest; /* * Everything else is done at a higher level. 
*/ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: break; default: error = EINVAL; } return (error); } Index: head/sys/net/route.c =================================================================== --- head/sys/net/route.c (revision 185347) +++ head/sys/net/route.c (revision 185348) @@ -1,1791 +1,1792 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_route.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include u_int rt_numfibs = RT_NUMFIBS; SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); /* * Allow the boot code to allow LESS than RT_MAXFIBS to be used. * We can't do more because storage is statically allocated for now. * (for compatibility reasons.. this will change). */ TUNABLE_INT("net.fibs", &rt_numfibs); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. 
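*/

/*
 * rt_numfibs above is a boot-time tunable (TUNABLE_INT) and read-only
 * afterwards (CTLFLAG_RD).  A small userland sketch of inspecting it;
 * it assumes a loader.conf entry such as net.fibs=4 was set at boot.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <err.h>

int
main(void)
{
	u_int fibs;
	size_t len = sizeof(fibs);

	/* Reflects the net.fibs loader tunable, clamped in route_init(). */
	if (sysctlbyname("net.fibs", &fibs, &len, NULL, 0) == -1)
		err(1, "net.fibs");
	printf("%u forwarding table(s) configured\n", fibs);
	return (0);
}

/*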
*/ u_int rt_add_addr_allfibs = 1; SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, &rt_add_addr_allfibs, 0, ""); TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); #ifdef VIMAGE_GLOBALS static struct rtstat rtstat; /* by default only the first 'row' of tables will be accessed. */ /* * XXXMRT When we fix netstat, and do this differently, * we can allocate this dynamically. As long as we are keeping * things backwards compatible we need to allocate this * statically. */ struct radix_node_head *rt_tables[RT_MAXFIBS][AF_MAX+1]; static int rttrash; /* routes not in table but not freed */ #endif static void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); /* compare two sockaddr structures */ #define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static uma_zone_t rtzone; /* Routing table UMA zone. */ #if 0 /* default fib for tunnels to use */ u_int tunnel_fib = 0; SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, ""); #endif /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static void route_init(void) { + INIT_VNET_INET(curvnet); int table; struct domain *dom; int fam; /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rn_init(); /* initialize all zeroes, all ones, mask table */ for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach) { for (table = 0; table < rt_numfibs; table++) { if ( (fam = dom->dom_family) == AF_INET || table == 0) { /* for now only AF_INET has > 1 table */ /* XXX MRT * rtattach will also be called * from vfs_export.c but the * offset will be 0 * (only for AF_INET and AF_INET6 * which don't need it anyhow) */ dom->dom_rtattach( (void **)&V_rt_tables[table][fam], dom->dom_rtoffset); } else { break; } } } } } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines.
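*/

/*
 * setfib(2) above simply validates the index and stores it in
 * p_fibnum.  Userland usage is correspondingly small; this sketch
 * assumes net.fibs >= 2.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

int
main(void)
{
	/* EINVAL when the index is negative or >= net.fibs, as above. */
	if (setfib(1) == -1)
		err(1, "setfib");
	/* Sockets created from here on inherit the process FIB. */
	return (0);
}

/*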
*/ void rtalloc(struct route *ro) { rtalloc_ign_fib(ro, 0UL, 0); } void rtalloc_fib(struct route *ro, u_int fibnum) { rtalloc_ign_fib(ro, 0UL, fibnum); } void rtalloc_ign(struct route *ro, u_long ignore) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, 0)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh; struct rtentry *rt; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; u_long nflags; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ fibnum = 0; rnh = V_rt_tables[fibnum][dst->sa_family]; newrt = NULL; /* * Look up the address in the table for that Address Family */ if (rnh == NULL) { V_rtstat.rts_unreach++; goto miss2; } RADIX_NODE_HEAD_LOCK(rnh); if ((rn = rnh->rnh_matchaddr(dst, rnh)) && (rn->rn_flags & RNF_ROOT) == 0) { /* * If we find it and it's not the root node, then * get a reference on the rtentry associated. */ newrt = rt = RNTORT(rn); nflags = rt->rt_flags & ~ignflags; if (report && (nflags & RTF_CLONING)) { /* * We are apparently adding (report = 0 in delete). * If it requires that it be cloned, do so. * (This implies it wasn't a HOST route.) */ err = rtrequest_fib(RTM_RESOLVE, dst, NULL, NULL, 0, &newrt, fibnum); if (err) { /* * If the cloning didn't succeed, maybe * what we have will do. Return that. */ newrt = rt; /* existing route */ RT_LOCK(newrt); RT_ADDREF(newrt); goto miss; } KASSERT(newrt, ("no route and no error")); RT_LOCK(newrt); if (newrt->rt_flags & RTF_XRESOLVE) { /* * If the new route specifies it be * externally resolved, then go do that. */ msgtype = RTM_RESOLVE; goto miss; } /* Inform listeners of the new route. */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(newrt); info.rti_info[RTAX_NETMASK] = rt_mask(newrt); info.rti_info[RTAX_GATEWAY] = newrt->rt_gateway; if (newrt->rt_ifp != NULL) { info.rti_info[RTAX_IFP] = newrt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = newrt->rt_ifa->ifa_addr; } rt_missmsg(RTM_ADD, &info, newrt->rt_flags, 0); } else { RT_LOCK(newrt); RT_ADDREF(newrt); } RADIX_NODE_HEAD_UNLOCK(rnh); } else { /* * Either we hit the root or couldn't find any match, * Which basically means * "caint get there frm here" */ V_rtstat.rts_unreach++; miss: RADIX_NODE_HEAD_UNLOCK(rnh); miss2: if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg(msgtype, &info, 0, err); } } if (newrt) RT_LOCK_ASSERT(newrt); return (newrt); } /* * Remove a reference count from an rtentry. 
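 * (Callers normally reach this through the RTFREE() or RTFREE_LOCKED()
 * macros rather than calling rtfree() directly.)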
* If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family]; KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { printf("%s: %p has %lu refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to clean up private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, rnh); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold.. * e.g. other routes and ifaddrs. */ if (rt->rt_ifa) IFAFREE(rt->rt_ifa); rt->rt_parent = NULL; /* NB: no refcnt on parent */ /* * The key is separately alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ Free(rt_key(rt)); /* * and the rtentry itself of course */ RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src) { rtredirect_fib(dst, gateway, netmask, flags, src, 0); } void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { INIT_VNET_NET(curvnet); struct rtentry *rt, *rt0 = NULL; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt && (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) error = EINVAL; else if (ifa_ifwithaddr(gateway)) error = EHOSTUNREACH; if (error) goto done; /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables.
*/ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: rt0 = rt; rt = NULL; flags |= RTF_GATEWAY | RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); flags = rt->rt_flags; } if (rt0) RTFREE_LOCKED(rt0); stat = &V_rtstat.rts_dynamic; } else { struct rtentry *gwrt; /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ rt_setgate(rt, rt_key(rt), gateway); gwrt = rtalloc1(gateway, 1, 0); EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); RTFREE_LOCKED(gwrt); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg(RTM_REDIRECT, &info, flags, error); } int rtioctl(u_long req, caddr_t data) { return (rtioctl_fib(req, data, 0)); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) { return (ifa_ifwithroute_fib(flags, dst, gateway, 0)); } struct ifaddr * ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { register struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. 
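 * For example (hypothetical addresses), "route add -net 10.1.0.0/16 192.0.2.2"
 * where 192.0.2.2 is the far end of a tun(4) link: the gateway is on no
 * local network, but it is the destination address of the point-to-point
 * interface, so ifa_ifwithdstaddr() finds it.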
*/ ifa = ifa_ifwithdstaddr(gateway); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway); if (ifa == NULL) { struct rtentry *rt = rtalloc1_fib(gateway, 0, 0UL, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found) return (NULL); if ((ifa = rt->rt_ifa) == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } return (ifa); } static walktree_f_t rt_fixdelete; static walktree_f_t rt_fixchange; struct rtfc_arg { struct rtentry *rt0; struct radix_node_head *rnh; }; /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0)); } int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags int rt_getifa(struct rt_addrinfo *info) { return (rt_getifa_fib(info, 0)); } int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr)) != NULL) info->rti_ifp = ifa->ifa_ifp; if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } /* * Expunges references to a route that's about to be reclaimed. * The route must be locked. */ int rtexpunge(struct rtentry *rt) { INIT_VNET_NET(curvnet); struct radix_node *rn; struct radix_node_head *rnh; struct ifaddr *ifa; int error = 0; RT_LOCK_ASSERT(rt); #if 0 /* * We cannot assume anything about the reference count * because protocols call us in many situations; often * before unwinding references to the table entry. 
*/ KASSERT(rt->rt_refcnt <= 1, ("bogus refcnt %ld", rt->rt_refcnt)); #endif /* * Find the correct routing tree to use for this Address Family */ rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); /* * Remove the item from the tree; it should be there, * but when callers invoke us blindly it may not (sigh). */ rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); if (rn == NULL) { error = ESRCH; goto bad; } KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, ("unexpected flags 0x%x", rn->rn_flags)); KASSERT(rt == RNTORT(rn), ("lookup mismatch, rt %p rn %p", rt, rn)); rt->rt_flags &= ~RTF_UP; /* * Now search what's left of the subtree for any cloned * routes which might have been formed from this node. */ if ((rt->rt_flags & RTF_CLONING) && rt_mask(rt)) rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), rt_fixdelete, rt); /* * Remove any external references we may have. * This might result in another rtentry being freed if * we held its last reference. */ if (rt->rt_gwroute) { RTFREE(rt->rt_gwroute); rt->rt_gwroute = NULL; } /* * Give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { struct rt_addrinfo info; bzero((caddr_t)&info, sizeof(info)); info.rti_flags = rt->rt_flags; info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifa->ifa_rtrequest(RTM_DELETE, rt, &info); } /* * one more rtentry floating around that is not * linked to the routing table. */ V_rttrash++; bad: RADIX_NODE_HEAD_UNLOCK(rnh); return (error); } int rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt) { return (rtrequest1_fib(req, info, ret_nrt, 0)); } int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { INIT_VNET_NET(curvnet); int error = 0; register struct rtentry *rt; register struct radix_node *rn; register struct radix_node_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; #define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ fibnum = 0; /* * Find the correct routing tree to use for this Address Family */ rnh = V_rt_tables[fibnum][dst->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) { netmask = NULL; flags &= ~RTF_CLONING; } switch (req) { case RTM_DELETE: #ifdef RADIX_MPATH /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ if (rn_mpath_capable(rnh)) { struct rtentry *rto = NULL; rn = rnh->rnh_matchaddr(dst, rnh); if (rn == NULL) senderr(ESRCH); rto = rt = RNTORT(rn); rt = rt_mpath_matchgate(rt, gateway); if (!rt) senderr(ESRCH); /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain.
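 * (E.g. with two routes to the same prefix, one via gateway A and one
 * via gateway B, a delete naming gateway B must not remove the entry
 * via A; the comparison below rejects a mismatched gateway with ESRCH.)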
*/ if (gateway && (rt->rt_gateway->sa_len != gateway->sa_len || memcmp(rt->rt_gateway, gateway, gateway->sa_len))) senderr(ESRCH); } /* * use the normal delete code to remove * the first entry */ goto normal_rtdel; } /* * if the entry is 2nd and on up */ if (!rt_mpath_deldup(rto, rt)) panic ("rtrequest1: rt_mpath_deldup"); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; goto deldone; /* done with the RTM_DELETE command */ } normal_rtdel: #endif /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ rn = rnh->rnh_deladdr(dst, netmask, rnh); if (rn == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; /* * Now search what's left of the subtree for any cloned * routes which might have been formed from this node. */ if ((rt->rt_flags & RTF_CLONING) && rt_mask(rt)) { rnh->rnh_walktree_from(rnh, dst, rt_mask(rt), rt_fixdelete, rt); } /* * Remove any external references we may have. * This might result in another rtentry being freed if * we held its last reference. */ if (rt->rt_gwroute) { RTFREE(rt->rt_gwroute); rt->rt_gwroute = NULL; } /* * give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, info); #ifdef RADIX_MPATH deldone: #endif /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: if (ret_nrt == NULL || (rt = *ret_nrt) == NULL) senderr(EINVAL); ifa = rt->rt_ifa; /* XXX locking? */ flags = rt->rt_flags & ~(RTF_CLONING | RTF_STATIC); flags |= RTF_WASCLONED; gateway = rt->rt_gateway; if ((netmask = rt->rt_genmask) == NULL) flags |= RTF_HOST; goto makeroute; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) senderr(EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) senderr(EINVAL); if (info->rti_ifa == NULL && (error = rt_getifa_fib(info, fibnum))) senderr(error); ifa = info->rti_ifa; makeroute: rt = uma_zalloc(rtzone, M_NOWAIT | M_ZERO); if (rt == NULL) senderr(ENOBUFS); RT_LOCK_INIT(rt); rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it * also add the rt_gwroute if possible. */ RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); senderr(error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * Note that we now have a reference to the ifa. * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. 
*/ IFAREF(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rn_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { if (rt->rt_gwroute) RTFREE(rt->rt_gwroute); if (rt->rt_ifa) { IFAFREE(rt->rt_ifa); } Free(rt_key(rt)); RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); senderr(EEXIST); } #endif /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); if (rn == NULL) { struct rtentry *rt2; /* * Uh-oh, we already have one of these in the tree. * We do a special hack: if the route that's already * there was generated by the cloning mechanism * then we just blow it away and retry the insertion * of the new one. */ rt2 = rtalloc1_fib(dst, 0, 0, fibnum); if (rt2 && rt2->rt_parent) { rtexpunge(rt2); RT_UNLOCK(rt2); rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); } else if (rt2) { /* undo the extra ref we got */ RTFREE_LOCKED(rt2); } } /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { if (rt->rt_gwroute) RTFREE(rt->rt_gwroute); if (rt->rt_ifa) IFAFREE(rt->rt_ifa); Free(rt_key(rt)); RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); senderr(EEXIST); } rt->rt_parent = NULL; /* * If we got here from RESOLVE, then we are cloning * so clone the rest, and note that we * are a clone (and increment the parent's references) */ if (req == RTM_RESOLVE) { KASSERT(ret_nrt && *ret_nrt, ("no route to clone from")); rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */ rt->rt_rmx.rmx_pksent = 0; /* reset packet counter */ if ((*ret_nrt)->rt_flags & RTF_CLONING) { /* * NB: We do not bump the refcnt on the parent * entry under the assumption that it will * remain so long as we do. This is * important when deleting the parent route * as this operation requires traversing * the tree to delete all clones and futzing * with refcnts requires us to double-lock * parent through this back reference. */ rt->rt_parent = *ret_nrt; } } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * We repeat the same procedure from rt_setgate() here because * it doesn't fire when we call it there because the node * hasn't been added to the tree yet. */ if (req == RTM_ADD && !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) { struct rtfc_arg arg; arg.rnh = rnh; arg.rt0 = rt; rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), rt_fixchange, &arg); } /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } RT_UNLOCK(rt); break; default: error = EOPNOTSUPP; } bad: RADIX_NODE_HEAD_UNLOCK(rnh); return (error); #undef senderr } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags /* * Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family'' * (i.e., the routes related to it by the operation of cloning). This * routine is iterated over all potential former-child-routes by way of * rnh->rnh_walktree_from() above, and those that actually are children of * the late parent (passed in as VP here) are themselves deleted. 
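 *
 * For example (illustrative prefixes): deleting a cloning network route
 * for 192.0.2.0/24 walks its subtree and removes any host routes (say,
 * one for 192.0.2.7) that were cloned from it, while leaving routes
 * marked RTF_PINNED or RTF_CLONING alone.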
*/ static int rt_fixdelete(struct radix_node *rn, void *vp) { struct rtentry *rt = RNTORT(rn); struct rtentry *rt0 = vp; if (rt->rt_parent == rt0 && !(rt->rt_flags & (RTF_PINNED | RTF_CLONING))) { return rtrequest_fib(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum); } return 0; } /* * This routine is called from rt_setgate() to do the analogous thing for * adds and changes. There is the added complication in this case of a * middle insert; i.e., insertion of a new network route between an older * network route and (cloned) host routes. For this reason, a simple check * of rt->rt_parent is insufficient; each candidate route must be tested * against the (mask, value) of the new route (passed as before in vp) * to see if the new route matches it. * * XXX - it may be possible to do fixdelete() for changes and reserve this * routine just for adds. I'm not sure why I thought it was necessary to do * changes this way. */ static int rt_fixchange(struct radix_node *rn, void *vp) { struct rtentry *rt = RNTORT(rn); struct rtfc_arg *ap = vp; struct rtentry *rt0 = ap->rt0; struct radix_node_head *rnh = ap->rnh; u_char *xk1, *xm1, *xk2, *xmp; int i, len, mlen; /* make sure we have a parent, and route is not pinned or cloning */ if (!rt->rt_parent || (rt->rt_flags & (RTF_PINNED | RTF_CLONING))) return 0; if (rt->rt_parent == rt0) /* parent match */ goto delete_rt; /* * There probably is a function somewhere which does this... * if not, there should be. */ len = imin(rt_key(rt0)->sa_len, rt_key(rt)->sa_len); xk1 = (u_char *)rt_key(rt0); xm1 = (u_char *)rt_mask(rt0); xk2 = (u_char *)rt_key(rt); /* avoid applying a less specific route */ xmp = (u_char *)rt_mask(rt->rt_parent); mlen = rt_key(rt->rt_parent)->sa_len; if (mlen > rt_key(rt0)->sa_len) /* less specific route */ return 0; for (i = rnh->rnh_treetop->rn_offset; i < mlen; i++) if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i]) return 0; /* less specific route */ for (i = rnh->rnh_treetop->rn_offset; i < len; i++) if ((xk2[i] & xm1[i]) != xk1[i]) return 0; /* no match */ /* * OK, this node is a clone, and matches the node currently being * changed/added under the node's mask. So, get rid of it. */ delete_rt: return rtrequest_fib(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum); } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { INIT_VNET_NET(curvnet); /* XXX dst may be overwritten, can we move this to below */ struct radix_node_head *rnh = V_rt_tables[rt->rt_fibnum][dst->sa_family]; int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); again: RT_LOCK_ASSERT(rt); /* * A host route with the destination equal to the gateway * will interfere with keeping LLINFO in the routing * table, so disallow it. */ if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == (RTF_HOST|RTF_GATEWAY)) && dst->sa_len == gate->sa_len && bcmp(dst, gate, dst->sa_len) == 0) { /* * The route might already exist if this is an RTM_CHANGE * or a routing redirect, so try to delete it. */ if (rt_key(rt)) rtexpunge(rt); return EADDRNOTAVAIL; } /* * Cloning loop avoidance in case of bad configuration. */ if (rt->rt_flags & RTF_GATEWAY) { struct rtentry *gwrt; RT_UNLOCK(rt); /* XXX workaround LOR */ gwrt = rtalloc1_fib(gate, 1, 0, rt->rt_fibnum); if (gwrt == rt) { RT_REMREF(rt); return (EADDRINUSE); /* failure */ } /* * Try to reacquire the lock on rt, and if it fails, * clean state and restart from scratch. 
*/ if (!RT_TRYLOCK(rt)) { RTFREE_LOCKED(gwrt); RT_LOCK(rt); goto again; } /* * If there is already a gwroute, then drop it. If we * are asked to replace route with itself, then do * not leak its refcounter. */ if (rt->rt_gwroute != NULL) { if (rt->rt_gwroute == gwrt) { RT_REMREF(rt->rt_gwroute); } else RTFREE(rt->rt_gwroute); } if ((rt->rt_gwroute = gwrt) != NULL) RT_UNLOCK(rt->rt_gwroute); } /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); /* * This isn't going to do anything useful for host routes, so * don't bother. Also make sure we have a reasonable mask * (we don't yet have one during adds). */ if (!(rt->rt_flags & RTF_HOST) && rt_mask(rt) != 0) { struct rtfc_arg arg; arg.rnh = rnh; arg.rt0 = rt; RT_UNLOCK(rt); /* XXX workaround LOR */ RADIX_NODE_HEAD_LOCK(rnh); RT_LOCK(rt); rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), rt_fixchange, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } return 0; } static void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { register u_char *cp1 = (u_char *)src; register u_char *cp2 = (u_char *)dst; register u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { INIT_VNET_NET(curvnet); struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if ( dst->sa_family != AF_INET) fibnum = 0; if (fibnum == -1) { if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { startfib = endfib = curthread->td_proc->p_fibnum; } else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } if (dst->sa_len == 0) return(EINVAL); /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. 
* If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. XXX For this version only AF_INET. * When that changes code should be refactored to protocol * independent parts and protocol dependent parts. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node_head *rnh; struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ if ((rnh = V_rt_tables[fibnum][dst->sa_family]) == NULL) /* this table doesn't exist but others might */ continue; RADIX_NODE_HEAD_LOCK(rnh); #ifdef RADIX_MPATH if (rn_mpath_capable(rnh)) { rn = rnh->rnh_matchaddr(dst, rnh); if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (!rt) error = ESRCH; } } else #endif rn = rnh->rnh_lookup(dst, netmask, rnh); error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa || !sa_equal((struct sockaddr *)rn->rn_key, dst)); RADIX_NODE_HEAD_UNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | ifa->ifa_flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.103.54.246/24 * e.g. ifconfig bge0 192.103.54.247/24 * the address set in the route is 192.103.54.246 * so we need to replace it with 192.103.54.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { IFAFREE(rt->rt_ifa); IFAREF(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif rt_newaddrmsg(cmd, ifa, error, rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* special one for inet internal use. may not use. */ int rtinit_fib(struct ifaddr *ifa, int cmd, int flags) { return (rtinit1(ifa, cmd, flags, -1)); } /* * Set up a routing table entry, normally * for an interface. 
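 *
 * A typical use (sketch, as seen from protocol address-configuration
 * code) is to install the connected route when an address is assigned:
 *
 *	error = rtinit(ifa, RTM_ADD, RTF_UP);
 *
 * and to remove it again with rtinit(ifa, RTM_DELETE, 0) when the
 * address goes away.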
*/ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = 0; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } if (dst->sa_family == AF_INET) fib = -1; return (rtinit1(ifa, cmd, flags, fib)); } /* * rt_check() is invoked on each layer 2 output path, prior to * encapsulating outbound packets. * * The function is mostly used to find a routing entry for the gateway, * which in some protocol families could also point to the link-level * address for the gateway itself (the side effect of revalidating the * route to the destination is rather pointless at this stage, we did it * already a moment before in the pr_output() routine to locate the ifp * and gateway to use). * * When we remove the layer-3 to layer-2 mapping tables from the * routing table, this function can be removed. * * === On input === * *dst is the address of the NEXT HOP (which coincides with the * final destination if directly reachable); * *lrt0 points to the cached route to the final destination; * *lrt is not meaningful; * (*lrt0 has no ref held on it by us so REMREF is not needed. * Refs only account for major structural references and not usages, * which is actually a bit of a problem.) * * === Operation === * If the route is marked down try to find a new route. If the route * to the gateway is gone, try to setup a new route. Otherwise, * if the route is marked for packets to be rejected, enforce that. * Note that rtalloc returns an rtentry with an extra REF that we may * need to lose. * * === On return === * *dst is unchanged; * *lrt0 points to the (possibly new) route to the final destination * *lrt points to the route to the next hop [LOCKED] * * Their values are meaningful ONLY if no error is returned. * * To follow this you have to remember that: * RT_REMREF reduces the reference count by 1 but doesn't check it for 0 (!) * RTFREE_LOCKED includes an RT_REMREF (or an rtfree if refs == 1) * and an RT_UNLOCK * RTFREE does an RT_LOCK and an RTFREE_LOCKED * The gwroute pointer counts as a reference on the rtentry to which it points. * so when we add it we use the ref that rtalloc gives us and when we lose it * we need to remove the reference. * RT_TEMP_UNLOCK does an RT_ADDREF before freeing the lock, and * RT_RELOCK locks it (it can't have gone away due to the ref) and * drops the ref, possibly freeing it and zeroing the pointer if * the ref goes to 0 (unlocking in the process). */ int rt_check(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst) { struct rtentry *rt; struct rtentry *rt0; u_int fibnum; KASSERT(*lrt0 != NULL, ("rt_check")); rt0 = *lrt0; rt = NULL; fibnum = rt0->rt_fibnum; /* NB: the locking here is tortuous... */ RT_LOCK(rt0); retry: if (rt0 && (rt0->rt_flags & RTF_UP) == 0) { /* Current rt0 is useless, try get a replacement. */ RT_UNLOCK(rt0); rt0 = NULL; } if (rt0 == NULL) { rt0 = rtalloc1_fib(dst, 1, 0UL, fibnum); if (rt0 == NULL) { return (EHOSTUNREACH); } RT_REMREF(rt0); /* don't need the reference. */ } if (rt0->rt_flags & RTF_GATEWAY) { if ((rt = rt0->rt_gwroute) != NULL) { RT_LOCK(rt); /* NB: gwroute */ if ((rt->rt_flags & RTF_UP) == 0) { /* gw route is dud. 
ignore/lose it */ RTFREE_LOCKED(rt); /* unref (&unlock) gwroute */ rt = rt0->rt_gwroute = NULL; } } if (rt == NULL) { /* NOT AN ELSE CLAUSE */ RT_TEMP_UNLOCK(rt0); /* MUST return to undo this */ rt = rtalloc1_fib(rt0->rt_gateway, 1, 0UL, fibnum); if ((rt == rt0) || (rt == NULL)) { /* the best we can do is not good enough */ if (rt) { RT_REMREF(rt); /* assumes ref > 0 */ RT_UNLOCK(rt); } RTFREE(rt0); /* lock, unref, (unlock) */ return (ENETUNREACH); } /* * Relock it and lose the added reference. * All sorts of things could have happened while we * had no lock on it, so check for them. */ RT_RELOCK(rt0); if (rt0 == NULL || ((rt0->rt_flags & RTF_UP) == 0)) /* Ru-roh.. what we had is no longer any good */ goto retry; /* * While we were away, someone replaced the gateway. * Since a reference count is involved we can't just * overwrite it. */ if (rt0->rt_gwroute) { if (rt0->rt_gwroute != rt) { RTFREE_LOCKED(rt); goto retry; } } else { rt0->rt_gwroute = rt; } } RT_LOCK_ASSERT(rt); RT_UNLOCK(rt0); } else { /* think of rt as having the lock from now on.. */ rt = rt0; } /* XXX why are we inspecting rmx_expire? */ if ((rt->rt_flags & RTF_REJECT) && (rt->rt_rmx.rmx_expire == 0 || time_uptime < rt->rt_rmx.rmx_expire)) { RT_UNLOCK(rt); return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); } *lrt = rt; *lrt0 = rt0; return (0); } /* This must be before ip6_init2(), which is now SI_ORDER_MIDDLE */ SYSINIT(route, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); Index: head/sys/netinet/if_ether.c =================================================================== --- head/sys/netinet/if_ether.c (revision 185347) +++ head/sys/netinet/if_ether.c (revision 185348) @@ -1,1091 +1,1091 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref.
count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #include #define SIN(s) ((struct sockaddr_in *)s) #define SDL(s) ((struct sockaddr_dl *)s) SYSCTL_DECL(_net_link_ether); SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); /* timer values */ #ifdef VIMAGE_GLOBALS static int arpt_keep; /* once resolved, good for 20 more minutes */ static int arp_maxtries; static int useloopback; /* use loopback interface for local traffic */ static int arp_proxyall; #endif -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, "ARP entry lifetime in seconds"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, max_age, + CTLFLAG_RW, arpt_keep, 0, "ARP entry lifetime in seconds"); #define rt_expire rt_rmx.rmx_expire struct llinfo_arp { struct callout la_timer; struct rtentry *la_rt; struct mbuf *la_hold; /* last packet until resolved/timeout */ u_short la_preempt; /* countdown for pre-expiry arps */ u_short la_asked; /* # requests sent */ }; static struct ifqueue arpintrq; SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, arp_maxtries, 0, "ARP resolution attempts before returning error"); SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, useloopback, 0, "Use the loopback interface for local traffic"); SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, arp_proxyall, 0, "Enable proxy ARP for all suitable requests"); static void arp_init(void); static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, u_char *); static void arpintr(struct mbuf *); static void arptimer(void *); static struct rtentry *arplookup(u_long, int, int, int); #ifdef INET static void in_arpinput(struct mbuf *); #endif /* * Timeout routine. */ static void arptimer(void *arg) { struct rtentry *rt = (struct rtentry *)arg; RT_LOCK_ASSERT(rt); /* * The lock is needed to close a theoretical race * between spontaneous expiry and intentional removal. * We still got an extra reference on rtentry, so can * safely pass pointers to its contents. */ RT_UNLOCK(rt); in_rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), 0, NULL, rt->rt_fibnum); } /* * Parallel to llc_rtrequest. */ static void arp_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct sockaddr *gate; struct llinfo_arp *la; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct in_ifaddr *ia; struct ifaddr *ifa; RT_LOCK_ASSERT(rt); if (rt->rt_flags & RTF_GATEWAY) return; gate = rt->rt_gateway; la = (struct llinfo_arp *)rt->rt_llinfo; switch (req) { case RTM_ADD: /* * XXX: If this is a manually added route to interface * such as older version of routed or gated might provide, * restore cloning bit. */ if ((rt->rt_flags & RTF_HOST) == 0 && rt_mask(rt) != NULL && SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) rt->rt_flags |= RTF_CLONING; if (rt->rt_flags & RTF_CLONING) { /* * Case 1: This route should come from a route to iface. 
*/ rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl); gate = rt->rt_gateway; SDL(gate)->sdl_type = rt->rt_ifp->if_type; SDL(gate)->sdl_index = rt->rt_ifp->if_index; rt->rt_expire = time_uptime; break; } /* Announce a new entry if requested. */ if (rt->rt_flags & RTF_ANNOUNCE) arprequest(rt->rt_ifp, &SIN(rt_key(rt))->sin_addr, &SIN(rt_key(rt))->sin_addr, (u_char *)LLADDR(SDL(gate))); /*FALLTHROUGH*/ case RTM_RESOLVE: if (gate->sa_family != AF_LINK || gate->sa_len < sizeof(null_sdl)) { log(LOG_DEBUG, "%s: bad gateway %s%s\n", __func__, inet_ntoa(SIN(rt_key(rt))->sin_addr), (gate->sa_family != AF_LINK) ? " (!AF_LINK)": ""); break; } SDL(gate)->sdl_type = rt->rt_ifp->if_type; SDL(gate)->sdl_index = rt->rt_ifp->if_index; if (la != 0) break; /* This happens on a route change */ /* * Case 2: This route may come from cloning, or a manual route * add with a LL address. */ R_Zalloc(la, struct llinfo_arp *, sizeof(*la)); rt->rt_llinfo = (caddr_t)la; if (la == 0) { log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } /* * We are storing a route entry outside of radix tree. So, * it can be found and accessed by other means than radix * lookup. The routing code assumes that any rtentry detached * from radix can be destroyed safely. To prevent this, we * add an additional reference. */ RT_ADDREF(rt); la->la_rt = rt; rt->rt_flags |= RTF_LLINFO; callout_init_mtx(&la->la_timer, &rt->rt_mtx, CALLOUT_RETURNUNLOCKED); #ifdef INET /* * This keeps the multicast addresses from showing up * in `arp -a' listings as unresolved. It's not actually * functional. Then the same for broadcast. */ if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr)) && rt->rt_ifp->if_type != IFT_ARCNET) { ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, LLADDR(SDL(gate))); SDL(gate)->sdl_alen = 6; rt->rt_expire = 0; } if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { memcpy(LLADDR(SDL(gate)), rt->rt_ifp->if_broadcastaddr, rt->rt_ifp->if_addrlen); SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen; rt->rt_expire = 0; } #endif TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifp == rt->rt_ifp && SIN(rt_key(rt))->sin_addr.s_addr == (IA_SIN(ia))->sin_addr.s_addr) break; } if (ia) { /* * This test used to be * if (loif.if_flags & IFF_UP) * It allowed local traffic to be forced * through the hardware by configuring the loopback down. * However, it causes problems during network configuration * for boards that can't receive packets they send. * It is now necessary to clear "useloopback" and remove * the route to force traffic out to the hardware. */ rt->rt_expire = 0; bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)), SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); if (V_useloopback) { rt->rt_ifp = V_loif; rt->rt_rmx.rmx_mtu = V_loif->if_mtu; } /* * make sure to set rt->rt_ifa to the interface * address we are using, otherwise we will have trouble * with source address selection. */ ifa = &ia->ia_ifa; if (ifa != rt->rt_ifa) { IFAFREE(rt->rt_ifa); IFAREF(ifa); rt->rt_ifa = ifa; } } break; case RTM_DELETE: if (la == NULL) /* XXX: at least CARP does this. */ break; callout_stop(&la->la_timer); rt->rt_llinfo = NULL; rt->rt_flags &= ~RTF_LLINFO; RT_REMREF(rt); if (la->la_hold) m_freem(la->la_hold); Free((caddr_t)la); } } /* * Broadcast an ARP request. 
Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ static void arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) return; m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) + 2*ifp->if_data.ifi_addrlen; m->m_pkthdr.len = m->m_len; MH_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln); bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln); bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; m->m_flags |= M_BCAST; (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0); return; } /* * Resolve an IP address into an ethernet address. * On input: * ifp is the interface we use * rt0 is the route to the final destination (possibly useless) * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is where we want the address. * * On success, desten is filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, struct sockaddr *dst, u_char *desten) { INIT_VNET_INET(ifp->if_vnet); struct llinfo_arp *la = NULL; struct rtentry *rt = NULL; struct sockaddr_dl *sdl; int error; int fibnum = -1; if (m) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } fibnum = M_GETFIB(m); } if (rt0 != NULL) { /* Look for a cached arp (ll) entry. */ if (m == NULL) fibnum = rt0->rt_fibnum; error = rt_check(&rt, &rt0, dst); if (error) { m_freem(m); return error; } la = (struct llinfo_arp *)rt->rt_llinfo; if (la == NULL) RT_UNLOCK(rt); } /* * If we had no mbuf and no route, then hope the caller * has a fib in mind because we are running out of ideas. * I think this should not happen in current code. * (kmacy would know). */ if (fibnum == -1) fibnum = curthread->td_proc->p_fibnum; /* last gasp */ if (la == NULL) { /* * We enter this block if rt0 was NULL, * or if rt found by rt_check() didn't have llinfo. * we should get a cloned route, which since it should * come from the local interface should have a ll entry. * It may be incomplete but that's ok. */ rt = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0, fibnum); if (rt == NULL) { log(LOG_DEBUG, "arpresolve: can't allocate route for %s\n", inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); return (EINVAL); /* XXX */ } la = (struct llinfo_arp *)rt->rt_llinfo; if (la == NULL) { RT_UNLOCK(rt); log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s\n", inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); return (EINVAL); /* XXX */ } } sdl = SDL(rt->rt_gateway); /* * Check the address family and length is valid, the address * is resolved; otherwise, try to resolve. 
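 * (A permanent entry has rt_expire == 0 and is always considered
 * resolved; a dynamic entry is valid until rt_expire, after which we
 * fall through below and re-ARP for it.)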
*/ if ((rt->rt_expire == 0 || rt->rt_expire > time_uptime) && sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { bcopy(LLADDR(sdl), desten, sdl->sdl_alen); /* * If entry has an expiry time and it is approaching, * send an ARP request. */ if ((rt->rt_expire != 0) && (time_uptime + la->la_preempt > rt->rt_expire)) { struct in_addr sin = SIN(rt->rt_ifa->ifa_addr)->sin_addr; la->la_preempt--; RT_UNLOCK(rt); arprequest(ifp, &sin, &SIN(dst)->sin_addr, IF_LLADDR(ifp)); return (0); } RT_UNLOCK(rt); return (0); } /* * If ARP is disabled or static on this interface, stop. * XXX * Probably should not allocate empty llinfo struct if we are * not going to be sending out an arp request. */ if (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) { RT_UNLOCK(rt); m_freem(m); return (EINVAL); } /* * There is an arptab entry, but no ethernet address * response yet. Replace the held mbuf with this * latest one. */ if (m) { if (la->la_hold) m_freem(la->la_hold); la->la_hold = m; } KASSERT(rt->rt_expire > 0, ("sending ARP request for static entry")); /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = (rt == rt0) ? EHOSTDOWN : EHOSTUNREACH; if (la->la_asked == 0 || rt->rt_expire != time_uptime) { struct in_addr sin = SIN(rt->rt_ifa->ifa_addr)->sin_addr; rt->rt_expire = time_uptime; callout_reset(&la->la_timer, hz, arptimer, rt); la->la_asked++; RT_UNLOCK(rt); arprequest(ifp, &sin, &SIN(dst)->sin_addr, IF_LLADDR(ifp)); } else RT_UNLOCK(rt); return (error); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); return; } ar = mtod(m, struct arphdr *); if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && ntohs(ar->ar_hrd) != ARPHRD_ARCNET && ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) { log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n", (unsigned char *)&ar->ar_hrd, ""); m_freem(m); return; } if (m->m_len < arphdr_len(ar)) { if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { log(LOG_ERR, "arp: runt packet\n"); m_freem(m); return; } ar = mtod(m, struct arphdr *); } switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. 
*/ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); static void in_arpinput(struct mbuf *m) { struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llinfo_arp *la; struct rtentry *rt; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr_dl *sdl; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; struct mbuf *hold; u_int8_t *enaddr = NULL; int op, rif_len; int req_len; int bridged = 0, is_bridge = 0; u_int fibnum; u_int goodfib = 0; int firstpass = 1; #ifdef DEV_CARP int carp_match = 0; #endif struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; INIT_VNET_INET(ifp->if_vnet); if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); return; } ah = mtod(m, struct arphdr *); op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). * If the interface does not match, but the receiving interface * is part of carp, we call carp_iamatch to see if this is a * request for the virtual host ip. * XXX: This is really ugly! */ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge != NULL) || (ia->ia_ifp == ifp)) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) goto match; #ifdef DEV_CARP if (ifp->if_carp != NULL && carp_iamatch(ifp->if_carp, ia, &isaddr, &enaddr) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { carp_match = 1; goto match; } #endif } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge != NULL) || (ia->ia_ifp == ifp)) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) goto match; #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifp = ia->ia_ifp; goto match; } } } #undef BDG_MEMBER_MATCHES_ARP /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = ifatoia(ifa); goto match; } /* * If bridging, fall back to using any inet address.
*/ if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) goto drop; match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); myaddr = ia->ia_addr.sin_addr; if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { log(LOG_ERR, "arp: link address is broadcast for IP address %s!\n", inet_ntoa(isaddr)); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { log(LOG_ERR, "arp: %*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; /* * We look for any FIB that has this address to find * the interface etc. * For sanity checks that are FIB independent we abort the loop. */ for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rt = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0, fibnum); if (rt == NULL) continue; sdl = SDL(rt->rt_gateway); /* Only call this once */ if (firstpass) { sin.sin_addr.s_addr = isaddr.s_addr; EVENTHANDLER_INVOKE(route_arp_update_event, rt, ar_sha(ah), (struct sockaddr *)&sin); } la = (struct llinfo_arp *)rt->rt_llinfo; if (la == NULL) { RT_UNLOCK(rt); continue; } if (firstpass) { /* The following is not an error when doing bridging. */ if (!bridged && rt->rt_ifp != ifp #ifdef DEV_CARP && (ifp->if_type != IFT_CARP || !carp_match) #endif ) { if (log_arp_wrong_iface) log(LOG_ERR, "arp: %s is on %s " "but got reply from %*D " "on %s\n", inet_ntoa(isaddr), rt->rt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); RT_UNLOCK(rt); break; } if (sdl->sdl_alen && bcmp(ar_sha(ah), LLADDR(sdl), sdl->sdl_alen)) { if (rt->rt_expire) { if (log_arp_movements) log(LOG_INFO, "arp: %s moved from %*D to %*D " "on %s\n", inet_ntoa(isaddr), ifp->if_addrlen, (u_char *)LLADDR(sdl), ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } else { RT_UNLOCK(rt); if (log_arp_permanent_modify) log(LOG_ERR, "arp: %*D attempts to " "modify permanent entry " "for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); break; } } /* * sanity check for the address length. * XXX this does not work for protocols * with variable address length. -is */ if (sdl->sdl_alen && sdl->sdl_alen != ah->ar_hln) { log(LOG_WARNING, "arp from %*D: new addr len %d, was %d", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, sdl->sdl_alen); } if (ifp->if_addrlen != ah->ar_hln) { log(LOG_WARNING, "arp from %*D: addr len: " "new %d, i/f %d (ignored)", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); RT_UNLOCK(rt); break; } firstpass = 0; goodfib = fibnum; } /* Copy in the information received. */ (void)memcpy(LLADDR(sdl), ar_sha(ah), sdl->sdl_alen = ah->ar_hln); /* * If we receive an arp from a token-ring station over * a token-ring nic then try to save the source routing info. * XXXMRT Only minimal Token Ring support for MRT. * Only do this on the first pass as it modifies the mbuf.
*/ if (ifp->if_type == IFT_ISO88025) { struct iso88025_header *th = NULL; struct iso88025_sockaddr_dl_data *trld; /* force the fib loop to end after this pass */ fibnum = rt_numfibs - 1; th = (struct iso88025_header *)m->m_pkthdr.header; trld = SDL_ISO88025(sdl); rif_len = TR_RCF_RIFLEN(th->rcf); if ((th->iso88025_shost[0] & TR_RII) && (rif_len > 2)) { trld->trld_rcf = th->rcf; trld->trld_rcf ^= htons(TR_RCF_DIR); memcpy(trld->trld_route, th->rd, rif_len - 2); trld->trld_rcf &= ~htons(TR_RCF_BCST_MASK); /* * Set up source routing information for * reply packet (XXX) */ m->m_data -= rif_len; m->m_len += rif_len; m->m_pkthdr.len += rif_len; } else { th->iso88025_shost[0] &= ~TR_RII; trld->trld_rcf = 0; } m->m_data -= 8; m->m_len += 8; m->m_pkthdr.len += 8; th->rcf = trld->trld_rcf; } if (rt->rt_expire) { rt->rt_expire = time_uptime + V_arpt_keep; callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, rt); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; hold = la->la_hold; la->la_hold = NULL; RT_UNLOCK(rt); if (hold != NULL) (*ifp->if_output)(ifp, hold, rt_key(rt), rt); } /* end of FIB loop */ reply: /* * Decide if we have to respond to something. */ if (op != ARPOP_REQUEST) goto drop; if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { /* It's not asking for our address. But it still may * be something we should answer. * * XXX MRT * We assume that link level info is independent of * the table used and so we use whichever we can and don't * have a better option. */ /* Have we been asked to proxy for the target. */ rt = arplookup(itaddr.s_addr, 0, SIN_PROXY, goodfib); if (rt == NULL) { /* Nope, only interested now if proxying everything. */ struct sockaddr_in sin; if (!V_arp_proxyall) goto drop; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof sin; sin.sin_addr = itaddr; /* XXX MRT use table 0 for arp reply */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (rt->rt_ifp == ifp) { rtfree(rt); goto drop; } (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); rtfree(rt); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ sin.sin_addr = isaddr; /* XXX MRT use table 0 for arp checks */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; if (rt->rt_ifp != ifp) { log(LOG_INFO, "arp_proxy: ignoring request" " from %s via %s, expecting %s\n", inet_ntoa(isaddr), ifp->if_xname, rt->rt_ifp->if_xname); rtfree(rt); goto drop; } rtfree(rt); #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa(itaddr)); #endif } else { /* * Return proxied ARP replies only on the interface * or bridge cluster where this network resides. * Otherwise we may conflict with the host we are * proxying for.
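 *
 * Whichever branch is taken, the reply is built by rewriting the request
 * in place: the asker becomes the target, we (or the proxied host) become
 * the sender, and the opcode flips to ARPOP_REPLY.  A minimal user-space
 * sketch of that field shuffle for the Ethernet/IPv4 case (hypothetical
 * flat layout; the kernel works through the ar_sha()/ar_tha() accessors
 * on an mbuf instead):
 *
 *	#include <string.h>
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	struct arp_ether_ipv4 {
 *		uint16_t hrd, pro; uint8_t hln, pln; uint16_t op;
 *		uint8_t sha[6], spa[4], tha[6], tpa[4];
 *	};
 *
 *	static void
 *	arp_make_reply(struct arp_ether_ipv4 *a, const uint8_t *my_mac,
 *	    const uint8_t *my_ip)
 *	{
 *		memcpy(a->tha, a->sha, 6);	// asker becomes the target
 *		memcpy(a->tpa, a->spa, 4);
 *		memcpy(a->sha, my_mac, 6);	// we become the sender
 *		memcpy(a->spa, my_ip, 4);
 *		a->op = htons(2);		// 2 == ARPOP_REPLY
 *	}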
*/ if (rt->rt_ifp != ifp && (rt->rt_ifp->if_bridge != ifp->if_bridge || ifp->if_bridge == NULL)) { RT_UNLOCK(rt); goto drop; } sdl = SDL(rt->rt_gateway); (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), LLADDR(sdl), ah->ar_hln); RT_UNLOCK(rt); } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa(itaddr)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; sa.sa_family = AF_ARP; sa.sa_len = 2; (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0); return; drop: m_freem(m); } #endif /* * Lookup or enter a new address in arptab. */ static struct rtentry * arplookup(u_long addr, int create, int proxy, int fibnum) { struct rtentry *rt; struct sockaddr_inarp sin; const char *why = 0; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = addr; if (proxy) sin.sin_other = SIN_PROXY; rt = in_rtalloc1((struct sockaddr *)&sin, create, 0UL, fibnum); if (rt == 0) return (0); if (rt->rt_flags & RTF_GATEWAY) why = "host is not on local network"; else if ((rt->rt_flags & RTF_LLINFO) == 0) why = "could not allocate llinfo"; else if (rt->rt_gateway->sa_family != AF_LINK) why = "gateway route is not ours"; if (why) { #define ISDYNCLONE(_rt) \ (((_rt)->rt_flags & (RTF_STATIC | RTF_WASCLONED)) == RTF_WASCLONED) if (create) log(LOG_DEBUG, "arplookup %s failed: %s\n", inet_ntoa(sin.sin_addr), why); /* * If there are no references to this Layer 2 route, * and it is a cloned route, and not static, and * arplookup() is creating the route, then purge * it from the routing table as it is probably bogus. */ if (rt->rt_refcnt == 1 && ISDYNCLONE(rt)) rtexpunge(rt); RTFREE_LOCKED(rt); return (0); #undef ISDYNCLONE } else { RT_REMREF(rt); return (rt); } } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CLONING; } void arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr) { if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, enaddr); ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CLONING; } static void arp_init(void) { INIT_VNET_INET(curvnet); V_arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ V_arp_maxtries = 5; V_useloopback = 1; /* use loopback interface for local traffic */ V_arp_proxyall = 0; arpintrq.ifq_maxlen = 50; mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); netisr_register(NETISR_ARP, arpintr, &arpintrq, 0); } SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 185347) +++ head/sys/netinet/in_pcb.c (revision 185348) @@ -1,1786 +1,1787 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. 
* Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include #ifdef VIMAGE_GLOBALS /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ int ipport_lowfirstauto; int ipport_lowlastauto; int ipport_firstauto; int ipport_lastauto; int ipport_hifirstauto; int ipport_hilastauto; /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ int ipport_reservedhigh; int ipport_reservedlow; /* Variables dealing with random ephemeral port allocation. 
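 *
 * The RANGECHK macro defined just below is nothing more than an in-place
 * clamp of a sysctl value to a closed interval.  Written out as a
 * stand-alone function (an illustrative sketch, not the kernel macro):
 *
 *	static int
 *	rangechk(int val, int min, int max)
 *	{
 *		if (val < min)
 *			return (min);
 *		if (val > max)
 *			return (max);
 *		return (val);
 *	}
 *
 * so, e.g., rangechk(0, 1, IPPORT_RESERVED - 1) yields 1, keeping the
 * low port-range bounds inside [1, 1023].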
*/ int ipport_randomized; int ipport_randomcps; int ipport_randomtime; int ipport_stoprandom; int ipport_tcpallocs; int ipport_tcplastcount; #endif #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port " "allocations before switching to a sequential one"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, ipport_randomtime, 0, "Minimum time to keep sequential port " "allocation before switching to a random one"); /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked.
*/ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { #ifdef INET6 INIT_VNET_INET6(curvnet); #endif struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; SOCK_LOCK(so); mac_inpcb_create(so, inp); SOCK_UNLOCK(so); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; #if defined(IPSEC) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = inp->inp_lport == 0 && (nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0); error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { INIT_VNET_INET(inp->inp_vnet); struct socket *so = inp->inp_socket; unsigned short *lastport; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error, prison = 0; int dorandom; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; if (nam) { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif if (sin->sin_addr.s_addr != INADDR_ANY) if (prison_ip(cred, 0, &sin->sin_addr.s_addr)) return(EINVAL); if (sin->sin_port != *lportp) { /* Don't allow the port to change. 
*/ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (jailed(cred)) prison = 1; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, prison ? 0 : INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((t->inp_vflag & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); } if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr)) return (EADDRNOTAVAIL); t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, prison ? 0 : wild, cred); if (t && (t->inp_vflag & INP_TIMEWAIT)) { /* * XXXRW: If an inpcb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(inp); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & t->inp_socket->so_options) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket)) #endif return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { u_short first, last, aux; int count; if (laddr.s_addr != INADDR_ANY) if (prison_ip(cred, 0, &laddr.s_addr)) return (EINVAL); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP, use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count.
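 *
 * The allocation loop just below walks the configured range circularly,
 * starting either from the last port handed out or, when dorandom is
 * set, from a random offset.  A self-contained user-space sketch of the
 * same search (pick_port() and in_use() are hypothetical stand-ins;
 * in_use() plays the role of in_pcblookup_local()):
 *
 *	#include <stdint.h>
 *
 *	// Assumes first <= last, as arranged by the swap below.
 *	// Returns 0 when every port in [first, last] is taken.
 *	static uint16_t
 *	pick_port(uint16_t first, uint16_t last, uint16_t *lastport,
 *	    int (*in_use)(uint16_t))
 *	{
 *		int count = last - first;
 *
 *		do {
 *			if (count-- < 0)		// completely used?
 *				return (0);
 *			++*lastport;
 *			if (*lastport < first || *lastport > last)
 *				*lastport = first;	// wrap around
 *		} while (in_use(*lastport));
 *		return (*lastport);
 *	}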
*/ if (pcbinfo != &V_udbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in_pcblookup_local(pcbinfo, laddr, lport, wild, cred)); } if (prison_ip(cred, 0, &laddr.s_addr)) return (EINVAL); *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash(inp); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ static int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct in_ifaddr *ia; struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: null laddr", __func__)); error = 0; ia = NULL; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, RTF_CLONING, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. 
*/ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !jailed(cred)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ laddr->s_addr = htonl(prison_getip(cred)); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { /* If not jailed, use the default returned. */ if (cred == NULL || !jailed(cred)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ laddr->s_addr = htonl(prison_getip(cred)); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain))); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain))); if (cred == NULL || !jailed(cred)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. 
As a last resort return the 'default' jail address. */ laddr->s_addr = htonl(prison_getip(cred)); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { INIT_VNET_INET(inp->inp_vnet); struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); INP_LOCK_ASSERT(inp); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; else if (faddr.s_addr == (u_long)INADDR_BROADCAST && (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); if (error) return (error); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, use the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if (ia->ia_ifp == ifp) break; if (ia == NULL) return (EADDRNOTAVAIL); laddr = ia->ia_addr.sin_addr; } } } oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } /* * Historically, in_pcbdetach() included the functionality now found in * in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the * more complex life cycle of TCP. 
* * in_pcbdetach() is responsible for disconnecting the socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, for TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() may be deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbfree() is responsible for freeing an already-detached inpcb, as well * as removing it from any global inpcb lists it might be on. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *ipi = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_INFO_WLOCK_ASSERT(ipi); INP_WLOCK_ASSERT(inp); #ifdef IPSEC ipsec4_delete_pcbpolicy(inp); #endif /* IPSEC */ inp->inp_gencnt = ++ipi->ipi_gencnt; in_pcbremlists(inp); if (inp->inp_options) (void)m_free(inp->inp_options); if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif INP_WUNLOCK(inp); uma_zfree(ipi->ipi_zone, inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash * lists, but can lead to confusing netstat output, as open sockets with * closed TCP connections will no longer appear to have their bound port * number. An explicit flag would be better, as it would allow us to leave * the port number intact after the connection is dropped. * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_vflag |= INP_DROPPED; if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } inp->inp_lport = 0; } } /* * Common routines to return the socket addresses associated with inpcbs.
*/ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int wild_okay, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; INP_INFO_LOCK_ASSERT(pcbinfo); if (!wild_okay) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found. */ return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. 
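 *
 * Each candidate in the best-fit loop below is scored by how many of its
 * address fields are wildcards; the lowest score wins and 0 is a perfect
 * match.  The IPv4 part of that scoring, pulled out into a sketch
 * (illustrative only; the real loop also adds INP_LOOKUP_MAPPED_PCB_COST
 * for IPv6-capable PCBs):
 *
 *	#include <stdint.h>
 *	#include <netinet/in.h>
 *
 *	// Returns the wildcard count, or -1 if the PCB cannot match.
 *	static int
 *	wildcard_score(uint32_t inp_faddr, uint32_t inp_laddr, uint32_t laddr)
 *	{
 *		int wildcard = 0;
 *
 *		if (inp_faddr != INADDR_ANY)
 *			wildcard++;
 *		if (inp_laddr != INADDR_ANY) {
 *			if (laddr == INADDR_ANY)
 *				wildcard++;
 *			else if (inp_laddr != laddr)
 *				return (-1);	// bound elsewhere: skip
 *		} else {
 *			if (laddr != INADDR_ANY)
 *				wildcard++;
 *		}
 *		return (wildcard);
 *	}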
*/ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) { break; } } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST /* * Lookup PCB in hash list. */ struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; INP_INFO_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) return (inp); } /* * Then look for a wildcard match, if requested. */ if (wildcard) { struct inpcb *local_wild = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; if (inp->inp_laddr.s_addr == laddr.s_addr) return (inp); else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 if (INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) local_wild_mapped = inp; else #endif local_wild = inp; } } } #ifdef INET6 if (local_wild == NULL) return (local_wild_mapped); #endif return (local_wild); } return (NULL); } /* * Insert PCB onto various hash lists. */ int in_pcbinshash(struct inpcb *inp) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. 
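 *
 * The walk below is a classic find-or-create on a hash chain: look for an
 * existing per-port head, and only allocate one when the port has never
 * been bound before.  The shape of it, as a generic sketch (hypothetical
 * types; the kernel uses queue(3) LIST macros over struct inpcbport):
 *
 *	#include <stdlib.h>
 *
 *	struct porthead {
 *		struct porthead *next;
 *		unsigned short port;
 *	};
 *
 *	static struct porthead *
 *	port_head_get(struct porthead **bucket, unsigned short port)
 *	{
 *		struct porthead *phd;
 *
 *		for (phd = *bucket; phd != NULL; phd = phd->next)
 *			if (phd->port == port)
 *				return (phd);	// found an existing head
 *		phd = malloc(sizeof(*phd));	// else create one
 *		if (phd == NULL)
 *			return (NULL);
 *		phd->port = port;
 *		phd->next = *bucket;		// push onto the chain
 *		*bucket = phd;
 *		return (phd);
 *	}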
*/ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); return (0); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } /* * Remove PCB from various lists. */ void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. 
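 *
 * Stripped of the vnet iteration, the per-second decision below reduces
 * to a small hysteresis over an allocation counter.  A sketch with a
 * hypothetical state struct in place of the V_ globals:
 *
 *	struct porttick {
 *		int allocs;		// total allocations so far
 *		int lastcount;		// snapshot from the previous tick
 *		int stoprandom;		// > 0 means: allocate sequentially
 *		int randomcps;		// allowed allocations per second
 *		int randomtime;		// seconds to stay sequential
 *	};
 *
 *	static void
 *	porttick(struct porttick *t)
 *	{
 *		if (t->allocs <= t->lastcount + t->randomcps) {
 *			if (t->stoprandom > 0)
 *				t->stoprandom--;	// cool back down
 *		} else
 *			t->stoprandom = t->randomtime;	// too hot: go sequential
 *		t->lastcount = t->allocs;
 *	}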
*/ void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ INIT_VNET_INET(vnet_iter); if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { INIT_VNET_INET(curvnet); struct inpcb *inp; INP_INFO_RLOCK(&V_tcbinfo); - LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { + LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags == 1) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else { #endif /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); #ifdef INET6 } #endif db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? 
", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_FAITH) { db_printf("%sINP_FAITH", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? 
", " : ""); comma = 1; } } void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif Index: head/sys/netinet/in_rmx.c =================================================================== --- head/sys/netinet/in_rmx.c (revision 185347) +++ head/sys/netinet/in_rmx.c (revision 185348) @@ -1,515 +1,517 @@ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * This code does two things necessary for the enhanced TCP metrics to * function in a useful manner: * 1) It marks all non-host routes as `cloning', thus ensuring that * every actual reference to such a route actually gets turned * into a reference to a host route to the specific destination * requested. * 2) When such routes lose all their references, it arranges for them * to be deleted in some random collection of circumstances, so that * a large quantity of stale routing data is not kept in kernel memory * indefinitely. See in_rtqtimo() below for the exact mechanism. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in_inithead(void **head, int off); #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* * Do what we need to do when inserting a route. */ static struct radix_node * in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); struct radix_node *ret; /* * A little bit of help for both IP output and input: * For host routes, we make sure that RTF_BROADCAST * is set for anything that looks like a broadcast address. * This way, we can avoid an expensive call to in_broadcast() * in ip_output() most of the time (because the route passed * to ip_output() is almost always a host route). * * We also do the same for local addresses, with the thought * that this might one day be used to speed up ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). */ if (rt->rt_flags & RTF_HOST) { if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { rt->rt_flags |= RTF_BROADCAST; } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) { rt->rt_flags |= RTF_LOCAL; } } if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; ret = rn_addroute(v_arg, n_arg, head, treenodes); if (ret == NULL && rt->rt_flags & RTF_HOST) { struct rtentry *rt2; /* * We are trying to add a host route, but can't. * Find out if it is because of an * ARP entry and delete it if so. */ rt2 = in_rtalloc1((struct sockaddr *)sin, 0, RTF_CLONING, rt->rt_fibnum); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK) { rtexpunge(rt2); RTFREE_LOCKED(rt2); ret = rn_addroute(v_arg, n_arg, head, treenodes); } else RTFREE_LOCKED(rt2); } } return ret; } /* * This code is the inverse of in_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * in_matroute(void *v_arg, struct radix_node_head *head) { struct radix_node *rn = rn_match(v_arg, head); struct rtentry *rt = (struct rtentry *)rn; /*XXX locking? 
*/ if (rt && rt->rt_refcnt == 0) { /* this is first reference */ if (rt->rt_flags & RTPRF_OURS) { rt->rt_flags &= ~RTPRF_OURS; rt->rt_rmx.rmx_expire = 0; } } return rn; } #ifdef VIMAGE_GLOBALS static int rtq_reallyold; static int rtq_minreallyold; static int rtq_toomany; #endif SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, rtq_reallyold, 0, "Default expiration time on dynamically learned routes"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, rtq_minreallyold, 0, "Minimum time to attempt to hold onto dynamically learned routes"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, rtq_toomany, 0, "Upper limit on dynamically learned routes"); /* * On last reference drop, mark the route as belonging to us so that it can be * timed out. */ static void in_clsroute(struct radix_node *rn, struct radix_node_head *head) { INIT_VNET_INET(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) return; if (rt->rt_flags & RTPRF_OURS) return; if (!(rt->rt_flags & (RTF_WASCLONED | RTF_DYNAMIC))) return; /* * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ if (V_rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; } else { rtexpunge(rt); } } struct rtqk_arg { struct radix_node_head *rnh; int draining; int killed; int found; int updating; time_t nextstop; }; /* * Get rid of old routes. When draining, this deletes everything, even when * the timeout is not expired yet. When updating, this makes sure that * nothing has a timeout longer than the current value of rtq_reallyold.
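 *
 * When the sweep finds "too many" surviving routes, in_rtqtimo_one()
 * below shrinks the expiration window to two-thirds of its current
 * value, clamped at rtq_minreallyold, and retries with updating set.
 * The decay step in isolation (illustrative sketch):
 *
 *	static int
 *	shrink_timeout(int reallyold, int minreallyold)
 *	{
 *		int t = 2 * reallyold / 3;	// drop to two-thirds
 *
 *		return (t < minreallyold ? minreallyold : t);
 *	}
 *
 * Starting from the one-hour default, repeated pressure walks it down
 * 3600, 2400, 1600, ... until it bottoms out at rtq_minreallyold.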
*/ static int in_rtqkill(struct radix_node *rn, void *rock) { INIT_VNET_INET(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; if (rt->rt_flags & RTPRF_OURS) { ap->found++; if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); err = in_rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0, rt->rt_fibnum); if (err) { log(LOG_WARNING, "in_rtqkill: error %d\n", err); } else { ap->killed++; } } else { if (ap->updating && (rt->rt_rmx.rmx_expire - time_uptime > V_rtq_reallyold)) { rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; } ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); } } return 0; } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ #ifdef VIMAGE_GLOBALS static int rtq_timeout; static struct callout rtq_timer; #endif static void in_rtqtimo_one(void *rock); static void in_rtqtimo(void *rock) { int fibnum; void *newrock; struct timeval atv; KASSERT((rock == (void *)V_rt_tables[0][AF_INET]), ("in_rtqtimo: unexpected arg")); for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL) in_rtqtimo_one(newrock); } atv.tv_usec = 0; atv.tv_sec = V_rtq_timeout; callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); } static void in_rtqtimo_one(void *rock) { + INIT_VNET_INET(curvnet); struct radix_node_head *rnh = rock; struct rtqk_arg arg; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = time_uptime + V_rtq_timeout; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); /* * Attempt to be somewhat dynamic about this: * If there are ``too many'' routes sitting around taking up space, * then crank down the timeout, and see if we can't make some more * go away. However, we make sure that we will never adjust more * than once in rtq_timeout seconds, to keep from cranking down too * hard. */ if ((arg.found - arg.killed > V_rtq_toomany) && (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && V_rtq_reallyold > V_rtq_minreallyold) { V_rtq_reallyold = 2 * V_rtq_reallyold / 3; if (V_rtq_reallyold < V_rtq_minreallyold) { V_rtq_reallyold = V_rtq_minreallyold; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", V_rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } } void in_rtqdrain(void) { VNET_ITERATOR_DECL(vnet_iter); struct radix_node_head *rnh; struct rtqk_arg arg; int fibnum; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_NET(vnet_iter); + for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = V_rt_tables[fibnum][AF_INET]; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = 0; arg.draining = 1; arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } static int _in_rt_was_here; /* * Initialize our routing tree. */ int in_inithead(void **head, int off) { INIT_VNET_INET(curvnet); struct radix_node_head *rnh; /* XXX MRT * This can be called from vfs_export.c too in which case 'off' * will be 0. We know the correct value so just use that and * return directly if it was 0. 
* This is a hack that replaces an even worse hack on a bad hack * on a bad design. After RELENG_7 this should be fixed but that * will change the ABI, so for now do it this way. */ if (!rn_inithead(head, 32)) return 0; if (off == 0) /* XXX MRT see above */ return 1; /* only do the rest for a real routing table */ V_rtq_reallyold = 60*60; /* one hour is "really old" */ V_rtq_minreallyold = 10; /* never automatically crank down to less */ V_rtq_toomany = 128; /* 128 cached routes is "too many" */ V_rtq_timeout = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in_addroute; rnh->rnh_matchaddr = in_matroute; rnh->rnh_close = in_clsroute; if (_in_rt_was_here == 0) { callout_init(&V_rtq_timer, CALLOUT_MPSAFE); in_rtqtimo(rnh); /* kick off timeout first time */ _in_rt_was_here = 1; } return 1; } /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes * that point to this address. If we don't do this, we may end up * using the old address in the future. The ones we always want to * get rid of are things like ARP entries, since the user might down * the interface, walk over to a completely different network, and * plug back in. */ struct in_ifadown_arg { struct ifaddr *ifa; int del; }; static int in_ifadownkill(struct radix_node *rn, void *xap) { struct in_ifadown_arg *ap = xap; struct rtentry *rt = (struct rtentry *)rn; RT_LOCK(rt); if (rt->rt_ifa == ap->ifa && (ap->del || !(rt->rt_flags & RTF_STATIC))) { /* * We need to disable the automatic prune that happens * in this case in rtrequest() because it will blow * away the pointers that rn_walktree() needs in order to * continue our descent. We will end up deleting all * the routes that rtrequest() would have in any case, * so that behavior is not needed there. */ rt->rt_flags &= ~RTF_CLONING; rtexpunge(rt); } RT_UNLOCK(rt); return 0; } int in_ifadown(struct ifaddr *ifa, int delete) { INIT_VNET_NET(curvnet); struct in_ifadown_arg arg; struct radix_node_head *rnh; int fibnum; if (ifa->ifa_addr->sa_family != AF_INET) return 1; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = V_rt_tables[fibnum][AF_INET]; arg.ifa = ifa; arg.del = delete; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_ifadownkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } return 0; } /* * inet versions of rt functions. These have fib extensions and * for now will just reference the _fib variants.
* eventually this order will be reversed, */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) { rtalloc_ign_fib(ro, ignflags, fibnum); } int in_rtrequest( int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, fibnum)); } struct rtentry * in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { return (rtalloc1_fib(dst, report, ignflags, fibnum)); } void in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); } void in_rtalloc(struct route *ro, u_int fibnum) { rtalloc_ign_fib(ro, 0UL, fibnum); } #if 0 int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); int in_rtioctl(u_long, caddr_t, u_int); int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); #endif Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 185347) +++ head/sys/netinet/ip_divert.c (revision 185348) @@ -1,769 +1,770 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if !defined(KLD_MODULE) #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_mac.h" #ifndef INET #error "IPDIVERT requires INET." #endif #ifndef IPFIREWALL #error "IPDIVERT requires IPFIREWALL" #endif #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw, see the divert(4) * manpage for features. 
* Internally, packets selected by ipfw in ip_input() or ip_output(), * and never diverted before, are passed to the input queue of the * divert socket with a given 'divert_port' number (as specified in * the matching ipfw rule), and they are tagged with a 16 bit cookie * (representing the rule number of the matching ipfw rule), which * is passed to process reading from the socket. * * Packets written to the divert socket are again tagged with a cookie * (usually the same as above) and a destination address. * If the destination address is INADDR_ANY then the packet is * treated as outgoing and sent to ip_output(), otherwise it is * treated as incoming and sent to ip_input(). * In both cases, the packet is tagged with the cookie. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * ipfw processing will start at the rule number after the one * written in the cookie (so, tagging a packet with a cookie of 0 * will cause it to be effectively considered as a standard packet). */ /* Internal variables. */ #ifdef VIMAGE_GLOBALS static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; #endif static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* * Initialize divert connection block queue. */ static void div_zone_change(void *tag) { uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); } static int div_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "divinp"); return (0); } static void div_inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } void div_init(void) { INIT_VNET_INET(curvnet); INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); LIST_INIT(&V_divcb); V_divcbinfo.ipi_listhead = &V_divcb; /* * XXX We don't use the hash list for divert IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_porthashmask); V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); } /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ void div_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); V_ipstat.ips_noproto++; m_freem(m); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. */ static void divert_packet(struct mbuf *m, int incoming) { INIT_VNET_INET(curvnet); struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (mtag == NULL) { printf("%s: no divert tag\n", __func__); m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. 
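 * (in_delayed_cksum() takes ip_len in host byte order -- see its use of
 * ip->ip_len in the ip_output.c hunk below -- hence the temporary
 * ntohs()/htons() conversion around the call that follows.)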
*/ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { ip->ip_len = ntohs(ip->ip_len); in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; ip->ip_len = htons(ip->ip_len); } /* * Record receive interface address, if any. * But only for incoming packets. */ bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ if (incoming) { struct ifaddr *ifa; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)divert_info(mtag)); INP_INFO_RLOCK(&V_divcbinfo); LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { INP_RLOCK(inp); sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) { SOCKBUF_UNLOCK(&sa->so_rcv); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); INP_RUNLOCK(inp); break; } } INP_INFO_RUNLOCK(&V_divcbinfo); if (sa == NULL) { m_freem(m); V_ipstat.ips_noproto++; V_ipstat.ips_delivered--; } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { INIT_VNET_INET(curvnet); struct m_tag *mtag; struct divert_tag *dt; int error = 0; struct mbuf *options; /* * An mbuf may not have come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; M_SETFIB(m, so->so_fibnum); if (control) m_freem(control); /* XXX */ if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) { mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } dt = (struct divert_tag *)(mtag+1); m_tag_prepend(m, mtag); } else dt = (struct divert_tag *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; dt->cookie = sin->sin_port; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field.
* The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if (i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct ip *const ip = mtod(m, struct ip *); struct inpcb *inp; dt->info |= IP_FW_DIVERT_OUTPUT_FLAG; INP_INFO_WLOCK(&V_divcbinfo); inp = sotoinpcb(so); INP_RLOCK(inp); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); m_freem(m); } else { /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); /* Send packet to output processing */ V_ipstat.ips_rawout++; /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present. */ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); options = NULL; /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requiring a reference to the pcb. */ if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_DONTWAIT); if (options == NULL) error = ENOBUFS; } INP_RUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); if (error == ENOBUFS) { m_freem(m); return (error); } error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); if (options != NULL) m_freem(options); } } else { dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr.
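 *
 * (Illustrative userland counterpart, not part of this change; a divert
 * daemon typically loops along these lines:
 *
 *	n = recvfrom(fd, buf, sizeof(buf), 0,
 *	    (struct sockaddr *)&sin, &slen);
 *	... inspect or rewrite the packet in buf ...
 *	sendto(fd, buf, n, 0, (struct sockaddr *)&sin, slen);
 *
 * Re-using the received sockaddr_in preserves the rule cookie, the
 * interface name stashed in sin_zero[] and, via sin_addr, the
 * incoming/outgoing disposition described above.)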
*/ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC SOCK_LOCK(so); mac_socket_create_mbuf(so, m); SOCK_UNLOCK(so); #endif /* Send packet to input processing via netisr */ netisr_queue(NETISR_IP, m); } return error; cantsend: m_freem(m); return error; } static int div_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); if (error) { INP_INFO_WUNLOCK(&V_divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and requires a valid address. Since divert * sockets don't supply one, we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbbind * and should probably have its own family. */ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { INIT_VNET_INET(so->so_vnet); /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { V_ipstat.ips_toosmall++; m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something.
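 * (The req->oldptr == 0 probe above deliberately over-estimates the
 * buffer -- n + n/8 entries, i.e. 12.5% slack -- so sockets created
 * between the size probe and the copy-out below still fit.)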
*/ INP_INFO_RLOCK(&V_divcbinfo); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&V_divcbinfo); for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) inp_list[i++] = inp; INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_divcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&V_divcbinfo); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif struct pr_usrreqs div_usrreqs = { .pru_attach = div_attach, .pru_bind = div_bind, .pru_control = in_control, .pru_detach = div_detach, .pru_peeraddr = in_getpeeraddr, .pru_send = div_send, .pru_shutdown = div_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel }; struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = div_input, .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, .pr_usrreqs = &div_usrreqs }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; int n; switch (type) { case MOD_LOAD: /* * Protocol will be initialized by pf_proto_register(). * We don't have to register ip_protox because we are not * a true IP protocol that goes over the wire. */ err = pf_proto_register(PF_INET, &div_protosw); ip_divert_ptr = divert_packet; break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. 
*/ INP_INFO_WLOCK(&V_divcbinfo); n = V_divcbinfo.ipi_count; if (n != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; } ip_divert_ptr = NULL; err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); INP_INFO_LOCK_DESTROY(&V_divcbinfo); uma_zdestroy(V_divcbinfo.ipi_zone); break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(ipdivert, 1); Index: head/sys/netinet/ip_fw_pfil.c =================================================================== --- head/sys/netinet/ip_fw_pfil.c (revision 185347) +++ head/sys/netinet/ip_fw_pfil.c (revision 185348) @@ -1,571 +1,572 @@ /*- * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif /* KLD_MODULE */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int fw_enable = 1; #ifdef INET6 int fw6_enable = 1; #endif int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); /* Dummynet hooks. */ ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL; /* Divert hooks. */ ip_divert_packet_t *ip_divert_ptr = NULL; /* ng_ipfw hooks. */ ng_ipfw_input_t *ng_ipfw_input_p = NULL; /* Forward declarations. 
*/ static int ipfw_divert(struct mbuf **, int, int); #define DIV_DIR_IN 1 #define DIV_DIR_OUT 0 int ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; struct ng_ipfw_tag *ng_tag; struct m_tag *dn_tag; int ipfw = 0; int divert; int tee; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif KASSERT(dir == PFIL_IN, ("ipfw_check_in wrong direction!")); bzero(&args, sizeof(args)); ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0, NULL); if (ng_tag != NULL) { KASSERT(ng_tag->dir == NG_IPFW_IN, ("ng_ipfw tag with wrong direction")); args.rule = ng_tag->rule; m_tag_delete(*m0, (struct m_tag *)ng_tag); } again: dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL); if (dn_tag != NULL){ struct dn_pkt_tag *dt; dt = (struct dn_pkt_tag *)(dn_tag+1); args.rule = dt->rule; m_tag_delete(*m0, dn_tag); } args.m = *m0; args.inp = inp; ipfw = ipfw_chk(&args); *m0 = args.m; tee = 0; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); switch (ipfw) { case IP_FW_PASS: if (args.next_hop == NULL) goto pass; #ifdef IPFIREWALL_FORWARD fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in), M_NOWAIT); if (fwd_tag == NULL) goto drop; bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); m_tag_prepend(*m0, fwd_tag); if (in_localip(args.next_hop->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; goto pass; #endif break; /* not reached */ case IP_FW_DENY: goto drop; break; /* not reached */ case IP_FW_DUMMYNET: if (!DUMMYNET_LOADED) goto drop; if (mtod(*m0, struct ip *)->ip_v == 4) ip_dn_io_ptr(m0, DN_TO_IP_IN, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) ip_dn_io_ptr(m0, DN_TO_IP6_IN, &args); if (*m0 != NULL) goto again; return 0; /* packet consumed */ case IP_FW_TEE: tee = 1; /* fall through */ case IP_FW_DIVERT: divert = ipfw_divert(m0, DIV_DIR_IN, tee); if (divert) { *m0 = NULL; return 0; /* packet consumed */ } else { args.rule = NULL; goto again; /* continue with packet */ } case IP_FW_NGTEE: if (!NG_IPFW_LOADED) goto drop; (void)ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 1); goto again; /* continue with packet */ case IP_FW_NETGRAPH: if (!NG_IPFW_LOADED) goto drop; return ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 0); case IP_FW_NAT: goto again; /* continue with packet */ default: KASSERT(0, ("%s: unknown retval", __func__)); } drop: if (*m0) m_freem(*m0); *m0 = NULL; return (EACCES); pass: return 0; /* not filtered */ } int ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; struct ng_ipfw_tag *ng_tag; struct m_tag *dn_tag; int ipfw = 0; int divert; int tee; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif KASSERT(dir == PFIL_OUT, ("ipfw_check_out wrong direction!")); bzero(&args, sizeof(args)); ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0, NULL); if (ng_tag != NULL) { KASSERT(ng_tag->dir == NG_IPFW_OUT, ("ng_ipfw tag with wrong direction")); args.rule = ng_tag->rule; m_tag_delete(*m0, (struct m_tag *)ng_tag); } again: dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL); if (dn_tag != NULL) { struct dn_pkt_tag *dt; dt = (struct dn_pkt_tag *)(dn_tag+1); args.rule = dt->rule; m_tag_delete(*m0, dn_tag); } args.m = *m0; args.oif = ifp; args.inp = inp; ipfw = ipfw_chk(&args); *m0 = args.m; tee = 0; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); switch (ipfw) { case IP_FW_PASS: if (args.next_hop == NULL) goto pass; #ifdef IPFIREWALL_FORWARD /* Overwrite existing tag. 
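 * (Re-use any PACKET_TAG_IPFORWARD tag already attached to the mbuf
 * rather than allocating a second one.)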
*/ fwd_tag = m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag == NULL) { fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in), M_NOWAIT); if (fwd_tag == NULL) goto drop; } else m_tag_unlink(*m0, fwd_tag); bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); m_tag_prepend(*m0, fwd_tag); if (in_localip(args.next_hop->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; goto pass; #endif break; /* not reached */ case IP_FW_DENY: goto drop; break; /* not reached */ case IP_FW_DUMMYNET: if (!DUMMYNET_LOADED) break; if (mtod(*m0, struct ip *)->ip_v == 4) ip_dn_io_ptr(m0, DN_TO_IP_OUT, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) ip_dn_io_ptr(m0, DN_TO_IP6_OUT, &args); if (*m0 != NULL) goto again; return 0; /* packet consumed */ break; case IP_FW_TEE: tee = 1; /* fall through */ case IP_FW_DIVERT: divert = ipfw_divert(m0, DIV_DIR_OUT, tee); if (divert) { *m0 = NULL; return 0; /* packet consumed */ } else { args.rule = NULL; goto again; /* continue with packet */ } case IP_FW_NGTEE: if (!NG_IPFW_LOADED) goto drop; (void)ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 1); goto again; /* continue with packet */ case IP_FW_NETGRAPH: if (!NG_IPFW_LOADED) goto drop; return ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 0); case IP_FW_NAT: goto again; /* continue with packet */ default: KASSERT(0, ("%s: unknown retval", __func__)); } drop: if (*m0) m_freem(*m0); *m0 = NULL; return (EACCES); pass: return 0; /* not filtered */ } static int ipfw_divert(struct mbuf **m, int incoming, int tee) { /* * ipfw_chk() has already tagged the packet with the divert tag. * If tee is set, copy packet and return original. * If not tee, consume packet and send it to divert socket. */ struct mbuf *clone, *reass; struct ip *ip; int hlen; reass = NULL; /* Is divert module loaded? */ if (ip_divert_ptr == NULL) goto nodivert; /* Cloning needed for tee? */ if (tee) clone = m_dup(*m, M_DONTWAIT); else clone = *m; /* In case m_dup was unable to allocate mbufs. */ if (clone == NULL) goto teeout; /* * Divert listeners can only handle non-fragmented packets. * However, when tee is set we will *not* de-fragment the packets; * doing so would put the reassembly into double jeopardy. On top * of that someone doing a tee will probably want to get the packet * in its original form. */ ip = mtod(clone, struct ip *); if (!tee && ip->ip_off & (IP_MF | IP_OFFMASK)) { /* Reassemble packet. */ reass = ip_reass(clone); /* * IP header checksum fixup after reassembly and leave header * in network byte order. */ if (reass != NULL) { ip = mtod(reass, struct ip *); hlen = ip->ip_hl << 2; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(reass, hlen); clone = reass; } else clone = NULL; } else { /* Convert header to network byte order. */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); } /* Do the dirty job... */ if (clone && ip_divert_ptr != NULL) ip_divert_ptr(clone, incoming); teeout: /* * For tee we leave the divert tag attached to original packet. * It will then continue rule evaluation after the tee rule.
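 *
 * (Illustrative ruleset, not part of this change: "ipfw add 100 tee 8668
 * ip from any to any" copies matching packets to the divert socket bound
 * to port 8668 while the originals continue at the next rule; using
 * "divert" instead of "tee" would consume them.)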
*/ if (tee) return 0; /* Packet diverted and consumed */ return 1; nodivert: m_freem(*m); return 1; } static int ipfw_hook(void) { struct pfil_head *pfh_inet; pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); if (pfh_inet == NULL) return ENOENT; pfil_add_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); return 0; } static int ipfw_unhook(void) { struct pfil_head *pfh_inet; pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); if (pfh_inet == NULL) return ENOENT; pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); return 0; } #ifdef INET6 static int ipfw6_hook(void) { struct pfil_head *pfh_inet6; pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); if (pfh_inet6 == NULL) return ENOENT; pfil_add_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); return 0; } static int ipfw6_unhook(void) { struct pfil_head *pfh_inet6; pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); if (pfh_inet6 == NULL) return ENOENT; pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); return 0; } #endif /* INET6 */ int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { + INIT_VNET_IPFW(curvnet); int enable = *(int *)arg1; int error; error = sysctl_handle_int(oidp, &enable, 0, req); if (error) return (error); enable = (enable) ? 1 : 0; if (enable == *(int *)arg1) return (0); if (arg1 == &V_fw_enable) { if (enable) error = ipfw_hook(); else error = ipfw_unhook(); } #ifdef INET6 if (arg1 == &V_fw6_enable) { if (enable) error = ipfw6_hook(); else error = ipfw6_unhook(); } #endif if (error) return (error); *(int *)arg1 = enable; return (0); } static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: if ((err = ipfw_init()) != 0) { printf("ipfw_init() error\n"); break; } if ((err = ipfw_hook()) != 0) { printf("ipfw_hook() error\n"); break; } #ifdef INET6 if ((err = ipfw6_hook()) != 0) { printf("ipfw_hook() error\n"); break; } #endif break; case MOD_UNLOAD: if ((err = ipfw_unhook()) > 0) break; #ifdef INET6 if ((err = ipfw6_unhook()) > 0) break; #endif ipfw_destroy(); break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY - 256); MODULE_VERSION(ipfw, 2); Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 185347) +++ head/sys/netinet/ip_output.c (revision 185348) @@ -1,1195 +1,1196 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); #ifdef VIMAGE_GLOBALS u_short ip_id; #endif #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int len, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct route iproute; struct in_addr odst; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; #endif M_ASSERTPKTHDR(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); INP_LOCK_ASSERT(inp); } if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. 
Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); V_ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; again: /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } #ifdef IPFIREWALL_FORWARD if (ro->ro_rt == NULL && fwd_tag == NULL) { #else if (ro->ro_rt == NULL) { #endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (ro->ro_rt == NULL) #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); #else in_rtalloc_ign(ro, 0, inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); #endif if (ro->ro_rt == NULL) { V_ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. 
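 * Hence the cached route MTU is clamped to the current interface MTU
 * just below.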
*/ if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; mtu = ro->ro_rt->rt_rmx.rmx_mtu; } else { mtu = ifp->if_mtu; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { IN_MULTI_UNLOCK(); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { IN_MULTI_UNLOCK(); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ #ifdef ALTQ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= ifp->if_snd.ifq_maxlen)) #else if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= ifp->if_snd.ifq_maxlen) #endif /* ALTQ */ { error = ENOBUFS; V_ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. 
*/ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip->ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } else goto again; /* Redo the routing table lookup. */ } #ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag) { dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m_tag_delete(m, fwd_tag); goto again; } #endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { V_ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip->ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. 
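 *
 * (Worked example, illustrative: a 14600-byte TSO burst with
 * tso_segsz = 1460 bumps if_opackets by 14600 / 1460 = 10, although
 * each of the ten wire segments carries its own copy of the headers.)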
*/ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) ia->ia_ifa.if_opackets += m->m_pkthdr.len / m->m_pkthdr.tso_segsz; else ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; V_ipstat.ips_cantfrag++; goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) V_ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); } return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { INIT_VNET_INET(curvnet); int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ V_ipstat.ips_cantfrag++; return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? 
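 *
 * (Worked example, assuming 4 KB pages, i.e. PAGE_MASK = 4095: for a
 * 9000-byte jumbo MTU, newlen = 9000 & ~4095 = 8192, so each fragment
 * after the first carries exactly two pages of payload behind the IP
 * header.)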
*/ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; V_ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copy(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip->ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= ip->ip_len) { /* last fragment */ len = ip->ip_len - off; m->m_flags |= M_LASTFRAG; } else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ V_ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) mhip->ip_sum = in_cksum(m, mhlen); *mnext = m; mnext = &m->m_nextpkt; } V_ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip->ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, hlen); done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the appropriate next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. 
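 *
 * (Illustrative userland entry point: setsockopt(s, IPPROTO_IP, IP_TOS,
 * &tos, sizeof(tos)) arrives here as a SOPT_SET request and is handled
 * by the IP_TOS case below.)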
*/ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; } break; #undef OPTSET /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec4_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? 
sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen) { register struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the packet in order to generate checksums. */ copym = m_dup(m, M_DONTWAIT); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter?
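* (Presumably it rarely does: the copy is handed to if_simloop(), which feeds it back through the loopback input path rather than a physical wire, and the loopback MTU is typically large -- 16384 by default on FreeBSD.)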
*/ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif if_simloop(ifp, copym, dst->sin_family, 0); } } Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 185347) +++ head/sys/netinet/tcp_input.c (revision 185348) @@ -1,3185 +1,3186 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include static const int tcprexmtthresh = 3; #ifdef VIMAGE_GLOBALS struct tcpstat tcpstat; int blackhole; int tcp_delack_enabled; int drop_synfin; int tcp_do_rfc3042; int tcp_do_rfc3390; int tcp_do_ecn; int tcp_ecn_maxretries; int tcp_insecure_rst; int tcp_do_autorcvbuf; int tcp_autorcvbuf_inc; int tcp_autorcvbuf_max; #endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, blackhole, 0, "Do not send RST on segments to closed ports"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); #ifdef VIMAGE_GLOBALS struct inpcbhead tcb; struct inpcbinfo tcbinfo; #endif #define tcb6 tcb /* for KAME src sync over BSD*'s */ static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb 
*, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static void inline tcp_congestion_exp(struct tcpcb *); static void inline tcp_congestion_exp(struct tcpcb *tp) { u_int win; win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(struct mbuf *m, int off0) { INIT_VNET_INET(curvnet); #ifdef INET6 INIT_VNET_INET6(curvnet); #endif #ifdef IPSEC INIT_VNET_IPSEC(curvnet); #endif struct tcphdr *th; struct ip *ip = NULL; struct ipovly *ipov; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; const int isipv6 = 0; #endif struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif to.to_flags = 0; V_tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). 
*/ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { V_tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with an unspecified IPv6 destination are * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: Avoid compiler warning. */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { V_tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { V_tcpstat.tcps_rcvbadsum++; goto drop; } /* Re-initialization for later version check */ ip->ip_v = IPVERSION; } #ifdef INET6 if (isipv6) iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; else #endif iptos = ip->ip_tos; /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { V_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { V_tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&V_tcbinfo); findpcb: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL && isipv6 == 0) { /* No IPv6 support yet */ struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this?
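* The first lookup below checks for a PCB matching the packet exactly as addressed; only if that fails do we search for the local socket lying in ambush at the next-hop address recorded in the tag.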
*/ inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } else #endif /* IPFIREWALL_FORWARD */ { if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || tcp_log_in_vain == 2) { if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK(inp); #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { V_ipsec6stat.in_polvio++; goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { V_ipsec4stat.in_polvio++; goto dropunlock; } #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch * stray or duplicate segments arriving late. If this segment * was a legitimate new connection attempt the old INPCB gets * removed and we can try again to find a listening socket. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; INP_INFO_WUNLOCK(&V_tcbinfo); return; } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) { #ifdef INET6 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); #endif } else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. 
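* In outline: a bare SYN is answered with a SYN|ACK and recorded only as a compressed syncache entry (syncache_add() below); the ACK completing the handshake is then matched against that entry and expanded into a full socket in SYN_RECEIVED state (syncache_expand() below).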
*/ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); bzero(&inc, sizeof(inc)); inc.inc_isipv6 = isipv6; #ifdef INET6 if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ if (!syncache_expand(&inc, &to, th, &so, m)) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); INP_WLOCK(inp); /* new connection */ tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); V_tcpstat.tcps_badsyn++; goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ V_tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. 
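* (This behaviour is controlled by the net.inet.tcp.drop_synfin sysctl, which defaults to off.)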
*/ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); V_tcpstat.tcps_badsyn++; goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated addresses are forbidden, * we do not accept SYNs to a deprecated interface * address, to prevent any new inbound connection from * getting established. * When we do not accept the SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise since it is much better to send * the peer a RST, and the RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple passages describing deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them.
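* For instance (addresses purely illustrative): a SYN sourced from 255.255.255.255, or one whose destination matches a local subnet broadcast address such as 192.0.2.255 on a /24, is silently dropped below rather than answered.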
*/ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } if (isipv6) { #ifdef INET6 if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } #endif } else { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); syncache_add(&inc, &to, th, inp, &so, m); /* * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). */ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; dropwithreset: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_INFO_WUNLOCK(&V_tcbinfo); if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if (inp != NULL) INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); drop: INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); } static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { INIT_VNET_INET(tp->t_vnet); int thflags, acked, ourfinisacked, needoutput = 0; int headlocked = 1; int rstreason, todrop, win; u_long tiwin; struct tcpopt to; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. 
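* (tcp_keepidle defaults to two hours, the minimum idle interval RFC 1122 allows before keep-alive probes may be sent.)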
*/ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; V_tcpstat.tcps_ecn_ce++; break; case IPTOS_ECN_ECT0: V_tcpstat.tcps_ecn_ect0++; break; case IPTOS_ECN_ECT1: V_tcpstat.tcps_ecn_ect1++; break; } if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; /* * Congestion experienced. * Ignore if we are already trying to recover. */ if ((thflags & TH_ECE) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { V_tcpstat.tcps_ecn_rcwnd++; tcp_congestion_exp(tp); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, ticks)) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * <SYN> or <SYN,ACK>) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26).
*/ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT) && tp->t_dupacks < tcprexmtthresh) || ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure ack for outstanding data. */ ++V_tcpstat.tcps_predack; /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; V_tcpstat.tcps_rcvackpack++; V_tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* Some progress has been made. */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++V_tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. 
*/ tp->rcv_up = tp->rcv_nxt; V_tcpstat.tcps_rcvpack++; V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (e.g., trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit them * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 2. received bytes per RTT is within seven eighths of the * current socket buffer size; * 3. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (to.to_tsecr > tp->rfbuf_ts && to.to_tsecr - tp->rfbuf_ts < hz) { if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); } /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else tp->rfbuf_cnt += tlen; /* add up */ } /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /* Reset receive buffer auto scaling when not in bulk receive mode. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST.
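* Example (numbers illustrative): if our SYN|ACK left snd_una = 5000 and snd_max = 5001, only th_ack = 5001 is acceptable; an ACK of 4000 or 9000 draws the rate-limited RST below.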
*/ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) tp = tcp_drop(tp, ECONNREFUSED); if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { V_tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; V_tcpstat.tcps_ecn_shs++; } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_state = TCPS_SYN_RECEIVED; } KASSERT(headlocked, ("%s: trimthenstep6: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; V_tcpstat.tcps_rcvpackafterwin++; V_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. 
Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall through and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the initial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test, examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337.
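* A worked case (numbers illustrative): with last_ack_sent = 1000 and rcv_wnd = 65535, a RST carrying seq 30000 passes the general in-window test below, yet on an ESTABLISHED connection with tcp_insecure_rst off it is still counted as a bad RST and dropped, since its seq is not within 1 of either last_ack_sent or rcv_nxt.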
*/ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (V_tcp_insecure_rst == 0 && !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { V_tcpstat.tcps_badrst++; goto drop; } /* FALLTHROUGH */ case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; V_tcpstat.tcps_drops++; KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close.2: head not locked", __func__)); tp = tcp_close(tp); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { V_tcpstat.tcps_rcvduppack++; V_tcpstat.tcps_rcvdupbyte += tlen; V_tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; V_tcpstat.tcps_rcvduppack++; V_tcpstat.tcps_rcvdupbyte += todrop; } else { V_tcpstat.tcps_rcvpartduppack++; V_tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. 
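* (SS_NOFDREF marks a socket whose file descriptor has been closed while the protocol still holds a reference; data arriving beyond CLOSE_WAIT can never be read by anyone, hence the reset.)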
*/ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { char *s; KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " "not locked", __func__)); if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " "was closed, sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); V_tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { V_tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { V_tcpstat.tcps_rcvbyteafterwin += tlen; /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; V_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else V_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " "head not locked", __func__)); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: V_tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? 
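* (Scaling takes effect only if both TF_RCVD_SCALE and TF_REQ_SCALE are set, i.e. each side offered the option in its SYN.)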
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up-to-date window information, we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { V_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { V_tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (i.e., window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp))) { if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery.
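* Worked example (values assumed for illustration): with t_maxseg = 1460 and snd_wnd = snd_cwnd = 65535, the third duplicate ACK drives tcp_congestion_exp() to set snd_ssthresh to (65535 / 2 / 1460) * 1460 = 32120; cwnd is then collapsed to a single segment for the retransmit and each further duplicate ACK inflates it by t_maxseg until new data is acked.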
*/ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp)) { tp->t_dupacks = 0; break; } } else if (V_tcp_do_newreno || V_tcp_do_ecn) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } tcp_congestion_exp(tp); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { V_tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: KASSERT(headlocked, ("%s: process_ACK: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; V_tcpstat.tcps_rcvackpack++; V_tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. 
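* (t_badrxtwin is armed when the first retransmission is sent; an ACK that returns inside that window most likely acknowledges the original transmission rather than the retransmit, so the congestion response is rolled back.)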
*/ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). * If cwnd > maxseg^2, fix the cwnd increment at 1 byte * to avoid capping cwnd (as suggested in RFC2581). */ if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || !IN_FASTRECOVERY(tp)) { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = max((incr * incr / cw), 1); tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2.
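* (ourfinisacked is computed above: if the ACK covers more than everything still in the send buffer, it must also cover our FIN, which consumes one extra sequence number.)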
*/ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { int timeout; soisdisconnected(so); timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : tcp_maxidle; tcp_timer_activate(tp, TT_2MSL, timeout); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("%s: process_ACK: " "head not locked", __func__)); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { KASSERT(headlocked, ("%s: process_ACK: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); goto drop; } break; } } step6: KASSERT(headlocked, ("%s: step6: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) V_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. 
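* The pullout below is only attempted when the urgent byte lies within this segment (th_urp <= tlen) and the application has not requested inline delivery with SO_OOBINLINE.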
*/ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; V_tcpstat.tcps_rcvpack++; V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually free any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received, ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state.
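* (Receiving a FIN here is the simultaneous-close case: both ends sent a FIN before either one was acknowledged.)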
*/ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("%s: dodata: " "TCP_FIN_WAIT_2: head not locked", __func__)); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); return; } } INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(headlocked == 0, ("%s: check_delack: head locked", __func__)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (headlocked) INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. 
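* Also never answer a RST with another RST; two stacks could otherwise bounce resets back and forth indefinitely.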
*/ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } else #endif { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { INIT_VNET_INET(curvnet); int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; V_tcpstat.tcps_sack_rcv_blocks++; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. 
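* The byte is saved in t_iobc (and TCPOOB_HAVEDATA set) and then spliced out of the mbuf chain by shifting the trailing bytes down one position.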
*/ static void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); int delta; INP_WLOCK_ASSERT(tp->t_inpcb); V_tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 32). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs.
If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, struct hc_metrics_lite *metricptr, int *mtuflags) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); int mss; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags); tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags); tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxopd above. */ offer = tp->t_maxopd; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in the tcp hostcache, use it; * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default.
But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if * all the option space is used (40 bytes). Otherwise * funny things may happen in tcp_output. */ mss = max(mss, 64); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates that no segments were received yet. * In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int rtt, mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; int mtuflags = 0; #ifdef INET6 int isipv6; #endif KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); + INIT_VNET_INET(tp->t_vnet); tcp_mss_update(tp, offer, &metrics, &mtuflags); mss = tp->t_maxseg; inp = tp->t_inpcb; #ifdef INET6 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; #endif /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too.
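* The hostcache may remember srtt, rttvar, ssthresh, bandwidth and cwnd from earlier connections to the same peer; where such metrics exist, seed this connection from them instead of starting blind.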
*/ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; V_tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; V_tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); V_tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not to make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overload the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't get lost. * We currently check only in syncache_socket for that. */ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * V_ss_fltsz_local; else tp->snd_cwnd = mss * V_ss_fltsz; /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) tp->t_flags |= TF_TSO; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * When a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces the retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.)
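* The cwnd clamp below lets exactly the next unacknowledged segment out; afterwards the saved cwnd is restored and deflated by the newly acknowledged amount, roughly as NewReno's partial-ACK handling prescribes.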
*/ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > th->th_ack - tp->snd_una) tp->snd_cwnd -= th->th_ack - tp->snd_una; else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 185347) +++ head/sys/netinet/tcp_subr.c (revision 185348) @@ -1,2300 +1,2306 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #include #endif /*IPSEC*/ #include #include #include #ifdef VIMAGE_GLOBALS int tcp_mssdflt; #ifdef INET6 int tcp_v6mssdflt; #endif int tcp_minmss; int tcp_do_rfc1323; static int icmp_may_rst; static int tcp_isn_reseed_interval; static int tcp_inflight_enable; static int tcp_inflight_rttthresh; static int tcp_inflight_min; static int tcp_inflight_max; static int tcp_inflight_stab; #endif static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { - INIT_VNET_INET(TD_TO_VNET(curthread)); + INIT_VNET_INET(curvnet); int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLTYPE_INT|CTLFLAG_RW, - &tcp_mssdflt, 0, &sysctl_net_inet_tcp_mss_check, "I", - "Default TCP Maximum Segment Size"); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLTYPE_INT|CTLFLAG_RW, tcp_mssdflt, 0, + &sysctl_net_inet_tcp_mss_check, "I", + "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { - INIT_VNET_INET6(TD_TO_VNET(curthread)); + INIT_VNET_INET6(curvnet); int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLTYPE_INT|CTLFLAG_RW, - &tcp_v6mssdflt, 0, &sysctl_net_inet_tcp_mss_v6_check, "I", - "Default TCP Maximum Segment Size for IPv6"); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLTYPE_INT|CTLFLAG_RW, tcp_v6mssdflt, 0, + &sysctl_net_inet_tcp_mss_v6_check, "I", + "Default TCP Maximum Segment Size for IPv6"); #endif /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
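* As a rough illustration: at an MSS of 20, a single 1460-byte write fans out into 73 segments, each carrying at least 40 bytes of IP and TCP header, instead of one.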
*/ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, tcp_minmss, 0, "Minimum TCP Maximum Segment Size"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, rttthresh, + CTLTYPE_INT|CTLFLAG_RW, tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, + "I", "RTT threshold below which inflight will deactivate itself"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts.
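* tcpcb_mem below therefore bundles the tcpcb and its timers into one UMA allocation, so both parts share a single lifetime while userland continues to see a plain struct tcpcb.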
*/ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; }; static uma_zone_t tcpcb_zone; MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { INIT_VNET_INET(curvnet); int hashsize; V_blackhole = 0; V_tcp_delack_enabled = 1; V_drop_synfin = 0; V_tcp_do_rfc3042 = 1; V_tcp_do_rfc3390 = 1; V_tcp_do_ecn = 0; V_tcp_ecn_maxretries = 1; V_tcp_insecure_rst = 0; V_tcp_do_autorcvbuf = 1; V_tcp_autorcvbuf_inc = 16*1024; V_tcp_autorcvbuf_max = 256*1024; V_tcp_mssdflt = TCP_MSS; #ifdef INET6 V_tcp_v6mssdflt = TCP6_MSS; #endif V_tcp_minmss = TCP_MINMSS; V_tcp_do_rfc1323 = 1; V_icmp_may_rst = 1; V_tcp_isn_reseed_interval = 0; V_tcp_inflight_enable = 1; V_tcp_inflight_min = 6144; V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; V_tcp_inflight_stab = 20; V_path_mtu_discovery = 1; V_ss_fltsz = 1; V_ss_fltsz_local = 4; V_tcp_do_newreno = 1; V_tcp_do_tso = 1; V_tcp_do_autosndbuf = 1; V_tcp_autosndbuf_inc = 8*1024; V_tcp_autosndbuf_max = 256*1024; V_nolocaltimewait = 0; V_tcp_do_sack = 1; V_tcp_sack_maxholes = 128; V_tcp_sack_globalmaxholes = 65536; V_tcp_sack_globalholes = 0; tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); LIST_INIT(&V_tcb); V_tcbinfo.ipi_listhead = &V_tcb; hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, &V_tcbinfo.ipi_hashmask); V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, &V_tcbinfo.ipi_porthashmask); V_tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR /* * These have to be type stable for the benefit of the timers. 
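* UMA_ZONE_NOFREE below keeps the zone's backing pages from ever being released, so a timer that fires after its tcpcb was freed still dereferences memory of the expected type.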
*/ tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_init(); syncache_init(); tcp_hc_init(); tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void tcp_fini(void *xtp) { callout_stop(&isn_callout); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else #endif { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
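* (The m == NULL path serves the keepalive case via the template from tcpip_maketemplate(); the m != NULL path recycles the offending segment, as tcp_dropwithreset() does.)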
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { INIT_VNET_INET(curvnet); int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp != NULL ?
tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); else #endif /* INET6 */ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { INIT_VNET_INET(inp->inp_vnet); struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output_reset(tp); V_tcpstat.tcps_drops++; } else V_tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { INIT_VNET_INET(tp->t_vnet); struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we * delete the PCB. 
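* callout_stop() does not wait for a handler that is already running; the type-stable tcpcb zone is what keeps such a late-firing timer from scribbling on reused memory.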
*/ callout_stop(&tp->t_timers->tt_rexmt); callout_stop(&tp->t_timers->tt_persist); callout_stop(&tp->t_timers->tt_keep); callout_stop(&tp->t_timers->tt_2msl); callout_stop(&tp->t_timers->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us a better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; /* XXX: This wraps if the pipe is more than 4 Gbit per second */ metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; } /* Disconnect offload device, if any. */ tcp_offload_detach(tp); tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* Notify any offload devices of listener close */ if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_close(tp); in_pcbdrop(inp); V_tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one...
* XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; V_tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); } INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } /* * Notify a tcp user of an asynchronous error; * store the error as a soft error, but do not wake up the user, since * there currently is no mechanism for reporting soft errors * (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; +#ifdef INVARIANTS + INIT_VNET_INET(inp->inp_vnet); /* V_tcbinfo WLOCK ASSERT */ +#endif INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { m = syncache_pcbcount(); n = V_tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + ((m + n) + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing.
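* (This list is exported through the net.inet.tcp.pcblist sysctl * defined further below; userland tools such as netstat(1) are its * usual consumers, which is why per-entry credential checks are * applied here.)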
*/ if (inp->inp_vflag & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) inp_list[i++] = inp; } INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_INFO_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_tcbinfo); error = ENOENT; } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); INIT_VNET_INET6(curvnet); struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, mapped = 0; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } INP_INFO_RLOCK(&V_tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr 
*)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_tcbinfo); error = ENOENT; } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { INIT_VNET_INET(curvnet); struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is deprecated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it would make an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_WLOCK(inp); if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_flags = 0; /* IPv4 */ inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. ip->ip_len has already * been swapped in icmp_input(). */ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); if (mtu < max(296, V_tcp_minmss + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) mtu = V_tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do the right thing by itself.
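* As a worked example, assuming the stock net.inet.tcp.minmss of * 216: the floor above evaluates to max(296, 216 + 40) = 296 bytes, * so any smaller proposal is discarded and V_tcp_mssdflt plus the * 40 byte header is used instead.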
*/ if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_WUNLOCK(inp); } else { inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&V_tcbinfo); } else in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { INIT_VNET_INET(curvnet); struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is deprecated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non-NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; INP_INFO_WLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires periodically * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND.
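* As a worked illustration of the constants below: with * ISN_BYTES_PER_SECOND == 1048576 the 32-bit sequence space wraps * after 2^32 / 1048576 == 4096 seconds, a little over an hour, and * tcp_isn_tick() runs every hz/100 ticks, advancing the projected * offset by 1048576 / 100 bytes per run.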
The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #ifdef VIMAGE_GLOBALS static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; #endif tcp_seq tcp_new_isn(struct tcpcb *tp) { INIT_VNET_INET(tp->t_vnet); u_int32_t md5_buffer[4]; tcp_seq new_isn; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&V_isn_ctx); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&V_isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &V_isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary * to keep time flowing at a relatively constant rate. If the random * increments have already pushed us past the projected offset, do nothing. */ static void tcp_isn_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); u_int32_t projected_offset; ISN_LOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ INIT_VNET_INET(curvnet); projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
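* (That is the V_icmp_may_rst test in tcp_ctlinput() above: with the * sysctl enabled, an ICMP port-unreachable or administratively * prohibited error arriving while a connection is still in SYN-SENT * aborts it immediately rather than letting it retry until the * connection timeout.)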
*/ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { #ifdef INVARIANTS INIT_VNET_INET(inp->inp_vnet); #endif struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { INIT_VNET_INET(inp->inp_vnet); struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); V_tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp); tcp_output_send(tp); return (inp); } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; in_rtalloc_ign(&sro, RTF_CLONING, inc->inc_fibnum); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); /* Report additional interface capabilities. */ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro.ro_rt); } return (maxmtu); } #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; rtalloc_ign((struct route *)&sro6, RTF_CLONING); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. 
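* Callers are expected to subtract this overhead from the path MTU * when sizing the MSS (see the MSS computation in tcp_input.c).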
*/ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar to (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only affects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches. A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typically 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window.
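* For instance, a raw bandwidth sample taken over a single tick can * easily be off by an order of magnitude, which is why the code below * low-pass filters the samples with a 15/16 exponential average.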
We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { INIT_VNET_INET(tp->t_vnet); u_long bw; u_long bwnd; int save_ticks; INP_WLOCK_ASSERT(tp->t_inpcb); /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (V_tcp_inflight_enable == 0 || tp->t_rttlow < V_tcp_inflight_rttthresh) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rolls over, 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case and stabilization. * Without the slop we could be locking ourselves into a lower * bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). * * (4) Stability value (defaults to 20 = 2 maximal packets) can * be adjusted with a sysctl but typically only needs to be * adjusted on very slow connections. A value no smaller than 5 * should be used, but only reduce this default if you have * no other choice. */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < V_tcp_inflight_min) bwnd = V_tcp_inflight_min; if (bwnd > V_tcp_inflight_max) bwnd = V_tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * Compute TCP-MD5 hash of a TCP segment.
(RFC2385) * * Parameters: * m pointer to head of mbuf chain * _unused * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * Return 0 if successful, otherwise return EINVAL. * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 * right now. Another branch of this code exists which uses the SPD to * specify per-application flows but it is unstable. */ int tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { + INIT_VNET_IPSEC(curvnet); union sockaddr_union dst; struct ippseudo ippseudo; MD5_CTX ctx; int doff; struct ip *ip; struct ipovly *ipovly; struct secasvar *sav; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; char ip6buf[INET6_ADDRSTRLEN]; uint32_t plen; uint16_t nhdr; #endif u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst.sa.sa_len = sizeof(struct sockaddr_in6); dst.sa.sa_family = AF_INET6; dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? ip6->ip6_src : ip6->ip6_dst; break; #endif default: return (EINVAL); /* NOTREACHED */ break; } /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : #ifdef INET6 (ip->ip_v == (IPV6_VERSION >> 4)) ? ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); return (EINVAL); } MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; #ifdef INET6 /* * RFC 2385, 2.0 Proposal * For IPv6, the pseudo-header is as described in RFC 2460, namely the * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- * extended next header value (to form 32 bits), and 32-bit segment * length.
* Note: Upper-Layer Packet Length comes before Next Header. */ case (IPV6_VERSION >> 4): in6 = ip6->ip6_src; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); in6 = ip6->ip6_dst; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); plen = htonl(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); nhdr = 0; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); nhdr = IPPROTO_TCP; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; break; #endif default: return (EINVAL); /* NOTREACHED */ break; } /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); #ifdef INET6 INIT_VNET_INET6(curvnet); #endif /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; struct in6_addr f6, l6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: return (EINVAL); } INP_INFO_WLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup_hash(&V_tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { INP_WLOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its 
timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ /* Is logging enabled? */ if (tcp_log_debug == 0 && tcp_log_in_vain == 0) return (NULL); s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && inc->inc_isipv6 == 0) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 185347) +++ head/sys/netinet/tcp_syncache.c (revision 185348) @@ -1,1764 +1,1768 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. 
under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include #ifdef VIMAGE_GLOBALS static struct tcp_syncache tcp_syncache; static int tcp_syncookies; static int tcp_syncookiesonly; int tcp_sc_rst_sock_fail; #endif -SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, - &tcp_syncookies, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, syncookies, + CTLFLAG_RW, tcp_syncookies, 0, "Use TCP SYN cookies if the syncache overflows"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, - &tcp_syncookiesonly, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, syncookies_only, + CTLFLAG_RW, tcp_syncookiesonly, 0, "Use only TCP SYN cookies"); #ifdef TCP_OFFLOAD_DISABLE #define TOEPCB_ISSET(sc) (0) #else #define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); static int syncache_respond(struct syncache *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static void syncookie_generate(struct syncache_head *, struct syncache *, u_int32_t *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcpopt *, struct tcphdr *, struct socket *); /* * Transmit the 
SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, and * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, tcp_syncache.cache_count, 0, "Current number of entries in syncache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_RW, tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { INIT_VNET_INET(curvnet); if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { INIT_VNET_INET(curvnet); int i; V_tcp_syncookies = 1; V_tcp_syncookiesonly = 0; V_tcp_sc_rst_sock_fail = 1; V_tcp_syncache.cache_count = 0; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table.
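* With the defaults above that amounts to 512 buckets of at most 30 * entries each, i.e. a cache_limit of 512 * 30 == 15360 entries, * unless overridden by the loader tunables fetched above.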
*/ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); } /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { INIT_VNET_INET(sch->sch_vnet); struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); V_tcpstat.tcps_sc_bucketoverflow++; } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); V_tcp_syncache.cache_count++; V_tcpstat.tcps_sc_added++; } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { INIT_VNET_INET(sch->sch_vnet); SCH_LOCK_ASSERT(sch); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifndef TCP_OFFLOAD_DISABLE if (sc->sc_tu) sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); #endif syncache_free(sc); V_tcp_syncache.cache_count--; } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; - INIT_VNET_INET(sch->sch_vnet); struct syncache *sc, *nsc; int tick = ticks; char *s; + CURVNET_SET(sch->sch_vnet); + INIT_VNET_INET(sch->sch_vnet); + /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happen often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK.
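* As a worked example with the defaults: TCPTV_RTOBASE is 3 seconds * and tcp_backoff[] doubles each step, so an unanswered entry is * retransmitted at intervals of roughly 3, 6, 12 and 24 seconds and * then dropped, which is where the 45 second figure quoted earlier * comes from.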
*/ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); V_tcpstat.tcps_sc_stale++; continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } (void) syncache_respond(sc); V_tcpstat.tcps_sc_retransmitted++; syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); + CURVNET_RESTORE(); } /* * Find an entry in the syncache. * Always returns with a locked syncache_head plus a matching entry or NULL. */ struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; #ifdef INET6 if (inc->inc_isipv6) { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } else #endif { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } SCH_LOCK_ASSERT(*schp); return (NULL); /* always returns with locked sch */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); V_tcpstat.tcps_badrst++; goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); V_tcpstat.tcps_badrst++; goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window.
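* For the syncache this translates into the SEQ_GEQ()/SEQ_LEQ() test * below against the window [sc_irs, sc_irs + sc_wnd].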
* * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); V_tcpstat.tcps_sc_reset++; } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "IRS %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); V_tcpstat.tcps_badrst++; } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); V_tcpstat.tcps_sc_badack++; } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) goto done; /* * If we've retransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); V_tcpstat.tcps_sc_unreach++; done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { INIT_VNET_INET(lso->so_vnet); struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; char *s; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, SS_ISCONNECTED); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ V_tcpstat.tcps_listendrop++; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_mbuf(m, so); SOCK_UNLOCK(so); #endif inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = sc->sc_inc.inc_fibnum; so->so_fibnum = sc->sc_inc.inc_fibnum; INP_WLOCK(inp); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif inp->inp_lport = sc->sc_inc.inc_lport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists.
*/ #ifdef INET6 if (sc->sc_inc.inc_isipv6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto abort; } #ifdef IPSEC /* Copy old policy into new socket's. */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts is not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different from the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if (in6_pcbconnect(inp, (struct sockaddr *)&sin6, thread0.td_ucred)) { inp->in6p_laddr = laddr6; goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; inp->in6p_flowinfo |= sc->sc_flowlabel; } else #endif { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if (in_pcbconnect(inp, (struct sockaddr *)&sin, thread0.td_ucred)) { inp->inp_laddr = laddr; goto abort; } } tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = ticks; tp->ts_offset = sc->sc_tsoff; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } if (sc->sc_flags & SCF_ECN) tp->t_flags |= TF_ECN_PERMIT; /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
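* A retransmitted SYN,ACK suggests that a segment was lost on this * path, so the connection starts slow-start conservatively from a * single segment instead of the usual initial window.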
*/ if (sc->sc_rxmits) tp->snd_cwnd = tp->t_maxseg; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); INP_WUNLOCK(inp); V_tcpstat.tcps_accepts++; return (so); abort: INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ - if (!tcp_syncookies) { + if (!V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } } else { /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; V_tcp_syncache.cache_count--; SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if ((SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) V_tcpstat.tcps_sc_aborted++; else V_tcpstat.tcps_sc_completed++; /* how do we find the inp for the new socket? 
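 * (It is reachable through the returned socket: syncache_socket()
 * handed the new socket back via *lsop above, so sotoinpcb(*lsop)
 * yields its inpcb.)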
*/ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } int tcp_offload_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { + INIT_VNET_INET(curvnet); int rc; INP_INFO_WLOCK(&V_tcbinfo); rc = syncache_expand(inc, to, th, lsop, m); INP_INFO_WUNLOCK(&V_tcbinfo); return (rc); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ static void _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, struct toe_usrreqs *tu, void *toepcb) { INIT_VNET_INET(inp->inp_vnet); struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int32_t flowtmp; int win, sb_hiwat, ip_ttl, ip_tos, noopt; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ so = *lsop; tp = sototcpcb(so); cred = crhold(so->so_cred); #ifdef INET6 if (inc->inc_isipv6 && (inp->in6p_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; win = sbspace(&so->so_rcv); sb_hiwat = so->so_rcv.sb_hiwat; noopt = (tp->t_flags & TF_NOOPT); /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); goto done; } else mac_syncache_create(maclabel, inp); #endif INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); /* * Remember the IP options, if any. */ #ifdef INET6 if (!inc->inc_isipv6) #endif ipopts = (m) ? ip_srcroute(m) : NULL; /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { #ifndef TCP_OFFLOAD_DISABLE if (sc->sc_tu) sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, sc->sc_toepcb); #endif V_tcpstat.tcps_sc_dupsyn++; if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. 
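 * A duplicate SYN carries the peer's latest tsval, so refresh
 * sc_tsreflect below and the retransmitted SYN|ACK echoes a current
 * timestamp; a duplicate SYN that stopped sending timestamps clears
 * SCF_TIMESTAMP instead.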
*/ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); KASSERT(sc->sc_label != NULL, ("%s: label not initialized", __func__)); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); V_tcpstat.tcps_sndacks++; V_tcpstat.tcps_sndtotal++; } SCH_UNLOCK(sch); goto done; } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ V_tcpstat.tcps_sc_zonefail++; if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { - if (tcp_syncookies) { + if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; cred = NULL; sc->sc_ipopts = ipopts; sc->sc_inc.inc_fibnum = inp->inp_inc.inc_fibnum; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!inc->inc_isipv6) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifndef TCP_OFFLOAD_DISABLE sc->sc_tu = tu; sc->sc_toepcb = toepcb; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = ticks; sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatibility problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #ifdef TCP_SIGNATURE /* * If listening socket requested TCP digests, and received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK.
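 * ("The right thing" here means setting TOF_SIGNATURE on the
 * outgoing options and computing the MD5 digest over the segment
 * via tcp_signature_compute() before it is sent.)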
* XXX: Currently we always record the option by default and will * attempt to use it in syncache_respond(). */ if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (noopt) sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; - if (tcp_syncookies) { + if (V_tcp_syncookies) { syncookie_generate(sch, sc, &flowtmp); #ifdef INET6 if (autoflowlabel) sc->sc_flowlabel = flowtmp; #endif } else { #ifdef INET6 if (autoflowlabel) sc->sc_flowlabel = (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); #endif } SCH_UNLOCK(sch); /* * Do a standard 3-way handshake. */ if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { - if (tcp_syncookies && tcp_syncookiesonly && sc != &scs) + if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ V_tcpstat.tcps_sndacks++; V_tcpstat.tcps_sndtotal++; } else { if (sc != &scs) syncache_free(sc); V_tcpstat.tcps_sc_dropped++; } done: if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif if (m) { *lsop = NULL; m_freem(m); } } static int syncache_respond(struct syncache *sc) { INIT_VNET_INET(curvnet); struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th; int optlen, error; u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } else #endif { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = tlen; ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. 
Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= IP_DF; th = (struct tcphdr *)(ip + 1); } th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; if (sc->sc_flags & SCF_ECN) { th->th_flags |= TH_ECE; V_tcpstat.tcps_ecn_shs++; } /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tcp_signature_compute(m, 0, 0, optlen, to.to_signature, IPSEC_DIR_OUTBOUND); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len += optlen; } else optlen = 0; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen + optlen - hlen); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } return (error); } void syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m) { _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); } void tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct toe_usrreqs *tu, void *toepcb) { INIT_VNET_INET(curvnet); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); _syncache_add(inc, to, th, inp, lsop, NULL, tu, toepcb); } /* * The purpose of SYN cookies is to avoid keeping track of all SYN's we * receive and to be able to handle SYN floods from bogus source addresses * (where we will never receive any reply). SYN floods try to exhaust all * our memory and available slots in the SYN cache table to cause a denial * of service to legitimate users of the local host. * * The idea of SYN cookies is to encode and include all necessary information * about the connection setup state within the SYN-ACK we send back and thus * to get along without keeping any local state until the ACK to the SYN-ACK * arrives (if ever). Everything we need to know should be available from * the information we encoded in the SYN-ACK. 
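 *
 * To make this concrete: in syncookies-only mode (V_tcp_syncookiesonly),
 * or when both the zone allocator and the bucket are exhausted, no
 * entry is kept at all. syncookie_generate() folds the connection
 * tuple, the peer's ISN and a rotating secret into the ISS of our
 * SYN|ACK; when the handshake-completing ACK arrives,
 * syncookie_lookup() recomputes the digest from the ACK'ed ISS and,
 * if it matches, rebuilds the entry in a caller-supplied struct
 * syncache on the stack.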
* * More information about the theory behind SYN cookies and their first * discussion and specification can be found at: * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (gory details) * * This implementation extends the original idea and first implementation * of FreeBSD by using not only the initial sequence number field to store * information but also the timestamp field if present. This way we can * keep track of the entire state we need to know to recreate the session in * its original form. Almost all TCP speakers implement RFC1323 timestamps * these days. For those that do not we still have to live with the known * shortcomings of the ISN only SYN cookies. * * Cookie layers: * * Initial sequence number we send: * 31|................................|0 * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP * D = MD5 Digest (first dword) * M = MSS index * R = Rotation of secret * P = Odd or Even secret * * The MD5 Digest is computed over the following parameters: * a) randomly rotated secret * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6) * c) the received initial sequence number from remote host * d) the rotation offset and odd/even bit * * Timestamp we send: * 31|................................|0 * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5 * D = MD5 Digest (third dword) (only as filler) * S = Requested send window scale * R = Requested receive window scale * A = SACK allowed * 5 = TCP-MD5 enabled (not implemented yet) * XORed with MD5 Digest (fourth dword) * * The timestamp isn't cryptographically secure and doesn't need to be. * The double use of the MD5 digest dwords ties it to a specific remote/ * local host/port, remote initial sequence number and our local time * limited secret. A received timestamp is reverted (XORed) and then * the contained MD5 dword is compared to the computed one to ensure the * timestamp belongs to the SYN-ACK we sent. The other parameters may * have been tampered with but this isn't different from supplying bogus * values in the SYN in the first place. * * Some problems with SYN cookies remain however: * Consider the problem of a recreated (and retransmitted) cookie. If the * original SYN was accepted, the connection is established. The second * SYN is inflight, and if it arrives with an ISN that falls within the * receive window, the connection is killed. * * Notes: * A heuristic to determine when to accept syn cookies is not necessary. * An ACK flood would cause the syncookie verification to be attempted, * but a SYN flood causes syncookies to be generated. Both are of equal * cost, so there's no point in trying to optimize the ACK flood case. * Also, if you don't process certain ACKs for some reason, then all someone * would have to do is launch a SYN and ACK flood at the same time, which * would stop cookie verification and defeat the entire purpose of syncookies. */ static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 }; static void syncookie_generate(struct syncache_head *sch, struct syncache *sc, u_int32_t *flowlabel) { INIT_VNET_INET(curvnet); MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data; u_int32_t *secbits; u_int off, pmss, mss; int i; SCH_LOCK_ASSERT(sch); /* Which of the two secrets to use. */ secbits = sch->sch_oddeven ? sch->sch_secbits_odd : sch->sch_secbits_even; /* Reseed secret if too old. */ if (sch->sch_reseed < time_uptime) { sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */ secbits = sch->sch_oddeven ?
sch->sch_secbits_odd : sch->sch_secbits_even; for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++) secbits[i] = arc4random(); sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME; } /* Secret rotation offset. */ off = sc->sc_iss & 0x7; /* iss was randomized before */ /* Maximum segment size calculation. */ pmss = max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss); for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) if (tcp_sc_msstab[mss] <= pmss) break; /* Fold parameters and MD5 digest into the ISN we will send. */ data = sch->sch_oddeven;/* odd or even secret, 1 bit */ data |= off << 1; /* secret offset, derived from iss, 3 bits */ data |= mss << 4; /* mss, 3 bits */ MD5Init(&ctx); MD5Update(&ctx, ((u_int8_t *)secbits) + off, SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); MD5Update(&ctx, secbits, off); MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc)); MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs)); MD5Update(&ctx, &data, sizeof(data)); MD5Final((u_int8_t *)&md5_buffer, &ctx); data |= (md5_buffer[0] << 7); sc->sc_iss = data; #ifdef INET6 *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; #endif /* Additional parameters are stored in the timestamp if present. */ if (sc->sc_flags & SCF_TIMESTAMP) { data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */ data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */ data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */ data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */ data |= md5_buffer[2] << 10; /* more digest bits */ data ^= md5_buffer[3]; sc->sc_ts = data; sc->sc_tsoff = data - ticks; /* after XOR */ } V_tcpstat.tcps_sc_sendcookie++; } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcpopt *to, struct tcphdr *th, struct socket *so) { INIT_VNET_INET(curvnet); MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data = 0; u_int32_t *secbits; tcp_seq ack, seq; int off, mss, wnd, flags; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and * revert sequence number advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; off = (ack >> 1) & 0x7; mss = (ack >> 4) & 0x7; flags = ack & 0x7f; /* Which of the two secrets to use. */ secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even; /* * The secret wasn't updated for the lifetime of a syncookie, * so this SYN-ACK/ACK is either too old (replay) or totally bogus. */ if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) { return (NULL); } /* Recompute the digest so we can compare it. */ MD5Init(&ctx); MD5Update(&ctx, ((u_int8_t *)secbits) + off, SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); MD5Update(&ctx, secbits, off); MD5Update(&ctx, inc, sizeof(*inc)); MD5Update(&ctx, &seq, sizeof(seq)); MD5Update(&ctx, &flags, sizeof(flags)); MD5Final((u_int8_t *)&md5_buffer, &ctx); /* Does the digest part of or ACK'ed ISS match? */ if ((ack & (~0x7f)) != (md5_buffer[0] << 7)) return (NULL); /* Does the digest part of our reflected timestamp match? */ if (to->to_flags & TOF_TS) { data = md5_buffer[3] ^ to->to_tsecr; if ((data & (~0x3ff)) != (md5_buffer[2] << 10)) return (NULL); } /* Fill in the syncache values. 
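 * (Decoding mirrors syncookie_generate(): bit 0 of the ACK'ed ISS
 * selects the odd/even secret, bits 1-3 the secret rotation, bits
 * 4-6 the tcp_sc_msstab[] index, and bits 7-31 hold the MD5 digest;
 * e.g. an mss index of 6 recovers a peer MSS of 1460.)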
*/ bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; #ifdef INET6 if (inc->inc_isipv6) { if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; } else #endif { sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos; } /* Additional parameters that were encoded in the timestamp. */ if (data) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - ticks; sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0; sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0; sc->sc_requested_s_scale = min((data >> 2) & 0xf, TCP_MAX_WINSHIFT); sc->sc_requested_r_scale = min((data >> 6) & 0xf, TCP_MAX_WINSHIFT); if (sc->sc_requested_s_scale || sc->sc_requested_r_scale) sc->sc_flags |= SCF_WINSCALE; } else sc->sc_flags |= SCF_NOOPT; wnd = sbspace(&so->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; sc->sc_rxmits = 0; sc->sc_peer_mss = tcp_sc_msstab[mss]; V_tcpstat.tcps_sc_recvcookie++; return (sc); } /* * Returns the current number of syncache entries. This number * will probably change before you get around to calling * syncache_pcblist. */ int syncache_pcbcount(void) { INIT_VNET_INET(curvnet); struct syncache_head *sch; int count, i; for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { /* No need to lock for a read. */ sch = &V_tcp_syncache.hashbase[i]; count += sch->sch_length; } return count; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { INIT_VNET_INET(curvnet); struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_isipv6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.xt_tp.t_inpcb = &xt.xt_inp; xt.xt_tp.t_state = TCPS_SYN_RECEIVED; xt.xt_socket.xso_protocol = IPPROTO_TCP; xt.xt_socket.xso_len = sizeof (struct xsocket); xt.xt_socket.so_type = SOCK_STREAM; xt.xt_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 185347) +++ head/sys/netinet/tcp_timewait.c (revision 185348) @@ -1,666 +1,666 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #include #include static uma_zone_t tcptw_zone; static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * tcbinfo lock, which must be held over queue iteration and modification. */ #ifdef VIMAGE_GLOBALS static TAILQ_HEAD(, tcptw) twq_2msl; int nolocaltimewait; #endif static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *); static int tcptw_auto_size(void) { INIT_VNET_INET(curvnet); int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ if (V_ipport_lastauto > V_ipport_firstauto) halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; else halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. 
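 * E.g. an administrator who narrows the ephemeral range to 40 ports
 * would get halfrange = 20; the imax() below still allows 32
 * TIME_WAIT entries, while imin() keeps large ranges capped at
 * maxsockets / 5.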
*/ return (imin(imax(halfrange, 32), maxsockets / 5)); } static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = tcptw_auto_size(); else new = maxtcptw; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; uma_zone_set_max(tcptw_zone, maxtcptw); } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, - &nolocaltimewait, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, nolocaltimewait, + CTLFLAG_RW, nolocaltimewait, 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); void tcp_tw_zone_change(void) { if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, tcptw_auto_size()); } void tcp_tw_init(void) { INIT_VNET_INET(curvnet); tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); } /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) INIT_VNET_INET(tp->t_vnet); #endif struct tcptw *tw; struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ INP_WLOCK_ASSERT(inp); if (V_nolocaltimewait && in_localip(inp->inp_faddr)) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } tw = uma_zalloc(tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } tw->tw_inpcb = inp; /* * Recover last window size sent. */ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); tw->tw_cred = crhold(so->so_cred); SOCK_LOCK(so); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_vflag |= INP_TIMEWAIT; tcp_tw_2msl_reset(tw, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. 
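 * Only the much smaller tcptw structure remains to answer segments
 * for the remaining 2*MSL interval (see tcp_tw_2msl_reset() below).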
*/ if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_WUNLOCK(inp); } #if 0 /* * The approximate rate of ISN increase of Microsoft TCP stacks; * the actual rate is slightly higher due to the addition of * random positive increments. * * Most other new OSes use semi-randomized ISN values, so we * do not need to worry about them. */ #define MS_ISN_BYTES_PER_SECOND 250000 /* * Determine if the ISN we will generate has advanced beyond the last * sequence number used by the previous connection. If so, indicate * that it is safe to recycle this tw socket by returning 1. */ int tcp_twrecycleable(struct tcptw *tw) { INIT_VNET_INET(curvnet); tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) return (1); else return (0); } #endif /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, struct mbuf *m, int tlen) { #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) INIT_VNET_INET(curvnet); #endif struct tcptw *tw; int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcbs whose timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment.
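 * (A segment that carried an ACK is answered with a bare RST whose
 * SEQ equals that ACK value; one without an ACK gets RST|ACK
 * acknowledging the segment's sequence number, plus one for a SYN,
 * per the usual RST acceptability rules.)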
* Don't bother to respond if destination was broadcast/multicast. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { #ifdef INET6 struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; #endif } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } INP_WUNLOCK(inp); return (0); drop: INP_WUNLOCK(inp); m_freem(m); return (0); } void tcp_twclose(struct tcptw *tw, int reuse) { INIT_VNET_INET(curvnet); struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ INP_WLOCK_ASSERT(inp); tw->tw_inpcb = NULL; tcp_tw_2msl_stop(tw); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_vflag & INP_SOCKREF) { inp->inp_vflag &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_WUNLOCK(inp); } } else { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) in6_pcbfree(inp); else #endif in_pcbfree(inp); } V_tcpstat.tcps_closed++; crfree(tw->tw_cred); tw->tw_cred = NULL; if (reuse) return; uma_zfree(tcptw_zone, tw); } int tcp_twrespond(struct tcptw *tw, int flags) { INIT_VNET_INET(curvnet); struct inpcb *inp = tw->tw_inpcb; struct tcphdr *th; struct mbuf *m; struct ip *ip = NULL; u_int hdrlen, optlen; int error; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_isipv6; #endif INP_WLOCK_ASSERT(inp); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } to.to_flags = 0; /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. 
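 * (tw->t_recent is set from ts_recent by tcp_twstart() only when
 * both TF_REQ_TSTMP and TF_RCVD_TSTMP were set and TF_NOOPT was
 * not, i.e. both ends negotiated RFC 1323 timestamps on the
 * original connection; otherwise it is zero.)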
*/ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; to.to_tsval = ticks + tw->ts_offset; to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); #ifdef INET6 if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; if (V_path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } if (flags & TH_ACK) V_tcpstat.tcps_sndacks++; else V_tcpstat.tcps_sndctrl++; V_tcpstat.tcps_sndtotal++; return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { INIT_VNET_INET(curvnet); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); } static void tcp_tw_2msl_stop(struct tcptw *tw) { INIT_VNET_INET(curvnet); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); } struct tcptw * tcp_tw_2msl_scan(int reuse) { INIT_VNET_INET(curvnet); struct tcptw *tw; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); for (;;) { tw = TAILQ_FIRST(&V_twq_2msl); if (tw == NULL || (!reuse && tw->tw_time > ticks)) break; INP_WLOCK(tw->tw_inpcb); tcp_twclose(tw, reuse); if (reuse) return (tw); } return (NULL); } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 185347) +++ head/sys/netinet/udp_usrreq.c (revision 185348) @@ -1,1289 +1,1290 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #ifdef IPSEC #include #endif #include #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #ifdef VIMAGE_GLOBALS int udp_blackhole; #endif /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ static int udp_cksum = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum, 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); -SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &udp_blackhole, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_udp, OID_AUTO, blackhole, + CTLFLAG_RW, udp_blackhole, 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); #ifdef VIMAGE_GLOBALS struct inpcbhead udb; /* from udp_var.h */ struct inpcbinfo udbinfo; struct udpstat udpstat; /* from udp_var.h */ #endif #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } void udp_init(void) { INIT_VNET_INET(curvnet); V_udp_blackhole = 0; INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); LIST_INIT(&V_udb); V_udbinfo.ipi_listhead = &V_udb; V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, &V_udbinfo.ipi_hashmask); V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, &V_udbinfo.ipi_porthashmask); V_udbinfo.ipi_zone = uma_zcreate("udpcb", 
sizeof(struct inpcb), NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. */ static void udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif INP_RLOCK_ASSERT(inp); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { INIT_VNET_IPSEC(curvnet); m_freem(n); V_ipsec4stat.in_polvio++; return; } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { INIT_VNET_INET(so->so_vnet); SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); V_udpstat.udps_fullsock++; } else sorwakeup_locked(so); } void udp_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); int iphlen = off; struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; int len; struct ip save_ip; struct sockaddr_in udp_in; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif ifp = m->m_pkthdr.rcvif; V_udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { V_udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. 
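 * E.g. a 100-byte uh_ulen inside 120 bytes of IP payload means 20
 * trailing bytes of padding, and m_adj(m, -20) trims them from the
 * tail; a uh_ulen larger than the IP payload (or smaller than a
 * UDP header) is dropped and counted in udps_badlen.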
*/ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { V_udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { V_udpstat.udps_badsum++; m_freem(m); return; } } else V_udpstat.udps_nosum++; #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL) { struct sockaddr_in *next_hop; /* * Do the hack. */ next_hop = (struct sockaddr_in *)(fwd_tag + 1); ip->ip_dst = next_hop->sin_addr; uh->uh_dport = ntohs(next_hop->sin_port); /* * Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } #endif INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; last = NULL; LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; /* * XXX: Do not check source port of incoming datagram * unless inp_connect() has been called to bind the * fport part of the 4-tuple; the source could be * trying to talk to us with an ephemeral port. */ if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL) { struct sockaddr_in sin; struct in_msource *ims; int blocked, mode; size_t idx; bzero(&sin, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = ip->ip_dst; blocked = 0; idx = imo_match_group(imo, ifp, (struct sockaddr *)&sin); if (idx == -1) { /* * No group membership for this socket. * Do not bump udps_noportbcast, as * this will happen further down. */ blocked++; } else { /* * Check for a multicast source filter * entry on this socket for this group. * MCAST_EXCLUDE is the default * behaviour. It means default accept; * entries, if present, denote sources * to be excluded from delivery. 
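 * E.g. with MCAST_EXCLUDE, a filter entry matching the sender
 * means "drop"; with MCAST_INCLUDE, the absence of an entry for
 * the sender means it was never requested, so it is dropped as
 * well.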
*/ ims = imo_match_source(imo, idx, (struct sockaddr *)&udp_in); mode = imo->imo_mfilters[idx].imf_fmode; if ((ims != NULL && mode == MCAST_EXCLUDE) || (ims == NULL && mode == MCAST_INCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: blocked by" " source filter\n", __func__); } #endif V_udpstat.udps_filtermcast++; blocked++; } } if (blocked != 0) { INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr), &udp_in); INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ V_udpstat.udps_noportbcast++; goto badheadlocked; } udp_append(last, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_RUNLOCK(last); INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, ifp); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } V_udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { V_udpstat.udps_noportbcast++; goto badheadlocked; } if (V_udp_blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Check the minimum TTL for socket. */ INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); goto badunlocked; } udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_RUNLOCK(inp); return; badheadlocked: if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); badunlocked: m_freem(m); } /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { INIT_VNET_INET(curvnet); struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. 
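 * (PRC_HOSTDEAD therefore clears ip below, steering us into the
 * in_pcbnotifyall() branch, which walks every UDP PCB connected
 * to the dead host's address.)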
*/ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&V_udbinfo); inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_RLOCK(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); } else in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) inp_list[i++] = inp; INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. 
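     *
     * A hedged sketch of the consumer side of this protocol (the
     * approach netstat-like tools take; buffer handling simplified,
     * errors ignored):
     *
     *    size_t len = 0;
     *    sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0);
     *    char *buf = malloc(len);
     *    sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0);
     *    // The buffer begins and ends with a struct xinpgen; if the
     *    // two xig_gen values differ, the pcb list changed while it
     *    // was being copied out and the caller should start over.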
*/ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&V_udbinfo); inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_udbinfo); error = ENOENT; } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { INIT_VNET_INET(inp->inp_vnet); struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct sockaddr_in *sin, src; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. 
This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. */ sin = (struct sockaddr_in *)addr; INP_RLOCK(inp); if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); unlock_udbinfo = 2; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) { INP_RUNLOCK(inp); INP_INFO_RLOCK(&V_udbinfo); INP_RLOCK(inp); } unlock_udbinfo = 1; } else unlock_udbinfo = 0; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_INFO_LOCK_ASSERT(&V_udbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_INFO_LOCK_ASSERT(&V_udbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_INFO_WLOCK_ASSERT(&V_udbinfo); INP_WLOCK_ASSERT(inp); /* * Remember addr if jailed, to prevent * rebinding. 
                 */
                if (jailed(td->td_ucred))
                    inp->inp_laddr = laddr;
                inp->inp_lport = lport;
                if (in_pcbinshash(inp) != 0) {
                    inp->inp_lport = 0;
                    error = EAGAIN;
                    goto release;
                }
                inp->inp_flags |= INP_ANONPORT;
            }
        } else {
            faddr = sin->sin_addr;
            fport = sin->sin_port;
        }
    } else {
        INP_LOCK_ASSERT(inp);
        faddr = inp->inp_faddr;
        fport = inp->inp_fport;
        if (faddr.s_addr == INADDR_ANY) {
            error = ENOTCONN;
            goto release;
        }
    }

    /*
     * Calculate data length and get a mbuf for UDP, IP, and possible
     * link-layer headers.  Immediately slide the data pointer forward
     * again since we won't use that space at this layer.
     */
    M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
    if (m == NULL) {
        error = ENOBUFS;
        goto release;
    }
    m->m_data += max_linkhdr;
    m->m_len -= max_linkhdr;
    m->m_pkthdr.len -= max_linkhdr;

    /*
     * Fill in mbuf with extended UDP header and addresses and length put
     * into network format.
     */
    ui = mtod(m, struct udpiphdr *);
    bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
    ui->ui_pr = IPPROTO_UDP;
    ui->ui_src = laddr;
    ui->ui_dst = faddr;
    ui->ui_sport = lport;
    ui->ui_dport = fport;
    ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));

    /*
     * Set the Don't Fragment bit in the IP header.
     */
    if (inp->inp_flags & INP_DONTFRAG) {
        struct ip *ip;

        ip = (struct ip *)&ui->ui_i;
        ip->ip_off |= IP_DF;
    }

    ipflags = 0;
    if (inp->inp_socket->so_options & SO_DONTROUTE)
        ipflags |= IP_ROUTETOIF;
    if (inp->inp_socket->so_options & SO_BROADCAST)
        ipflags |= IP_ALLOWBROADCAST;
    if (inp->inp_flags & INP_ONESBCAST)
        ipflags |= IP_SENDONES;

#ifdef MAC
    mac_inpcb_create_mbuf(inp, m);
#endif

    /*
     * Set up checksum and output datagram.
     */
    if (udp_cksum) {
        if (inp->inp_flags & INP_ONESBCAST)
            faddr.s_addr = INADDR_BROADCAST;
        ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
            htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
        m->m_pkthdr.csum_flags = CSUM_UDP;
        m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
    } else
        ui->ui_sum = 0;
    ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
    ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
    ((struct ip *)ui)->ip_tos = inp->inp_ip_tos;	/* XXX */
    V_udpstat.udps_opackets++;

    if (unlock_udbinfo == 2)
        INP_INFO_WUNLOCK(&V_udbinfo);
    else if (unlock_udbinfo == 1)
        INP_INFO_RUNLOCK(&V_udbinfo);
    error = ip_output(m, inp->inp_options, NULL, ipflags,
        inp->inp_moptions, inp);
    if (unlock_udbinfo == 2)
        INP_WUNLOCK(inp);
    else
        INP_RUNLOCK(inp);
    return (error);

release:
    if (unlock_udbinfo == 2) {
        INP_WUNLOCK(inp);
        INP_INFO_WUNLOCK(&V_udbinfo);
    } else if (unlock_udbinfo == 1) {
        INP_RUNLOCK(inp);
        INP_INFO_RUNLOCK(&V_udbinfo);
    } else
        INP_RUNLOCK(inp);
    m_freem(m);
    return (error);
}

static void
udp_abort(struct socket *so)
{
    INIT_VNET_INET(so->so_vnet);
    struct inpcb *inp;

    inp = sotoinpcb(so);
    KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
    INP_INFO_WLOCK(&V_udbinfo);
    INP_WLOCK(inp);
    if (inp->inp_faddr.s_addr != INADDR_ANY) {
        in_pcbdisconnect(inp);
        inp->inp_laddr.s_addr = INADDR_ANY;
        soisdisconnected(so);
    }
    INP_WUNLOCK(inp);
    INP_INFO_WUNLOCK(&V_udbinfo);
}

static int
udp_attach(struct socket *so, int proto, struct thread *td)
{
    INIT_VNET_INET(so->so_vnet);
    struct inpcb *inp;
    int error;

    inp = sotoinpcb(so);
    KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
    error = soreserve(so, udp_sendspace, udp_recvspace);
    if (error)
        return (error);
    INP_INFO_WLOCK(&V_udbinfo);
    error = in_pcballoc(so, &V_udbinfo);
    if (error) {
        INP_INFO_WUNLOCK(&V_udbinfo);
        return (error);
    }

    inp = (struct inpcb *)so->so_pcb;
    INP_INFO_WUNLOCK(&V_udbinfo);
    inp->inp_vflag |= INP_IPV4;
    inp->inp_ip_ttl = V_ip_defttl;
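    /*
     * This attach path runs when a userland process creates the socket;
     * a hedged one-line illustration of the trigger:
     *
     *    int s = socket(AF_INET, SOCK_DGRAM, 0);
     *
     * By this point soreserve() above has sized the send and receive
     * buffers from udp_sendspace and udp_recvspace.
     */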
INP_WUNLOCK(inp); return (0); } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp_close(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; soisdisconnected(so); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; struct sockaddr_in *sin; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (EISCONN); } sin = (struct sockaddr_in *)nam; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td->td_ucred); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_disconnect(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (ENOTCONN); } in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; Index: head/sys/netinet/vinet.h =================================================================== --- head/sys/netinet/vinet.h (revision 185347) +++ head/sys/netinet/vinet.h (revision 185348) @@ -1,333 +1,336 @@ /*- * Copyright (c) 2006-2008 University of Zagreb * Copyright (c) 2006-2008 FreeBSD 
Foundation * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET_VINET_H_ #define _NETINET_VINET_H_ #ifdef VIMAGE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vnet_inet { struct in_ifaddrhashhead *_in_ifaddrhashtbl; struct in_ifaddrhead _in_ifaddrhead; u_long _in_ifaddrhmask; struct in_multihead _in_multihead; int _arpt_keep; int _arp_maxtries; int _useloopback; int _arp_proxyall; int _subnetsarelocal; int _sameprefixcarponly; int _ipforwarding; int _ipstealth; int _ipfastforward_active; int _ipsendredirects; int _ip_defttl; int _ip_keepfaith; int _ip_sendsourcequench; int _ip_do_randomid; int _ip_checkinterface; u_short _ip_id; uma_zone_t _ipq_zone; int _nipq; /* Total # of reass queues */ int _maxnipq; /* Admin. limit on # reass queues. 
*/ int _maxfragsperpacket; TAILQ_HEAD(ipqhead, ipq) _ipq[IPREASS_NHASH]; struct inpcbhead _tcb; /* head of queue of active tcpcb's */ struct inpcbinfo _tcbinfo; struct tcpstat _tcpstat; /* tcp statistics */ struct tcp_hostcache _tcp_hostcache; struct callout _tcp_hc_callout; + struct tcp_syncache _tcp_syncache; + int _tcp_syncookies; + int _tcp_syncookiesonly; + int _tcp_sc_rst_sock_fail; + struct inpcbhead _divcb; struct inpcbinfo _divcbinfo; TAILQ_HEAD(, tcptw) _twq_2msl; - int _tcp_sc_rst_sock_fail; int _tcp_mssdflt; int _tcp_v6mssdflt; int _tcp_minmss; int _tcp_do_rfc1323; int _icmp_may_rst; int _tcp_isn_reseed_interval; int _tcp_inflight_enable; int _tcp_inflight_rttthresh; int _tcp_inflight_min; int _tcp_inflight_max; int _tcp_inflight_stab; int _nolocaltimewait; int _path_mtu_discovery; int _ss_fltsz; int _ss_fltsz_local; int _tcp_do_newreno; int _tcp_do_tso; int _tcp_do_autosndbuf; int _tcp_autosndbuf_inc; int _tcp_autosndbuf_max; int _tcp_do_sack; int _tcp_sack_maxholes; int _tcp_sack_globalmaxholes; int _tcp_sack_globalholes; int _blackhole; int _tcp_delack_enabled; int _drop_synfin; int _tcp_do_rfc3042; int _tcp_do_rfc3390; int _tcp_do_ecn; int _tcp_ecn_maxretries; int _tcp_insecure_rst; int _tcp_do_autorcvbuf; int _tcp_autorcvbuf_inc; int _tcp_autorcvbuf_max; int _tcp_reass_maxseg; int _tcp_reass_qsize; int _tcp_reass_maxqlen; int _tcp_reass_overflows; u_char _isn_secret[32]; int _isn_last_reseed; u_int32_t _isn_offset; u_int32_t _isn_offset_old; MD5_CTX _isn_ctx; struct inpcbhead _udb; struct inpcbinfo _udbinfo; struct udpstat _udpstat; int _udp_blackhole; struct inpcbhead _ripcb; struct inpcbinfo _ripcbinfo; struct socket *_ip_mrouter; struct socket *_ip_rsvpd; int _ip_rsvp_on; int _rsvp_on; struct icmpstat _icmpstat; struct ipstat _ipstat; struct igmpstat _igmpstat; SLIST_HEAD(, router_info) _router_info_head; int _rtq_timeout; int _rtq_reallyold; int _rtq_minreallyold; int _rtq_toomany; struct callout _rtq_timer; int _ipport_lowfirstauto; int _ipport_lowlastauto; int _ipport_firstauto; int _ipport_lastauto; int _ipport_hifirstauto; int _ipport_hilastauto; int _ipport_reservedhigh; int _ipport_reservedlow; int _ipport_randomized; int _ipport_randomcps; int _ipport_randomtime; int _ipport_stoprandom; int _ipport_tcpallocs; int _ipport_tcplastcount; int _icmpmaskrepl; u_int _icmpmaskfake; int _drop_redirect; int _log_redirect; int _icmplim; int _icmplim_output; char _reply_src[IFNAMSIZ]; int _icmp_rfi; int _icmp_quotelen; int _icmpbmcastecho; }; #endif /* * Symbol translation macros */ #define INIT_VNET_INET(vnet) \ INIT_FROM_VNET(vnet, VNET_MOD_INET, struct vnet_inet, vnet_inet) #define VNET_INET(sym) VSYM(vnet_inet, sym) #define V_arp_maxtries VNET_INET(arp_maxtries) #define V_arp_proxyall VNET_INET(arp_proxyall) #define V_arpt_keep VNET_INET(arpt_keep) -#define V_arpt_prune VNET_INET(arpt_prune) #define V_blackhole VNET_INET(blackhole) #define V_divcb VNET_INET(divcb) #define V_divcbinfo VNET_INET(divcbinfo) #define V_drop_redirect VNET_INET(drop_redirect) #define V_drop_synfin VNET_INET(drop_synfin) #define V_icmp_may_rst VNET_INET(icmp_may_rst) #define V_icmp_quotelen VNET_INET(icmp_quotelen) #define V_icmp_rfi VNET_INET(icmp_rfi) #define V_icmpbmcastecho VNET_INET(icmpbmcastecho) #define V_icmplim VNET_INET(icmplim) #define V_icmplim_output VNET_INET(icmplim_output) #define V_icmpmaskfake VNET_INET(icmpmaskfake) #define V_icmpmaskrepl VNET_INET(icmpmaskrepl) #define V_icmpstat VNET_INET(icmpstat) #define V_igmpstat VNET_INET(igmpstat) #define V_in_ifaddrhashtbl 
VNET_INET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET_INET(in_ifaddrhead) #define V_in_ifaddrhmask VNET_INET(in_ifaddrhmask) #define V_in_multihead VNET_INET(in_multihead) #define V_ip_checkinterface VNET_INET(ip_checkinterface) #define V_ip_defttl VNET_INET(ip_defttl) #define V_ip_do_randomid VNET_INET(ip_do_randomid) #define V_ip_id VNET_INET(ip_id) #define V_ip_keepfaith VNET_INET(ip_keepfaith) #define V_ip_mrouter VNET_INET(ip_mrouter) #define V_ip_rsvp_on VNET_INET(ip_rsvp_on) #define V_ip_rsvpd VNET_INET(ip_rsvpd) #define V_ip_sendsourcequench VNET_INET(ip_sendsourcequench) #define V_ipfastforward_active VNET_INET(ipfastforward_active) #define V_ipforwarding VNET_INET(ipforwarding) #define V_ipport_firstauto VNET_INET(ipport_firstauto) #define V_ipport_hifirstauto VNET_INET(ipport_hifirstauto) #define V_ipport_hilastauto VNET_INET(ipport_hilastauto) #define V_ipport_lastauto VNET_INET(ipport_lastauto) #define V_ipport_lowfirstauto VNET_INET(ipport_lowfirstauto) #define V_ipport_lowlastauto VNET_INET(ipport_lowlastauto) #define V_ipport_randomcps VNET_INET(ipport_randomcps) #define V_ipport_randomized VNET_INET(ipport_randomized) #define V_ipport_randomtime VNET_INET(ipport_randomtime) #define V_ipport_reservedhigh VNET_INET(ipport_reservedhigh) #define V_ipport_reservedlow VNET_INET(ipport_reservedlow) #define V_ipport_stoprandom VNET_INET(ipport_stoprandom) #define V_ipport_tcpallocs VNET_INET(ipport_tcpallocs) #define V_ipport_tcplastcount VNET_INET(ipport_tcplastcount) #define V_ipq VNET_INET(ipq) #define V_ipq_zone VNET_INET(ipq_zone) #define V_ipsendredirects VNET_INET(ipsendredirects) #define V_ipstat VNET_INET(ipstat) #define V_ipstealth VNET_INET(ipstealth) #define V_isn_ctx VNET_INET(isn_ctx) #define V_isn_last_reseed VNET_INET(isn_last_reseed) #define V_isn_offset VNET_INET(isn_offset) #define V_isn_offset_old VNET_INET(isn_offset_old) #define V_isn_secret VNET_INET(isn_secret) #define V_llinfo_arp VNET_INET(llinfo_arp) #define V_log_redirect VNET_INET(log_redirect) #define V_maxfragsperpacket VNET_INET(maxfragsperpacket) #define V_maxnipq VNET_INET(maxnipq) #define V_nipq VNET_INET(nipq) #define V_nolocaltimewait VNET_INET(nolocaltimewait) #define V_path_mtu_discovery VNET_INET(path_mtu_discovery) #define V_reply_src VNET_INET(reply_src) #define V_ripcb VNET_INET(ripcb) #define V_ripcbinfo VNET_INET(ripcbinfo) #define V_router_info_head VNET_INET(router_info_head) #define V_rsvp_on VNET_INET(rsvp_on) #define V_rtq_minreallyold VNET_INET(rtq_minreallyold) #define V_rtq_reallyold VNET_INET(rtq_reallyold) #define V_rtq_timeout VNET_INET(rtq_timeout) #define V_rtq_timer VNET_INET(rtq_timer) #define V_rtq_toomany VNET_INET(rtq_toomany) #define V_sameprefixcarponly VNET_INET(sameprefixcarponly) #define V_ss_fltsz VNET_INET(ss_fltsz) #define V_ss_fltsz_local VNET_INET(ss_fltsz_local) #define V_subnetsarelocal VNET_INET(subnetsarelocal) #define V_tcb VNET_INET(tcb) #define V_tcbinfo VNET_INET(tcbinfo) #define V_tcp_autorcvbuf_inc VNET_INET(tcp_autorcvbuf_inc) #define V_tcp_autorcvbuf_max VNET_INET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET_INET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET_INET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET_INET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET_INET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET_INET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET_INET(tcp_do_ecn) #define V_tcp_do_newreno VNET_INET(tcp_do_newreno) #define V_tcp_do_rfc1323 VNET_INET(tcp_do_rfc1323) #define V_tcp_do_rfc3042 
VNET_INET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET_INET(tcp_do_rfc3390) #define V_tcp_do_sack VNET_INET(tcp_do_sack) #define V_tcp_do_tso VNET_INET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET_INET(tcp_ecn_maxretries) #define V_tcp_hc_callout VNET_INET(tcp_hc_callout) #define V_tcp_hostcache VNET_INET(tcp_hostcache) #define V_tcp_inflight_enable VNET_INET(tcp_inflight_enable) #define V_tcp_inflight_max VNET_INET(tcp_inflight_max) #define V_tcp_inflight_min VNET_INET(tcp_inflight_min) #define V_tcp_inflight_rttthresh VNET_INET(tcp_inflight_rttthresh) #define V_tcp_inflight_stab VNET_INET(tcp_inflight_stab) #define V_tcp_insecure_rst VNET_INET(tcp_insecure_rst) #define V_tcp_isn_reseed_interval VNET_INET(tcp_isn_reseed_interval) #define V_tcp_minmss VNET_INET(tcp_minmss) #define V_tcp_mssdflt VNET_INET(tcp_mssdflt) #define V_tcp_reass_maxqlen VNET_INET(tcp_reass_maxqlen) #define V_tcp_reass_maxseg VNET_INET(tcp_reass_maxseg) #define V_tcp_reass_overflows VNET_INET(tcp_reass_overflows) #define V_tcp_reass_qsize VNET_INET(tcp_reass_qsize) #define V_tcp_sack_globalholes VNET_INET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET_INET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET_INET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET_INET(tcp_sc_rst_sock_fail) #define V_tcp_syncache VNET_INET(tcp_syncache) #define V_tcp_syncookies VNET_INET(tcp_syncookies) #define V_tcp_syncookiesonly VNET_INET(tcp_syncookiesonly) #define V_tcp_v6mssdflt VNET_INET(tcp_v6mssdflt) #define V_tcpstat VNET_INET(tcpstat) #define V_twq_2msl VNET_INET(twq_2msl) #define V_udb VNET_INET(udb) #define V_udbinfo VNET_INET(udbinfo) #define V_udp_blackhole VNET_INET(udp_blackhole) #define V_udpstat VNET_INET(udpstat) #define V_useloopback VNET_INET(useloopback) #endif /* !_NETINET_VINET_H_ */ Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 185347) +++ head/sys/netinet6/icmp6.c (revision 185348) @@ -1,2822 +1,2823 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif extern struct domain inet6domain; + +#ifdef VIMAGE_GLOBALS extern struct inpcbinfo ripcbinfo; extern struct inpcbhead ripcb; extern int icmp6errppslim; extern int icmp6_nodeinfo; -#ifdef VIMAGE_GLOBALS struct icmp6stat icmp6stat; static int icmp6errpps_count; static struct timeval icmp6errppslim_last; #endif static void icmp6_errcount(struct icmp6errstat *, int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag __P((struct in6_addr *, struct in6_addr *, struct in6_addr *)); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *)); static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int)); static int icmp6_notify_error(struct mbuf **, int, int, int); void icmp6_init(void) { INIT_VNET_INET6(curvnet); V_icmp6errpps_count = 0; mld6_init(); } static void icmp6_errcount(struct icmp6errstat *stat, int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: stat->icp6errs_dst_unreach_noroute++; return; case ICMP6_DST_UNREACH_ADMIN: stat->icp6errs_dst_unreach_admin++; return; case ICMP6_DST_UNREACH_BEYONDSCOPE: stat->icp6errs_dst_unreach_beyondscope++; return; case ICMP6_DST_UNREACH_ADDR: stat->icp6errs_dst_unreach_addr++; return; case ICMP6_DST_UNREACH_NOPORT: stat->icp6errs_dst_unreach_noport++; return; } break; case ICMP6_PACKET_TOO_BIG: stat->icp6errs_packet_too_big++; return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: stat->icp6errs_time_exceed_transit++; return; case ICMP6_TIME_EXCEED_REASSEMBLY: stat->icp6errs_time_exceed_reassembly++; return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: stat->icp6errs_paramprob_header++; return; case ICMP6_PARAMPROB_NEXTHEADER: stat->icp6errs_paramprob_nextheader++; return; case ICMP6_PARAMPROB_OPTION: stat->icp6errs_paramprob_option++; return; } break; case ND_REDIRECT: stat->icp6errs_redirect++; return; } stat->icp6errs_unknown++; } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. 
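 *
 * Output is rate limited via icmp6_ratelimit(), bounded by the
 * net.inet6.icmp6.errppslimit sysctl (icmp6errppslim above).  A hedged
 * sketch of the kind of check that limiter performs, using
 * ppsratecheck(9):
 *
 *	if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count,
 *	    V_icmp6errppslim))
 *		return (1);	// over the pps limit: caller suppresses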
 */
void
icmp6_error(struct mbuf *m, int type, int code, int param)
{
    INIT_VNET_INET6(curvnet);
    struct ip6_hdr *oip6, *nip6;
    struct icmp6_hdr *icmp6;
    u_int preplen;
    int off;
    int nxt;

    V_icmp6stat.icp6s_error++;

    /* count per-type-code statistics */
    icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, type, code);

#ifdef M_DECRYPTED	/*not openbsd*/
    if (m->m_flags & M_DECRYPTED) {
        V_icmp6stat.icp6s_canterror++;
        goto freeit;
    }
#endif

#ifndef PULLDOWN_TEST
    IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
#else
    if (m->m_len < sizeof(struct ip6_hdr)) {
        m = m_pullup(m, sizeof(struct ip6_hdr));
        if (m == NULL)
            return;
    }
#endif
    oip6 = mtod(m, struct ip6_hdr *);

    /*
     * If the destination address of the erroneous packet is a multicast
     * address, or the packet was sent using link-layer multicast,
     * we should basically suppress sending an error (RFC 2463, Section
     * 2.4).
     * We have two exceptions (the item e.2 in that section):
     * - the Packet Too Big message can be sent for path MTU discovery.
     * - the Parameter Problem Message that can be allowed an icmp6 error
     *   in the option type field.  This check has been done in
     *   ip6_unknown_opt(), so we can just check the type and code.
     */
    if ((m->m_flags & (M_BCAST|M_MCAST) ||
        IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
        (type != ICMP6_PACKET_TOO_BIG &&
        (type != ICMP6_PARAM_PROB ||
        code != ICMP6_PARAMPROB_OPTION)))
        goto freeit;

    /*
     * RFC 2463, 2.4 (e.5): source address check.
     * XXX: the case of anycast source?
     */
    if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
        IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
        goto freeit;

    /*
     * If we are about to send ICMPv6 against ICMPv6 error/redirect,
     * don't do it.
     */
    nxt = -1;
    off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
    if (off >= 0 && nxt == IPPROTO_ICMPV6) {
        struct icmp6_hdr *icp;

#ifndef PULLDOWN_TEST
        IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), );
        icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
#else
        IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off,
            sizeof(*icp));
        if (icp == NULL) {
            V_icmp6stat.icp6s_tooshort++;
            return;
        }
#endif
        if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
            icp->icmp6_type == ND_REDIRECT) {
            /*
             * ICMPv6 error
             * Special case: for redirect (which is
             * informational) we must not send icmp6 error.
             */
            V_icmp6stat.icp6s_canterror++;
            goto freeit;
        } else {
            /* ICMPv6 informational - send the error */
        }
    } else {
        /* non-ICMPv6 - send the error */
    }

    oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */

    /* Finally, do rate limitation check. */
    if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
        V_icmp6stat.icp6s_toofreq++;
        goto freeit;
    }

    /*
     * OK, ICMP6 can be generated.
     */
    if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
        m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);

    preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
    M_PREPEND(m, preplen, M_DONTWAIT);
    if (m && m->m_len < preplen)
        m = m_pullup(m, preplen);
    if (m == NULL) {
        nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
        return;
    }

    nip6 = mtod(m, struct ip6_hdr *);
    nip6->ip6_src = oip6->ip6_src;
    nip6->ip6_dst = oip6->ip6_dst;

    in6_clearscope(&oip6->ip6_src);
    in6_clearscope(&oip6->ip6_dst);

    icmp6 = (struct icmp6_hdr *)(nip6 + 1);
    icmp6->icmp6_type = type;
    icmp6->icmp6_code = code;
    icmp6->icmp6_pptr = htonl((u_int32_t)param);

    /*
     * icmp6_reflect() is designed to be in the input path.
     * icmp6_error() can be called from both input and output path,
     * and if we are in output path rcvif could contain bogus value.
     * clear m->m_pkthdr.rcvif for safety, we should have enough scope
     * information in ip header (nip6).
*/ m->m_pkthdr.rcvif = NULL; V_icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { V_icmp6stat.icp6s_tooshort++; goto freeit; } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { V_icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); V_icmp6stat.icp6s_checksum++; goto freeit; } if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } V_icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. 
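         *
         * Userland can observe the update; a hedged RFC 3542 sketch
         * for a connected IPv6 UDP socket (option and struct names
         * per the spec, details elided):
         *
         *    int on = 1;
         *    setsockopt(s, IPPROTO_IPV6, IPV6_RECVPATHMTU, &on,
         *        sizeof(on));
         *    // recvmsg() later returns an IPV6_PATHMTU ancillary
         *    // object: a struct ip6_mtuinfo holding the new MTU.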
*/ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; const int maxlen = sizeof(*nip6) + sizeof(*nicmp6); int n0len; MGETHDR(n, M_DONTWAIT, n0->m_type); n0len = n0->m_pkthdr.len; /* save for use below */ if (n) M_MOVE_PKTHDR(n, n0); if (n && maxlen >= MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ m_freem(n0); break; } /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { nip6 = mtod(n, struct ip6_hdr *); IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { V_icmp6stat.icp6s_reflect++; V_icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); m = NULL; goto freeit; } mld6_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? 
*/ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; #define hostnamelen strlen(V_hostname) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxlen, maxhlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4; if (maxlen >= MCLBYTES) { /* Give up remote */ break; } MGETHDR(n, M_DONTWAIT, m->m_type); if (n && maxlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n && !m_dup_pkthdr(n, m, M_DONTWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; } if (n == NULL) { /* Give up remote */ break; } n->m_pkthdr.rcvif = NULL; n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - maxlen; mtx_lock(&hostname_mtx); if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); bcopy(V_hostname, p + 4, maxhlen); /* meaningless TTL */ mtx_unlock(&hostname_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } #undef hostnamelen if (n) { V_icmp6stat.icp6s_reflect++; V_icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. 
*/ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code)) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: V_icmp6stat.icp6s_badcode++; break; badlen: V_icmp6stat.icp6s_badlen++; break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { V_icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. 
* Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. 
*/ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { INIT_VNET_INET6(curvnet); struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_flags = 1; /* IPv6 */ inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); V_icmp6stat.icp6s_pmtuchg++; } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. 
* * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ #define hostnamelen strlen(V_hostname) static struct mbuf * ni6_input(struct mbuf *m, int off) { INIT_VNET_INET6(curvnet); INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ if ((ia6 = ip6_getdstifaddr(m)) == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? 
* * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ mtx_lock(&hostname_mtx); n = ni6_nametodns(V_hostname, hostnamelen, 0); mtx_unlock(&hostname_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. */ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } M_MOVE_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... 
*/ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? */ mtx_lock(&hostname_mtx); n->m_next = ni6_nametodns(V_hostname, hostnamelen, oldfqdn); mtx_unlock(&hostname_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } #undef hostnamelen /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". 
 */
			if (i <= 0 || i >= 64)
				goto fail;
			*cp++ = i;
			bcopy(p, cp, i);
			cp += i;
			p = q;
			if (p < name + namelen && *p == '.')
				p++;
		}
		/* termination */
		if (cp + nterm >= ep)
			goto fail;
		while (nterm-- > 0)
			*cp++ = '\0';
		m->m_len = cp - mtod(m, char *);
		return m;
	}

	panic("should not reach here");
	/* NOTREACHED */

 fail:
	if (m)
		m_freem(m);
	return NULL;
}

/*
 * check if two DNS-encoded strings match.  takes care of truncated
 * form (with \0\0 at the end).  no compression support.
 * XXX upper/lowercase match (see RFC2065)
 */
static int
ni6_dnsmatch(const char *a, int alen, const char *b, int blen)
{
	const char *a0, *b0;
	int l;

	/* simplest case - need validation? */
	if (alen == blen && bcmp(a, b, alen) == 0)
		return 1;

	a0 = a;
	b0 = b;

	/* termination is mandatory */
	if (alen < 2 || blen < 2)
		return 0;
	if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0')
		return 0;
	alen--;
	blen--;

	while (a - a0 < alen && b - b0 < blen) {
		if (a - a0 + 1 > alen || b - b0 + 1 > blen)
			return 0;

		if ((signed char)a[0] < 0 || (signed char)b[0] < 0)
			return 0;
		/* we don't support compression yet */
		if (a[0] >= 64 || b[0] >= 64)
			return 0;

		/* truncated case */
		if (a[0] == 0 && a - a0 == alen - 1)
			return 1;
		if (b[0] == 0 && b - b0 == blen - 1)
			return 1;
		if (a[0] == 0 || b[0] == 0)
			return 0;

		if (a[0] != b[0])
			return 0;
		l = a[0];
		if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen)
			return 0;
		if (bcmp(a + 1, b + 1, l) != 0)
			return 0;

		a += 1 + l;
		b += 1 + l;
	}

	if (a - a0 == alen && b - b0 == blen)
		return 1;
	else
		return 0;
}

/*
 * calculate the number of addresses to be returned in the node info reply.
 */
static int
ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp,
    struct in6_addr *subj)
{
	INIT_VNET_NET(curvnet);
	INIT_VNET_INET6(curvnet);
	struct ifnet *ifp;
	struct in6_ifaddr *ifa6;
	struct ifaddr *ifa;
	int addrs = 0, addrsofif, iffound = 0;
	int niflags = ni6->ni_flags;

	if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) {
		switch (ni6->ni_code) {
		case ICMP6_NI_SUBJ_IPV6:
			if (subj == NULL) /* must be impossible... */
				return (0);
			break;
		default:
			/*
			 * XXX: we only support IPv6 subject address for
			 * this Qtype.
			 */
			return (0);
		}
	}

	IFNET_RLOCK();
	for (ifp = TAILQ_FIRST(&V_ifnet); ifp;
	    ifp = TAILQ_NEXT(ifp, if_list)) {
		addrsofif = 0;
		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
			if (ifa->ifa_addr->sa_family != AF_INET6)
				continue;
			ifa6 = (struct in6_ifaddr *)ifa;

			if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 &&
			    IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr))
				iffound = 1;

			/*
			 * IPv4-mapped addresses can only be returned by a
			 * Node Information proxy, since they represent
			 * addresses of IPv4-only nodes, which perforce do
			 * not implement this protocol.
			 * [icmp-name-lookups-07, Section 5.4]
			 * So we don't support NI_NODEADDR_FLAG_COMPAT in
			 * this function at this moment.
			 */

			/* What do we have to do about ::1? */
			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
			case IPV6_ADDR_SCOPE_LINKLOCAL:
				if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
					continue;
				break;
			case IPV6_ADDR_SCOPE_SITELOCAL:
				if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
					continue;
				break;
			case IPV6_ADDR_SCOPE_GLOBAL:
				if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
					continue;
				break;
			default:
				continue;
			}

			/*
			 * check if anycast is okay.
			 * XXX: just experimental.  not in the spec.
			 */
			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
				continue; /* we need only unicast addresses */
			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
				continue;
			}
			addrsofif++; /* count the address */
		}
		if (iffound) {
			*ifpp = ifp;
			IFNET_RUNLOCK();
			return (addrsofif);
		}

		addrs += addrsofif;
	}
	IFNET_RUNLOCK();

	return (addrs);
}

static int
ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6,
    struct ifnet *ifp0, int resid)
{
	INIT_VNET_NET(curvnet);
	INIT_VNET_INET6(curvnet);
	struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet);
	struct in6_ifaddr *ifa6;
	struct ifaddr *ifa;
	struct ifnet *ifp_dep = NULL;
	int copied = 0, allow_deprecated = 0;
	u_char *cp = (u_char *)(nni6 + 1);
	int niflags = ni6->ni_flags;
	u_int32_t ltime;

	if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL))
		return (0);	/* needless to copy */

	IFNET_RLOCK();
  again:

	for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) {
		for (ifa = ifp->if_addrlist.tqh_first; ifa;
		    ifa = ifa->ifa_list.tqe_next) {
			if (ifa->ifa_addr->sa_family != AF_INET6)
				continue;
			ifa6 = (struct in6_ifaddr *)ifa;

			if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 &&
			    allow_deprecated == 0) {
				/*
				 * preferred addresses should be put before
				 * deprecated addresses.
				 */
				/* record the interface for later search */
				if (ifp_dep == NULL)
					ifp_dep = ifp;

				continue;
			} else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 &&
			    allow_deprecated != 0)
				continue; /* we now collect deprecated addrs */

			/* What do we have to do about ::1? */
			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
			case IPV6_ADDR_SCOPE_LINKLOCAL:
				if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
					continue;
				break;
			case IPV6_ADDR_SCOPE_SITELOCAL:
				if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
					continue;
				break;
			case IPV6_ADDR_SCOPE_GLOBAL:
				if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
					continue;
				break;
			default:
				continue;
			}

			/*
			 * check if anycast is okay.
			 * XXX: just experimental.  not in the spec.
			 */
			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
				continue;
			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
				continue;
			}

			/* now we can copy the address */
			if (resid < sizeof(struct in6_addr) +
			    sizeof(u_int32_t)) {
				/*
				 * We give up much more copy.
				 * Set the truncate flag and return.
				 */
				nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE;
				IFNET_RUNLOCK();
				return (copied);
			}

			/*
			 * Set the TTL of the address.
			 * The TTL value should be one of the following
			 * according to the specification:
			 *
			 * 1. The remaining lifetime of a DHCP lease on the
			 *    address, or
			 * 2. The remaining Valid Lifetime of a prefix from
			 *    which the address was derived through Stateless
			 *    Autoconfiguration.
			 *
			 * Note that we currently do not support stateful
			 * address configuration by DHCPv6, so the former
			 * case can't happen.
			 */
			if (ifa6->ia6_lifetime.ia6t_expire == 0)
				ltime = ND6_INFINITE_LIFETIME;
			else {
				if (ifa6->ia6_lifetime.ia6t_expire >
				    time_second)
					ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second);
				else
					ltime = 0;
			}

			bcopy(&ltime, cp, sizeof(u_int32_t));
			cp += sizeof(u_int32_t);

			/* copy the address itself */
			bcopy(&ifa6->ia_addr.sin6_addr, cp,
			    sizeof(struct in6_addr));
			in6_clearscope((struct in6_addr *)cp); /* XXX */
			cp += sizeof(struct in6_addr);

			resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t));
			copied += (sizeof(struct in6_addr) + sizeof(u_int32_t));
		}
		if (ifp0)	/* we need to search only on the specified IF */
			break;
	}

	if (allow_deprecated == 0 && ifp_dep != NULL) {
		ifp = ifp_dep;
		allow_deprecated = 1;

		goto again;
	}

	IFNET_RUNLOCK();

	return (copied);
}

/*
 * XXX almost dup'ed code with rip6_input.
 */
static int
icmp6_rip6_input(struct mbuf **mp, int off)
{
	INIT_VNET_INET(curvnet);
	INIT_VNET_INET6(curvnet);
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
	struct in6pcb *in6p;
	struct in6pcb *last = NULL;
	struct sockaddr_in6 fromsa;
	struct icmp6_hdr *icmp6;
	struct mbuf *opts = NULL;

#ifndef PULLDOWN_TEST
	/* this is assumed to be safe. */
	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
#else
	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
	if (icmp6 == NULL) {
		/* m is already reclaimed */
		return (IPPROTO_DONE);
	}
#endif

	/*
	 * XXX: the address may have embedded scope zone ID, which should be
	 * hidden from applications.
	 */
	bzero(&fromsa, sizeof(fromsa));
	fromsa.sin6_family = AF_INET6;
	fromsa.sin6_len = sizeof(struct sockaddr_in6);
	fromsa.sin6_addr = ip6->ip6_src;
	if (sa6_recoverscope(&fromsa)) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	INP_INFO_RLOCK(&V_ripcbinfo);
	LIST_FOREACH(in6p, &V_ripcb, inp_list) {
		if ((in6p->inp_vflag & INP_IPV6) == 0)
			continue;
		if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
		    !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
		    !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
			continue;
		INP_RLOCK(in6p);
		if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
		    in6p->in6p_icmp6filt)) {
			INP_RUNLOCK(in6p);
			continue;
		}
		if (last) {
			struct mbuf *n = NULL;

			/*
			 * Recent network drivers tend to allocate a single
			 * mbuf cluster, rather than to make a couple of
			 * mbufs without clusters.  Also, since the IPv6 code
			 * path tries to avoid m_pullup(), it is highly
			 * probable that we still have an mbuf cluster here
			 * even though the necessary length can be stored in an
			 * mbuf's internal buffer.
			 * Meanwhile, the default size of the receive socket
			 * buffer for raw sockets is not so large.  This means
			 * the possibility of packet loss is relatively higher
			 * than before.  To avoid this scenario, we copy the
			 * received data to a separate mbuf that does not use
			 * a cluster, if possible.
			 * XXX: it is better to copy the data after stripping
			 * intermediate headers.
*/ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked( &last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); INP_RUNLOCK(last); } else { m_freem(m); V_ip6stat.ip6s_delivered--; } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; int plen; int type, code; struct ifnet *outif = NULL; struct in6_addr origdst, *src = NULL; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ origdst = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. 
* The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. * Note that ip6_getdstifaddr() may fail if we are in an error handling * procedure of an outgoing packet of our own, in which case we need * to search in the ifaddr list. */ if (!IN6_IS_ADDR_MULTICAST(&origdst)) { if ((ia = ip6_getdstifaddr(m))) { if (!(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) src = &ia->ia_addr.sin6_addr; } else { struct sockaddr_in6 d; bzero(&d, sizeof(d)); d.sin6_family = AF_INET6; d.sin6_len = sizeof(d); d.sin6_addr = origdst; ia = (struct in6_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&d); if (ia && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src = &ia->ia_addr.sin6_addr; } } } if (src == NULL) { int e; struct sockaddr_in6 sin6; struct route_in6 ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, &outif, &e); if (ro.ro_rt) RTFREE(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &sin6.sin6_addr), e)); goto bad; } } ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; if (outif) ip6->ip6_hlim = ND_IFINFO(outif)->chlim; else if (m->m_pkthdr.rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else ip6->ip6_hlim = V_ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { return; } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { INIT_VNET_INET6(curvnet); struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; if (!m) return; ifp = m->m_pkthdr.rcvif; if (!ifp) return; /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { V_icmp6stat.icp6s_tooshort++; 
return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, gw6), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } RTFREE_LOCKED(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (ndopts.nd_opts_rh) { redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len; redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */ } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); if (!is_onlink) { /* better router case. perform rtredirect. 
*/ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; bzero(&sdst, sizeof(sdst)); bzero(&sgw, sizeof(sgw)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&ssrc); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif /* IPSEC */ } freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { INIT_VNET_INET6(curvnet); struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) MCLGET(m, M_DONTWAIT); if (!m) goto fail; m->m_pkthdr.rcvif = NULL; m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ struct rtentry *rt_router = NULL; int len; struct sockaddr_dl *sdl; struct nd_opt_hdr *nd_opt; char *lladdr; rt_router = nd6_lookup(router_ll6, 0, ifp); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); p += len; } } nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } V_icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? 
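 */

/*
 * Illustrative sketch (not part of this change): a simplified,
 * single-threaded cousin of the ppsratecheck(9) logic used by
 * icmp6_ratelimit() below.  It keeps a per-second budget keyed on
 * wall-clock time; the real kernel routine works on ticks and also
 * copes with locking and time going backwards.  The sk_ names are
 * hypothetical.
 */
#include <time.h>

/*
 * Allow at most "maxpps" events per second.  Returns nonzero if the
 * current event is within the budget, 0 if it should be suppressed.
 */
static int
sk_ppscheck(time_t *lasttime, int *curpps, int maxpps)
{
	time_t now = time(NULL);

	if (now != *lasttime) {		/* a new second: reset the budget */
		*lasttime = now;
		*curpps = 0;
	}
	if (maxpps < 0)			/* negative limit means "no limit" */
		return (1);
	if (*curpps < maxpps) {
		(*curpps)++;		/* consume one unit of the budget */
		return (1);
	}
	return (0);			/* over budget for this second */
}

/*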
* * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { INIT_VNET_INET6(curvnet); int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6_ifattach.c =================================================================== --- head/sys/netinet6/in6_ifattach.c (revision 185347) +++ head/sys/netinet6/in6_ifattach.c (revision 185348) @@ -1,916 +1,915 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VIMAGE_GLOBALS unsigned long in6_maxmtu; int ip6_auto_linklocal; struct callout in6_tmpaddrtimer_ch; -#endif - extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; +#endif static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ MD5_CTX ctxt; u_int8_t digest[16]; int hostnamelen; mtx_lock(&hostname_mtx); hostnamelen = strlen(V_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) return -1; #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, V_hostname, hostnamelen); mtx_unlock(&hostname_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { INIT_VNET_INET6(curvnet); MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. 
 */
	bcopy(digest, ret, 8);
	ret[0] &= ~EUI64_UBIT;

	/*
	 * XXX: we'd like to ensure that the generated value is not zero
	 * for simplicity.  If the calculated digest happens to be zero,
	 * use a random non-zero value as the last resort.
	 */
	if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) {
		nd6log((LOG_INFO,
		    "generate_tmp_ifid: computed MD5 value is zero.\n"));

		val32 = arc4random();
		val32 = 1 + (val32 % (0xffffffff - 1));
	}

	/*
	 * RFC 3041 3.2.1. (4)
	 * Take the rightmost 64-bits of the MD5 digest and save them in
	 * stable storage as the history value to be used in the next
	 * iteration of the algorithm.
	 */
	bcopy(&digest[8], seed0, 8);

	if (0) {		/* for debugging purposes only */
		int i;

		printf("to: ");
		for (i = 0; i < 16; i++)
			printf("%02x", digest[i]);
		printf("\n");
	}

	return 0;
}

/*
 * Get interface identifier for the specified interface.
 * XXX assumes single sockaddr_dl (AF_LINK address) per an interface
 *
 * in6 - upper 64bits are preserved
 */
int
in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6)
{
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;
	u_int8_t *addr;
	size_t addrlen;
	static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
	static u_int8_t allone[8] =
	    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

	for (ifa = ifp->if_addrlist.tqh_first; ifa;
	    ifa = ifa->ifa_list.tqe_next) {
		if (ifa->ifa_addr->sa_family != AF_LINK)
			continue;
		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
		if (sdl == NULL)
			continue;
		if (sdl->sdl_alen == 0)
			continue;

		goto found;
	}

	return -1;

found:
	addr = LLADDR(sdl);
	addrlen = sdl->sdl_alen;

	/* get EUI64 */
	switch (ifp->if_type) {
	case IFT_ETHER:
	case IFT_FDDI:
	case IFT_ISO88025:
	case IFT_ATM:
	case IFT_IEEE1394:
#ifdef IFT_IEEE80211
	case IFT_IEEE80211:
#endif
		/* IEEE802/EUI64 cases - what others? */
		/* IEEE1394 uses 16byte length address starting with EUI64 */
		if (addrlen > 8)
			addrlen = 8;

		/* look at IEEE802/EUI64 only */
		if (addrlen != 8 && addrlen != 6)
			return -1;

		/*
		 * check for invalid MAC address - on bsdi, we see it a lot
		 * since wildboar configures all-zero MAC on pccard before
		 * card insertion.
		 */
		if (bcmp(addr, allzero, addrlen) == 0)
			return -1;
		if (bcmp(addr, allone, addrlen) == 0)
			return -1;

		/* make EUI64 address */
		if (addrlen == 8)
			bcopy(addr, &in6->s6_addr[8], 8);
		else if (addrlen == 6) {
			in6->s6_addr[8] = addr[0];
			in6->s6_addr[9] = addr[1];
			in6->s6_addr[10] = addr[2];
			in6->s6_addr[11] = 0xff;
			in6->s6_addr[12] = 0xfe;
			in6->s6_addr[13] = addr[3];
			in6->s6_addr[14] = addr[4];
			in6->s6_addr[15] = addr[5];
		}
		break;

	case IFT_ARCNET:
		if (addrlen != 1)
			return -1;
		if (!addr[0])
			return -1;

		bzero(&in6->s6_addr[8], 8);
		in6->s6_addr[15] = addr[0];

		/*
		 * due to insufficient bitwidth, we mark it local.
		 */
		in6->s6_addr[8] &= ~EUI64_GBIT;	/* g bit to "individual" */
		in6->s6_addr[8] |= EUI64_UBIT;	/* u bit to "local" */
		break;

	case IFT_GIF:
#ifdef IFT_STF
	case IFT_STF:
#endif
		/*
		 * RFC2893 says: "SHOULD use IPv4 address as ifid source".
		 * however, IPv4 address is not very suitable as unique
		 * identifier source (can be renumbered).
		 * we don't do this.
		 */
		return -1;

	default:
		return -1;
	}

	/* sanity check: g bit must not indicate "group" */
	if (EUI64_GROUP(in6))
		return -1;

	/* convert EUI64 into IPv6 interface identifier */
	EUI64_TO_IFID(in6);

	/*
	 * sanity check: ifid must not be all zero, avoid conflict with
	 * subnet router anycast
	 */
	if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 &&
	    bcmp(&in6->s6_addr[9], allzero, 7) == 0) {
		return -1;
	}

	return 0;
}

/*
 * Get interface identifier for the specified interface.
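 */

/*
 * Illustrative sketch (not part of this change): the modified EUI-64
 * derivation that in6_get_hw_ifid() above applies to 6-byte IEEE802
 * addresses: insert 0xff/0xfe in the middle and flip the
 * universal/local bit (EUI64_UBIT, 0x02).  The sk_ name is
 * hypothetical.
 */
#include <stdint.h>

static void
sk_mac_to_ifid(const uint8_t mac[6], uint8_t ifid[8])
{

	ifid[0] = mac[0];
	ifid[1] = mac[1];
	ifid[2] = mac[2];
	ifid[3] = 0xff;			/* EUI-48 -> EUI-64 filler */
	ifid[4] = 0xfe;
	ifid[5] = mac[3];
	ifid[6] = mac[4];
	ifid[7] = mac[5];
	ifid[0] ^= 0x02;		/* as EUI64_TO_IFID(): flip the u bit */
}

/*
 * For example, the MAC 00:11:22:33:44:55 yields the interface
 * identifier 02:11:22:ff:fe:33:44:55.
 */

/*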
If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { INIT_VNET_NET(ifp0->if_vnet); INIT_VNET_INET6(ifp0->if_vnet); struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK(); for (ifp = V_ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK(); goto success; } } IFNET_RUNLOCK(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; int i, error; /* * configure link-local address. */ bzero(&ifra, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; ifra.ifra_prefixmask.sin6_addr = in6mask64; /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. 
(jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ #ifdef DIAGNOSTIC if (!ia) { panic("ia == NULL in in6_ifattach_linklocal"); /* NOTREACHED */ } #endif /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ for (i = 0; i < 4; i++) { pr0.ndpr_prefix.sin6_addr.s6_addr32[i] &= in6mask64.s6_addr32[i]; } /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if (nd6_prefix_lookup(&pr0) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { INIT_VNET_INET6(curvnet); struct in6_aliasreq ifra; int error; bzero(&ifra, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; ifra.ifra_prefixmask.sin6_addr = in6mask128; /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* we don't need to perform DAD on loopback interfaces. */ ifra.ifra_flags |= IN6_IFF_NODAD; /* skip registration to the prefix list. XXX should be temporary. */ ifra.ifra_flags |= IN6_IFF_NOPFX; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later). * * when ifp == NULL, the caller is responsible for filling scopeid. 
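 */

/*
 * Illustrative sketch (not part of this change): the hash step of
 * in6_nigroup() below as a self-contained userland function, assuming
 * libmd's <md5.h> (link with -lmd).  The first DNS label of the name
 * is lowercased and hashed together with a one-byte length prefix;
 * the leading 32 bits of the digest become the group suffix in
 * ff02:0:0:0:0:2:xxxx:xxxx.  The sk_ name is hypothetical.
 */
#include <md5.h>
#include <ctype.h>
#include <stdint.h>
#include <string.h>

static int
sk_nigroup_suffix(const char *name, uint8_t suffix[4])
{
	MD5_CTX ctx;
	uint8_t digest[16];
	char label[64];			/* a label is at most 63 bytes */
	uint8_t len = 0;

	/* isolate and lowercase the first label of the name */
	while (name[len] != '\0' && name[len] != '.') {
		if (len >= sizeof(label) - 1)
			return (-1);	/* label too long */
		label[len] = (char)tolower((unsigned char)name[len]);
		len++;
	}
	if (len == 0)
		return (-1);

	MD5Init(&ctx);
	MD5Update(&ctx, &len, sizeof(len));	/* length byte first */
	MD5Update(&ctx, label, len);
	MD5Final(digest, &ctx);
	memcpy(suffix, digest, 4);	/* leading 32 bits of the digest */
	return (0);
}

/*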
*/ int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ if (!namelen || !name) return -1; p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p - name > sizeof(n) - 1) return -1; /* label too long */ l = p - name; strncpy(n, name, l); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *ia; struct in6_addr in6; /* some of the interfaces are inherently not IPv6 capable */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_CARP: return; } /* * quirks based on interface type */ switch (ifp->if_type) { #ifdef IFT_STF case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ goto statinit; #endif default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { in6 = in6addr_loopback; if (in6ifa_ifpwithaddr(ifp, &in6) == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } } /* * assign a link-local address, if there's none. */ if (V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { if (in6_ifattach_linklocal(ifp, altifp) == 0) { /* linklocal address assigned */ } else { /* failed to assign linklocal address. bark? */ } } } #ifdef IFT_STF /* XXX */ statinit: #endif /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * We don't need this function in bsdi, because interfaces are never removed * from the ifnet list in bsdi. 
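 */

/*
 * Illustrative sketch (not part of this change): the deletion-safe
 * list walk used by in6_ifdetach() below, which must capture the
 * successor before the current entry is destroyed.  <sys/queue.h> is
 * the BSD macro set already used by this code; the sk_ names are
 * hypothetical.
 */
#include <sys/queue.h>
#include <stdlib.h>

struct sk_node {
	TAILQ_ENTRY(sk_node)	link;
	int			family;
};
TAILQ_HEAD(sk_list, sk_node);

/* remove and free every node of the given family */
static void
sk_purge_family(struct sk_list *list, int family)
{
	struct sk_node *n, *tmp;

	/* "tmp" holds the successor, so removing "n" is safe */
	TAILQ_FOREACH_SAFE(n, list, link, tmp) {
		if (n->family != family)
			continue;
		TAILQ_REMOVE(list, n, link);
		free(n);
	}
}

/*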
*/ void in6_ifdetach(struct ifnet *ifp) { INIT_VNET_NET(ifp->if_vnet); INIT_VNET_INET(ifp->if_vnet); INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *ia, *oia; struct ifaddr *ifa, *next; struct rtentry *rt; short rtflags; struct sockaddr_in6 sin6; struct in6_multi_mship *imm; /* remove neighbor management table */ nd6_purge(ifp); /* nuke any of IPv6 addresses we have */ for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) { next = ifa->ifa_list.tqe_next; if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* undo everything done by in6_ifattach(), just in case */ for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) { next = ifa->ifa_list.tqe_next; if (ifa->ifa_addr->sa_family != AF_INET6 || !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)->sin6_addr)) { continue; } ia = (struct in6_ifaddr *)ifa; /* * leave from multicast groups we have joined for the interface */ while ((imm = ia->ia6_memberships.lh_first) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } /* remove from the routing table */ if ((ia->ia_flags & IFA_ROUTE) && (rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0UL))) { rtflags = rt->rt_flags; rtfree(rt); rtrequest(RTM_DELETE, (struct sockaddr *)&ia->ia_addr, (struct sockaddr *)&ia->ia_addr, (struct sockaddr *)&ia->ia_prefixmask, rtflags, (struct rtentry **)0); } /* remove from the linked list */ TAILQ_REMOVE(&ifp->if_addrlist, (struct ifaddr *)ia, ifa_list); IFAFREE(&ia->ia_ifa); /* also remove from the IPv6 address chain(itojun&jinmei) */ oia = ia; if (oia == (ia = V_in6_ifaddr)) V_in6_ifaddr = ia->ia_next; else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; if (ia->ia_next) ia->ia_next = oia->ia_next; else { nd6log((LOG_ERR, "%s: didn't unlink in6ifaddr from list\n", if_name(ifp))); } } IFAFREE(&oia->ia_ifa); } in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * remove neighbor management table. we call it twice just to make * sure we nuke everything. maybe we need just one call. * XXX: since the first call did not release addresses, some prefixes * might remain. We should call nd6_purge() again to release the * prefixes after removing all addresses above. * (Or can we just delay calling nd6_purge until at this point?) */ nd6_purge(ifp); /* remove route to link-local allnodes multicast (ff02::1) */ bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = in6addr_linklocal_allnodes; if (in6_setscope(&sin6.sin6_addr, ifp, NULL)) /* XXX: should not fail */ return; /* XXX grab lock first to avoid LOR */ if (V_rt_tables[0][AF_INET6] != NULL) { RADIX_NODE_HEAD_LOCK(V_rt_tables[0][AF_INET6]); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_ifp == ifp) rtexpunge(rt); RTFREE_LOCKED(rt); } RADIX_NODE_HEAD_UNLOCK(V_rt_tables[0][AF_INET6]); } } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. 
*/ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *ignored_arg) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); bzero(nullbuf, sizeof(nullbuf)); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } } static void in6_purgemaddrs(struct ifnet *ifp) { struct in6_multi *in6m; struct in6_multi *oin6m; #ifdef DIAGNOSTIC printf("%s: purging ifp %p\n", __func__, ifp); #endif IFF_LOCKGIANT(ifp); LIST_FOREACH_SAFE(in6m, &in6_multihead, in6m_entry, oin6m) { if (in6m->in6m_ifp == ifp) in6_delmulti(in6m); } IFF_UNLOCKGIANT(ifp); } Index: head/sys/netinet6/in6_proto.c =================================================================== --- head/sys/netinet6/in6_proto.c (revision 185347) +++ head/sys/netinet6/in6_proto.c (revision 185348) @@ -1,568 +1,568 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_carp.h" #include "opt_sctp.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. 
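 *
 * (Each inet6sw[] entry below binds one (pr_type, pr_protocol) pair to
 * its input/output/ctl handlers; the tail entry with pr_protocol left
 * at 0 is the raw wildcard that catches everything unclaimed.  A
 * hedged sketch of the lookup idea -- a simplification of the stock
 * pffindproto(), not its actual code:
 *
 *	struct protosw *
 *	find_proto6(struct domain *dp, int type, int proto)
 *	{
 *		struct protosw *pr, *wild = NULL;
 *
 *		for (pr = dp->dom_protosw;
 *		    pr < dp->dom_protoswNPROTOSW; pr++) {
 *			if (pr->pr_type != type)
 *				continue;
 *			if (pr->pr_protocol == proto)
 *				return (pr);	-- exact match wins
 *			if (pr->pr_protocol == 0 && type == SOCK_RAW)
 *				wild = pr;	-- remember the wildcard
 *		}
 *		return (wild);
 *	}
 * )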
*/ extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 struct ip6protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization and timeout routines twice */ .pr_init = tcp_init, .pr_fasttimo = tcp_fasttimo, .pr_slowtimo = tcp_slowtimo, #endif .pr_drain = tcp_drain, .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_init = icmp6_init, .pr_fasttimo = icmp6_fasttimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = esp6_ctlinput, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC */ #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, 
.pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #ifdef DEV_CARP { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #endif /* DEV_CARP */ /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; extern int in6_inithead(void **, int); struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *) &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn6_mpath_inithead, #else .dom_rtattach = in6_inithead, #endif .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, .dom_maxrtkey = sizeof(struct sockaddr_in6), .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach }; DOMAIN_SET(inet6); /* * Internet configuration info */ #ifdef VIMAGE_GLOBALS int ip6_forwarding; int ip6_sendredirects; int ip6_defhlim; int ip6_defmcasthlim; int ip6_accept_rtadv; int ip6_maxfragpackets; int ip6_maxfrags; int ip6_log_interval; int ip6_hdrnestlimit; int ip6_dad_count; int ip6_auto_flowlabel; int ip6_use_deprecated; int ip6_rr_prune; int ip6_mcast_pmtu; int ip6_v6only; int ip6_keepfaith; time_t ip6_log_time; int ip6stealth; int nd6_onlink_ns_rfc4861; #endif #ifdef VIMAGE_GLOBALS /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ int pmtu_expire; int pmtu_probe; /* raw IP6 parameters */ /* * Nominal space allocated to a raw ip socket. */ u_long rip6_sendspace; u_long rip6_recvspace; /* ICMPV6 parameters */ int icmp6_rediraccept; int icmp6_redirtimeout; int icmp6errppslim; /* control how to respond to NI queries */ int icmp6_nodeinfo; /* UDP on IP6 parameters */ int udp6_sendspace; int udp6_recvspace; #endif /* VIMAGE_GLOBALS */ /* * sysctl related items. 
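 *
 * (Most knobs below use the SYSCTL_V_* wrappers: under options VIMAGE
 * they resolve the variable per-vnet through its offset in struct
 * vnet_inet6, and otherwise they collapse to the plain global
 * SYSCTL_* forms -- which is why the variable argument is written
 * without the usual '&'.  Hedged illustration of the intent only, not
 * the real macro bodies:
 *
 *	#ifdef VIMAGE
 *		SYSCTL_V_INT(...)  -- look up offsetof(struct vnet_inet6,
 *		    _var) in the current vnet's instance data
 *	#else
 *		SYSCTL_V_INT(...)  -- plain SYSCTL_INT(parent, nbr, name,
 *		    access, &var, val, descr)
 *	#endif
 * )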
*/ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW, 0, "TCP6"); #ifdef SCTP SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW, 0, "SCTP6"); #endif #ifdef IPSEC SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_preferred_lifetime < V_ip6_desync_factor + V_ip6_temp_regen_advance) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_RW, ip6_forwarding, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_RW, ip6_sendredirects, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_RW, ip6_defhlim, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD, ip6stat, ip6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_RW, ip6_maxfragpackets, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_RW, ip6_accept_rtadv, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith, CTLFLAG_RW, ip6_keepfaith, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_RW, ip6_log_interval, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_RW, ip6_hdrnestlimit, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_RW, ip6_dad_count, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_RW, ip6_auto_flowlabel, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_RW, ip6_defmcasthlim, 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_RW, ip6_use_deprecated, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_RW, ip6_rr_prune, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_RW, ip6_use_tempaddr, 0, ""); SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLTYPE_INT|CTLFLAG_RW, ip6_temp_preferred_lifetime, 0, sysctl_ip6_temppltime, "I", ""); SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, 
CTLTYPE_INT|CTLFLAG_RW, ip6_temp_valid_lifetime, 0, sysctl_ip6_tempvltime, "I", ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_RW, ip6_v6only, 0, ""); #ifndef VIMAGE TUNABLE_INT("net.inet6.ip6.auto_linklocal", &ip6_auto_linklocal); #endif SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_RW, ip6_auto_linklocal, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, rip6stat, rip6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_RW, ip6_prefer_tempaddr, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_RW, ip6_use_defzone, 0,""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW, ip6_maxfrags, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_RW, ip6_mcast_pmtu, 0, ""); #ifdef IPSTEALTH SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, ip6stealth, 0, ""); #endif /* net.inet6.icmp6 */ SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_RW, icmp6_rediraccept, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_RW, icmp6_redirtimeout, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD, icmp6stat, icmp6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_RW, nd6_prune, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_RW, nd6_delay, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_RW, nd6_umaxtries, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_RW, nd6_mmaxtries, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_RW, nd6_useloopback, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_RW, icmp6_nodeinfo, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_RW, icmp6errppslim, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_RW, nd6_maxnudhint, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_RW, nd6_debug, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, - nd6_onlink_ns_rfc4861, CTLFLAG_RW, &nd6_onlink_ns_rfc4861, 0, +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, + nd6_onlink_ns_rfc4861, CTLFLAG_RW, nd6_onlink_ns_rfc4861, 0, "Accept 'on-link' nd6 NS in compliance with RFC 4861."); Index: head/sys/netinet6/in6_rmx.c =================================================================== --- head/sys/netinet6/in6_rmx.c (revision 185347) +++ head/sys/netinet6/in6_rmx.c (revision 185348) @@ -1,499 +1,499 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $ */ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * This code does two things necessary for the enhanced TCP metrics to * function in a useful manner: * 1) It marks all non-host routes as `cloning', thus ensuring that * every actual reference to such a route actually gets turned * into a reference to a host route to the specific destination * requested. * 2) When such routes lose all their references, it arranges for them * to be deleted in some random collection of circumstances, so that * a large quantity of stale routing data is not kept in kernel memory * indefinitely. See in6_rtqtimo() below for the exact mechanism. 
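 *
 * (In outline -- a hedged paraphrase of in6_clsroute() and
 * in6_rtqkill() below:
 *
 *	on last reference drop:
 *		rt->rt_flags |= RTPRF_OURS;
 *		rt->rt_rmx.rmx_expire = time_uptime + rtq_reallyold6;
 *	on each timer pass:
 *		if ((rt->rt_flags & RTPRF_OURS) &&
 *		    rt->rt_rmx.rmx_expire <= time_uptime)
 *			rtrequest(RTM_DELETE, ...);
 * )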
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in6_inithead(void **head, int off); #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* * Do what we need to do when inserting a route. */ static struct radix_node * in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt); struct radix_node *ret; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) rt->rt_flags |= RTF_MULTICAST; /* * A little bit of help for both IPv6 output and input: * For local addresses, we make sure that RTF_LOCAL is set, * with the thought that this might one day be used to speed up * ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). (This * is done above.) * * XXX * should elaborate the code. */ if (rt->rt_flags & RTF_HOST) { if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr) ->sin6_addr, &sin6->sin6_addr)) { rt->rt_flags |= RTF_LOCAL; } } if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp); ret = rn_addroute(v_arg, n_arg, head, treenodes); if (ret == NULL && rt->rt_flags & RTF_HOST) { struct rtentry *rt2; /* * We are trying to add a host route, but can't. * Find out if it is because of an * ARP entry and delete it if so. */ rt2 = rtalloc1((struct sockaddr *)sin6, 0, RTF_CLONING); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK) { rtexpunge(rt2); RTFREE_LOCKED(rt2); ret = rn_addroute(v_arg, n_arg, head, treenodes); } else RTFREE_LOCKED(rt2); } } else if (ret == NULL && rt->rt_flags & RTF_CLONING) { struct rtentry *rt2; /* * We are trying to add a net route, but can't. * The following case should be allowed, so we'll make a * special check for this: * Two IPv6 addresses with the same prefix is assigned * to a single interrface. * # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1) * # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2) * In this case, (*1) and (*2) want to add the same * net route entry, 3ffe:0501:: -> if0. * This case should not raise an error. */ rt2 = rtalloc1((struct sockaddr *)sin6, 0, RTF_CLONING); if (rt2) { if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY)) == RTF_CLONING && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK && rt2->rt_ifp == rt->rt_ifp) { ret = rt2->rt_nodes; } RTFREE_LOCKED(rt2); } } return ret; } /* * This code is the inverse of in6_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. 
*/ static struct radix_node * in6_matroute(void *v_arg, struct radix_node_head *head) { struct radix_node *rn = rn_match(v_arg, head); struct rtentry *rt = (struct rtentry *)rn; if (rt && rt->rt_refcnt == 0) { /* this is first reference */ if (rt->rt_flags & RTPRF_OURS) { rt->rt_flags &= ~RTPRF_OURS; rt->rt_rmx.rmx_expire = 0; } } return rn; } SYSCTL_DECL(_net_inet6_ip6); #ifdef VIMAGE_GLOBALS static int rtq_reallyold6; static int rtq_minreallyold6; static int rtq_toomany6; #endif -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire, - CTLFLAG_RW, &rtq_reallyold6 , 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RTEXPIRE, + rtexpire, CTLFLAG_RW, rtq_reallyold6 , 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire, - CTLFLAG_RW, &rtq_minreallyold6 , 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RTMINEXPIRE, + rtminexpire, CTLFLAG_RW, rtq_minreallyold6 , 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, - CTLFLAG_RW, &rtq_toomany6 , 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RTMAXCACHE, + rtmaxcache, CTLFLAG_RW, rtq_toomany6 , 0, ""); /* * On last reference drop, mark the route as belong to us so that it can be * timed out. */ static void in6_clsroute(struct radix_node *rn, struct radix_node_head *head) { INIT_VNET_INET6(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) return; if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED) return; /* * As requested by David Greenman: * If rtq_reallyold6 is 0, just delete the route without * waiting for a timeout cycle to kill it. */ if (V_rtq_reallyold6 != 0) { rt->rt_flags |= RTPRF_OURS; rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold6; } else { rtexpunge(rt); } } struct rtqk_arg { struct radix_node_head *rnh; int mode; int updating; int draining; int killed; int found; time_t nextstop; }; /* * Get rid of old routes. When draining, this deletes everything, even when * the timeout is not expired yet. When updating, this makes sure that * nothing has a timeout longer than the current value of rtq_reallyold6. 
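 *
 * (Hedged usage sketch -- this is essentially how the disabled
 * in6_rtqdrain() below drives the walker:
 *
 *	struct rtqk_arg arg;
 *
 *	bzero(&arg, sizeof(arg));
 *	arg.rnh = rnh;
 *	arg.draining = 1;	-- delete even unexpired RTPRF_OURS routes
 *	RADIX_NODE_HEAD_LOCK(rnh);
 *	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
 *	RADIX_NODE_HEAD_UNLOCK(rnh);
 * )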
*/ static int in6_rtqkill(struct radix_node *rn, void *rock) { INIT_VNET_INET6(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; if (rt->rt_flags & RTPRF_OURS) { ap->found++; if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); if (err) { log(LOG_WARNING, "in6_rtqkill: error %d", err); } else { ap->killed++; } } else { if (ap->updating && (rt->rt_rmx.rmx_expire - time_uptime > V_rtq_reallyold6)) { rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold6; } ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); } } return 0; } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ #ifdef VIMAGE_GLOBALS static int rtq_timeout6; static struct callout rtq_timer6; #endif static void in6_rtqtimo(void *rock) { CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); struct radix_node_head *rnh = rock; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = time_uptime + V_rtq_timeout6; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); /* * Attempt to be somewhat dynamic about this: * If there are ``too many'' routes sitting around taking up space, * then crank down the timeout, and see if we can't make some more * go away. However, we make sure that we will never adjust more * than once in rtq_timeout6 seconds, to keep from cranking down too * hard. */ if ((arg.found - arg.killed > V_rtq_toomany6) && (time_uptime - last_adjusted_timeout >= V_rtq_timeout6) && V_rtq_reallyold6 > V_rtq_minreallyold6) { V_rtq_reallyold6 = 2*V_rtq_reallyold6 / 3; if (V_rtq_reallyold6 < V_rtq_minreallyold6) { V_rtq_reallyold6 = V_rtq_minreallyold6; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold6 to %d", V_rtq_reallyold6); #endif arg.found = arg.killed = 0; arg.updating = 1; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; callout_reset(&V_rtq_timer6, tvtohz(&atv), in6_rtqtimo, rock); CURVNET_RESTORE(); } /* * Age old PMTUs. 
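 *
 * (Path MTU entries are not deleted when they age out; in6_mtuexpire()
 * below merely sets RTF_PROBEMTU once rmx_expire passes, so a later
 * sender may probe for a larger MTU again, along the lines of RFC
 * 1981's aging recommendation -- a hedged reading of the code that
 * follows.)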
*/ struct mtuex_arg { struct radix_node_head *rnh; time_t nextstop; }; #ifdef VIMAGE_GLOBALS static struct callout rtq_mtutimer; #endif static int in6_mtuexpire(struct radix_node *rn, void *rock) { struct rtentry *rt = (struct rtentry *)rn; struct mtuex_arg *ap = rock; /* sanity */ if (!rt) panic("rt == NULL in in6_mtuexpire"); if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) { if (rt->rt_rmx.rmx_expire <= time_uptime) { rt->rt_flags |= RTF_PROBEMTU; } else { ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); } } return 0; } #define MTUTIMO_DEFAULT (60*1) static void in6_mtutimo(void *rock) { CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); struct radix_node_head *rnh = rock; struct mtuex_arg arg; struct timeval atv; arg.rnh = rnh; arg.nextstop = time_uptime + MTUTIMO_DEFAULT; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; if (atv.tv_sec < 0) { printf("invalid mtu expiration time on routing table\n"); arg.nextstop = time_uptime + 30; /* last resort */ atv.tv_sec = 30; } callout_reset(&V_rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock); CURVNET_RESTORE(); } #if 0 void in6_rtqdrain(void) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh = V_rt_tables[AF_INET6]; struct rtqk_arg arg; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = 0; arg.draining = 1; arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } #endif /* * Initialize our routing tree. * XXX MRT When off == 0, we are being called from vfs_export.c * so just set up their table and leave. (we know what the correct * value should be so just use that).. FIX AFTER RELENG_7 is MFC'd * see also comments in in_inithead() vfs_export.c and domain.h */ int in6_inithead(void **head, int off) { INIT_VNET_INET6(curvnet); struct radix_node_head *rnh; if (!rn_inithead(head, offsetof(struct sockaddr_in6, sin6_addr) << 3)) return 0; /* See above */ if (off == 0) /* See above */ return 1; /* only do the rest for the real thing */ V_rtq_reallyold6 = 60*60; /* one hour is ``really old'' */ V_rtq_minreallyold6 = 10; /* never automatically crank down to less */ V_rtq_toomany6 = 128; /* 128 cached routes is ``too many'' */ V_rtq_timeout6 = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in6_addroute; rnh->rnh_matchaddr = in6_matroute; rnh->rnh_close = in6_clsroute; callout_init(&V_rtq_timer6, CALLOUT_MPSAFE); in6_rtqtimo(rnh); /* kick off timeout first time */ callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE); in6_mtutimo(rnh); /* kick off timeout first time */ return 1; } Index: head/sys/netinet6/in6_src.c =================================================================== --- head/sys/netinet6/in6_src.c (revision 185347) +++ head/sys/netinet6/in6_src.c (revision 185348) @@ -1,1131 +1,1130 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx addrsel_lock; #define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF) #define ADDRSEL_LOCK() mtx_lock(&addrsel_lock) #define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock) #define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED) static struct sx addrsel_sxlock; #define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock") #define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock) #define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock) #define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock) #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) #ifdef VIMAGE_GLOBALS struct in6_addrpolicy defaultaddrpolicy; int ip6_prefer_tempaddr; #endif static int selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct rtentry **, int, int)); static int in6_selectif __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *ro, struct ifnet **)); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy __P((int (*)(struct in6_addrpolicy *, void *), void *)); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ #define REPLACE(r) do {\ if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ V_ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto replace; \ } while(0) #define NEXT(r) do {\ if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ V_ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? 
ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ V_ip6stat.ip6s_sources_rule[(r)]++; \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) struct in6_addr * in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct route_in6 *ro, struct ucred *cred, struct ifnet **ifpp, int *errorp) { INIT_VNET_INET6(curvnet); struct in6_addr dst; struct ifnet *ifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; u_int32_t odstzone; int prefer_tempaddr; struct ip6_moptions *mopts; dst = dstsock->sin6_addr; /* make a copy for local operation */ *errorp = 0; if (ifpp) *ifpp = NULL; if (inp != NULL) { INP_LOCK_ASSERT(inp); mopts = inp->in6p_moptions; } else { mopts = NULL; } /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { struct sockaddr_in6 srcsock; struct in6_ifaddr *ia6; /* get the outgoing interface */ if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0) { return (NULL); } /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. */ bzero(&srcsock, sizeof(srcsock)); srcsock.sin6_family = AF_INET6; srcsock.sin6_len = sizeof(srcsock); srcsock.sin6_addr = pi->ipi6_addr; if (ifp) { *errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL); if (*errorp != 0) return (NULL); } ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock)); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { *errorp = EADDRNOTAVAIL; return (NULL); } pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; return (&ia6->ia_addr.sin6_addr); } /* * Otherwise, if the socket has already bound the source, just use it. */ if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { return (&inp->in6p_laddr); } /* * If the address is not specified, choose the best one based on * the outgoing interface and the destination address. */ /* get the outgoing interface */ if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0) return (NULL); #ifdef DIAGNOSTIC if (ifp == NULL) /* this should not happen */ panic("in6_selectsrc: NULL ifp"); #endif *errorp = in6_setscope(&dst, ifp, &odstzone); if (*errorp != 0) return (NULL); for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. 
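 *
 * (Hedged example of the zone test below: for dst = fe80::1 reached
 * through em0, odstzone is em0's link zone; a candidate address whose
 * interface ifp1 is em1 recomputes dstzone as em1's zone, fails the
 * odstzone != dstzone comparison, and is skipped.  The interface
 * names are illustrative only.)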
*/ if (in6_setscope(&dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(0); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(&dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. */ if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 * case above. */ /* Rule 5: Prefer outgoing interface */ if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. */ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = V_ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. */ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? 
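 *
 * (Recap of the ladder above -- a hedged paraphrase of RFC 3484
 * source selection as implemented here:
 *
 *	rule 1	same address as dst	rule 6	matching policy label
 *	rule 2	appropriate scope	rule 7	public vs. temporary
 *	rule 3	avoid deprecated	rule 8	interface is up (KAME)
 *	rule 4	home address (TODO)	rule 14	longest prefix match
 *	rule 5	outgoing interface	rule 15	reserved
 * )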
*/ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst)); next: continue; out: break; } if ((ia = ia_best) == NULL) { *errorp = EADDRNOTAVAIL; return (NULL); } if (ifpp) *ifpp = ifp; return (&ia->ia_addr.sin6_addr); } /* * clone - meaningful only for bsdi and freebsd */ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone, int norouteok) { - INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = NULL; struct rtentry *rt = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; #if 0 char ip6buf[INET6_ADDRSTRLEN]; if (dstsock->sin6_addr.s6_addr32[0] == 0 && dstsock->sin6_addr.s6_addr32[1] == 0 && !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { printf("in6_selectroute: strange destination %s\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr)); } else { printf("in6_selectroute: destination = %s%%%d\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr), dstsock->sin6_scope_id); /* for debug */ } #endif /* If the caller specify the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for * multicast. */ goto done; } else goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { goto done; /* we do not need a route for multicast. */ } getroute: /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route_in6 *ron; sin6_next = satosin6(opts->ip6po_nexthop); /* at this moment, we only support AF_INET6 next hops */ if (sin6_next->sin6_family != AF_INET6) { error = EAFNOSUPPORT; /* or should we proceed? */ goto done; } /* * If the next hop is an IPv6 address, then the node identified * by that address must be a neighbor of the sending host. */ ron = &opts->ip6po_nextroute; if ((ron->ro_rt && (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != (RTF_UP | RTF_LLINFO)) || !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, &sin6_next->sin6_addr)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } *satosin6(&ron->ro_dst) = *sin6_next; } if (ron->ro_rt == NULL) { rtalloc((struct route *)ron); /* multi path case? */ if (ron->ro_rt == NULL || !(ron->ro_rt->rt_flags & RTF_LLINFO)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } error = EHOSTUNREACH; goto done; } } rt = ron->ro_rt; ifp = rt->rt_ifp; /* * When cloning is required, try to allocate a route to the * destination so that the caller can store path MTU * information. */ if (!clone) goto done; } /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. 
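 *
 * (Hedged example: a route cache embedded in a dual-family PCB may
 * still hold an AF_INET destination from an earlier IPv4 send; the
 * sa_family check below catches that case before the IPv6 address
 * compare would read the wrong sockaddr layout.)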
*/ if (ro) { if (ro->ro_rt && (!(ro->ro_rt->rt_flags & RTF_UP) || ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } if (ro->ro_rt == (struct rtentry *)NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); sa6 = (struct sockaddr_in6 *)&ro->ro_dst; *sa6 = *dstsock; sa6->sin6_scope_id = 0; if (clone) { #ifdef RADIX_MPATH rtalloc_mpath((struct route *)ro, ntohl(sa6->sin6_addr.s6_addr32[3])); #else rtalloc((struct route *)ro); #endif } else { ro->ro_rt = rtalloc1(&((struct route *)ro) ->ro_dst, 0, 0UL); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } } /* * do not care about the result if we have the nexthop * explicitly specified. */ if (opts && opts->ip6po_nexthop) goto done; if (ro->ro_rt) { ifp = ro->ro_rt->rt_ifp; if (ifp == NULL) { /* can this really happen? */ RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } if (ro->ro_rt == NULL) error = EHOSTUNREACH; rt = ro->ro_rt; /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (ifp && opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) { error = EHOSTUNREACH; goto done; } } } done: if (ifp == NULL && rt == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case an error. */ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) V_ip6stat.ip6s_noroute++; if (retifp != NULL) *retifp = ifp; if (retrt != NULL) *retrt = rt; /* rt may be NULL */ return (error); } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp) { int error; struct route_in6 sro; struct rtentry *rt = NULL; if (ro == NULL) { bzero(&sro, sizeof(sro)); ro = &sro; } if ((error = selectroute(dstsock, opts, mopts, ro, retifp, &rt, 0, 1)) != 0) { if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (error); } /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (flags); } /* * Adjust the "outgoing" interface. If we're going to loop the packet * back to ourselves, the ifp would be the loopback interface. * However, we'd rather know the interface associated to the * destination address (which should probably be one of our own * addresses.) 
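 *
 * (Hedged example: sending to one of our own global addresses yields
 * rt->rt_ifp == lo0, while rt->rt_ifa->ifa_ifp is the real interface
 * that owns the address -- the latter is what scope disambiguation
 * wants, hence the override below.)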
*/ if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) *retifp = rt->rt_ifa->ifa_ifp; if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (0); } /* * clone - meaningful only for bsdi and freebsd */ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone) { return (selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone, 0)); } /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6_selecthlim(struct in6pcb *in6p, struct ifnet *ifp) { INIT_VNET_INET6(curvnet); if (in6p && in6p->in6p_hops >= 0) return (in6p->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { struct route_in6 ro6; struct ifnet *lifp; bzero(&ro6, sizeof(ro6)); ro6.ro_dst.sin6_family = AF_INET6; ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6.ro_dst.sin6_addr = in6p->in6p_faddr; rtalloc((struct route *)&ro6); if (ro6.ro_rt) { lifp = ro6.ro_rt->rt_ifp; RTFREE(ro6.ro_rt); if (lifp) return (ND_IFINFO(lifp)->chlim); } else return (V_ip6_defhlim); } return (V_ip6_defhlim); } /* * XXX: this is borrowed from in6_pcbbind(). If possible, we should * share this function by all *bsd*... */ int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { INIT_VNET_INET(curvnet); struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0, dorandom; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP, use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count. */ if (pcbinfo != &V_udbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { u_int16_t aux; aux = first; first = last; last = aux; } if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) { /* completely used? */ /* Undo an address bind that may have occurred. 
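 *
 * (Hedged worked example of the allocator in this loop: with, say,
 * first = 49152 and last = 65535, dorandom picks a starting point of
 * first + arc4random() % 16383; each failed in6_pcblookup_local()
 * probe then advances linearly, wrapping from last back to first, and
 * after count = last - first misses we land here and give up with
 * EADDRNOTAVAIL.)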
*/ inp->in6p_laddr = in6addr_any; return (EADDRNOTAVAIL); } ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, wild, cred)); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } return (0); } void addrsel_policy_init(void) { ADDRSEL_LOCK_INIT(); ADDRSEL_SXLOCK_INIT(); INIT_VNET_INET6(curvnet); V_ip6_prefer_tempaddr = 0; init_policy_queue(); /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { INIT_VNET_INET6(curvnet); struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) match = &V_defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); return (match); } /* * Subroutines to manage the address selection policy table via sysctl. */ struct walkarg { struct sysctl_req *w_req; }; static int in6_src_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, CTLFLAG_RD, in6_src_sysctl, ""); static int in6_src_sysctl(SYSCTL_HANDLER_ARGS) { struct walkarg w; if (req->newptr) return EPERM; bzero(&w, sizeof(w)); w.w_req = req; return (walk_addrsel_policy(dump_addrsel_policyent, &w)); } int in6_src_ioctl(u_long cmd, caddr_t data) { int i; struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is consecutive. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbages (if any) of the prefix address. */ for (i = 0; i < 4; i++) { ent0.addr.sin6_addr.s6_addr32[i] &= ent0.addrmask.sin6_addr.s6_addr32[i]; } ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: compromise compilers */ } /* * The followings are implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX implementation using binary tree should be more efficient. */ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); #ifdef VIMAGE_GLOBALS struct addrsel_policyhead addrsel_policytab; #endif static void init_policy_queue(void) { INIT_VNET_INET6(curvnet); TAILQ_INIT(&V_addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *new, *pol; new = malloc(sizeof(*new), M_IFADDR, M_WAITOK); ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* duplication check */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(new, M_IFADDR); return (EEXIST); /* or override it? 
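 *
 * (Hedged usage note: entries reach this function via ip6addrctl(8)'s
 * SIOCAADDRCTL_POLICY ioctl, and the duplicate check keys on the
 * (addr, addrmask) pair only -- e.g. re-adding ::ffff:0:0/96 with a
 * different label still returns EEXIST, so a delete must come first.)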
*/ } } bzero(new, sizeof(*new)); /* XXX: should validate entry */ new->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (ESRCH); } TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); } } ADDRSEL_SUNLOCK(); return (error); } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct walkarg *w = arg; error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); return (error); } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); } Index: head/sys/netinet6/nd6_nbr.c =================================================================== --- head/sys/netinet6/nd6_nbr.c (revision 185347) +++ head/sys/netinet6/nd6_nbr.c (revision 185348) @@ -1,1504 +1,1504 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_carp.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_stoptimer(struct dadq *); static void nd6_dad_timer(struct ifaddr *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); static void nd6_dad_na_input(struct ifaddr *); #ifdef VIMAGE_GLOBALS int dad_ignore_ns; int dad_maxtry; #endif /* * Input a Neighbor Solicitation Message. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; struct ifaddr *ifa = NULL; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; union nd_opts ndopts; struct sockaddr_dl *proxydl = NULL; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; if (in6_setscope(&taddr6, ifp, NULL) != 0) goto bad; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be a solicited node multicast address. */ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && /* don't check ifindex portion */ daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } - } else if (!nd6_onlink_ns_rfc4861) { + } else if (!V_nd6_onlink_ns_rfc4861) { struct sockaddr_in6 src_sa6; /* * According to recent IETF discussions, it is not a good idea * to accept a NS from an address which would not be deemed * to be a neighbor otherwise. 
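 * (In practice the source must be link-local or covered by an
 * on-link prefix; anything else is rejected below unless the
 * nd6_onlink_ns_rfc4861 knob restores the permissive RFC 4861
 * behavior.)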
This point is expected to be * clarified in future revisions of the specification. */ bzero(&src_sa6, sizeof(src_sa6)); src_sa6.sin6_family = AF_INET6; src_sa6.sin6_len = sizeof(src_sa6); src_sa6.sin6_addr = saddr6; if (!nd6_is_addr_neighbor(&src_sa6, ifp)) { nd6log((LOG_INFO, "nd6_ns_input: " "NS packet from non-neighbor\n")); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); goto bad; } icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ns_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is unicast/anycast MUST NOT add * NS IP dst is solicited-node multicast MUST add * * In implementation, we add target link-layer address by default. * We do not add one in MUST NOT cases. */ if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ #ifdef DEV_CARP if (ifp->if_carp) ifa = carp_iamatch6(ifp->if_carp, &taddr6); if (ifa == NULL) ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); #else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); #endif /* (2) check. */ if (ifa == NULL) { struct rtentry *rt; struct sockaddr_in6 tsin6; int need_proxy; #ifdef RADIX_MPATH struct route_in6 ro; #endif bzero(&tsin6, sizeof tsin6); tsin6.sin6_len = sizeof(struct sockaddr_in6); tsin6.sin6_family = AF_INET6; tsin6.sin6_addr = taddr6; #ifdef RADIX_MPATH bzero(&ro, sizeof(ro)); ro.ro_dst = tsin6; rtalloc_mpath((struct route *)&ro, RTF_ANNOUNCE); rt = ro.ro_rt; #else rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0); #endif need_proxy = (rt && (rt->rt_flags & RTF_ANNOUNCE) != 0 && rt->rt_gateway->sa_family == AF_LINK); if (rt) rtfree(rt); if (need_proxy) { /* * proxy NDP for single entry */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ifa) { proxy = 1; proxydl = SDL(rt->rt_gateway); } } } if (ifa == NULL) { /* * We've got an NS packet, and we don't have that adddress * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &saddr6))); goto freeit; } /* * We have neighbor solicitation packet, with target address equals to * one of my tentative address. * * src addr how to process? 
* --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462. */ if (tentative) { /* * If source address is unspecified address, it is for * duplicate address detection. * * If not, the packet is for addess resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa); goto freeit; } /* * If the source address is unspecified address, entries must not * be created or updated. * It looks that sender is performing DAD. Output NA toward * all-node multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { struct in6_addr in6_all; in6_all = in6addr_linklocal_allnodes; if (in6_setscope(&in6_all, ifp, NULL) != 0) goto bad; nd6_na_output(ifp, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), tlladdr, (struct sockaddr *)proxydl); goto freeit; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, tlladdr, (struct sockaddr *)proxydl); freeit: m_freem(m); return; bad: nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(ip6bufs, &saddr6))); nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(ip6bufs, &daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(ip6bufs, &taddr6))); V_icmp6stat.icp6s_badns++; m_freem(m); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * ln - for source address determination * dad - duplicate address detection */ void nd6_ns_output(struct ifnet *ifp, const struct in6_addr *daddr6, const struct in6_addr *taddr6, struct llinfo_nd6 *ln, int dad) { INIT_VNET_INET6(ifp->if_vnet); struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_addr *src, src_in; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac; struct route_in6 ro; bzero(&ro, sizeof(ro)); if (IN6_IS_ADDR_MULTICAST(taddr6)) return; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_ns_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? 
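 * (Advancing m_data by max_linkhdr reserves room so the link-layer
 * header can be prepended later without another allocation;
 * MH_ALIGN() would instead place the payload at the end of the
 * mbuf.)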
*/ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto bad; } if (!dad) { /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. * Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (saddr6), if: * - saddr6 is given from the caller (by giving "ln"), and * - saddr6 belongs to the outgoing interface. * Otherwise, we perform the source address selection as usual. */ struct ip6_hdr *hip6; /* hold ip6 */ struct in6_addr *hsrc = NULL; if (ln && ln->ln_hold) { /* * assuming every packet in ln_hold has the same IP * header */ hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*hip6) < ln->ln_hold->m_len) hsrc = &hip6->ip6_src; else hsrc = NULL; } if (hsrc && in6ifa_ifpwithaddr(ifp, hsrc)) src = hsrc; else { int error; struct sockaddr_in6 dst_sa; bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, NULL, &error); if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "nd6_ns_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(ip6buf, &dst_sa.sin6_addr), error)); goto bad; } } } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) * We actually don't have to 0-clear the address (we did it * above), but we do so here explicitly to make the intention * clearer. */ bzero(&src_in, sizeof(src_in)); src = &src_in; } ip6->ip6_src = *src; nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (!dad && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); ip6_output(m, NULL, &ro, dad ? 
IPV6_UNSPECSRC : 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; if (ro.ro_rt) { /* we don't cache this route. */ RTFREE(ro.ro_rt); } return; bad: if (ro.ro_rt) { RTFREE(ro.ro_rt); } m_freem(m); return; } /* * Neighbor advertisement input handling. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; int is_router; int is_solicited; int is_override; char *lladdr = NULL; int lladdrlen = 0; struct ifaddr *ifa; struct llinfo_nd6 *ln; struct rtentry *rt; struct sockaddr_dl *sdl; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); taddr6 = nd_na->nd_na_target; if (in6_setscope(&taddr6, ifp, NULL)) goto bad; /* XXX: impossible */ if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, "nd6_na_input: invalid target address %s\n", ip6_sprintf(ip6bufs, &taddr6))); goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, "nd6_na_input: a solicited adv is multicasted\n")); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_na_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { nd6_dad_na_input(ifa); goto freeit; } /* Just for safety, maybe unnecessary. */ if (ifa) { log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &taddr6)); goto freeit; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be * discarded. 
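 * (RFC 2461, 7.2.5: an advertisement for a target we never tried
 * to resolve carries no state worth caching.)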
*/ rt = nd6_lookup(&taddr6, 0, ifp); if ((rt == NULL) || ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) || ((sdl = SDL(rt->rt_gateway)) == NULL)) goto freeit; if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link-layer has address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && lladdr == NULL) goto freeit; /* * Record link-layer address, and update the state. */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (!ND6_LLINFO_PERMANENT(ln)) { nd6_llinfo_settimer(ln, (long)ND_IFINFO(rt->rt_ifp)->reachable * hz); } } else { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } if ((ln->ln_router = is_router) != 0) { /* * This means a router's state has changed from * non-reachable to probably reachable, and might * affect the status of associated prefixes.. */ pfxlist_onlink_check(); } } else { int llchange; /* * Check if the link-layer address has changed or not. */ if (lladdr == NULL) llchange = 0; else { if (sdl->sdl_alen) { if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 1; } /* * This is VERY complex. Look at it with care. * * override solicit lladdr llchange action * (L: record lladdr) * * 0 0 n -- (2c) * 0 0 y n (2b) L * 0 0 y y (1) REACHABLE->STALE * 0 1 n -- (2c) *->REACHABLE * 0 1 y n (2b) L *->REACHABLE * 0 1 y y (1) REACHABLE->STALE * 1 0 n -- (2a) * 1 0 y n (2a) L * 1 0 y y (2a) L *->STALE * 1 1 n -- (2a) *->REACHABLE * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ if (!is_override && (lladdr != NULL && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } goto freeit; } else if (is_override /* (2a) */ || (!is_override && (lladdr != NULL && !llchange)) /* (2b) */ || lladdr == NULL) { /* (2c) */ /* * Update link-local address, if any. */ if (lladdr != NULL) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (!ND6_LLINFO_PERMANENT(ln)) { nd6_llinfo_settimer(ln, (long)ND_IFINFO(ifp)->reachable * hz); } } else { if (lladdr != NULL && llchange) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } } } if (ln->ln_router && !is_router) { /* * The peer dropped the router flag. * Remove the sender from the Default Router List and * update the Destination Cache entries. */ struct nd_defrouter *dr; struct in6_addr *in6; int s; in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; /* * Lock to protect the default router list. * XXX: this might be unnecessary, since this function * is only called under the network software interrupt * context. However, we keep it just for safety. */ s = splnet(); dr = defrouter_lookup(in6, ifp); if (dr) defrtrlist_del(dr); else if (!V_ip6_forwarding) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used * as a next hop for some destinations * (e.g. redirect case). So we must * call rt6_flush explicitly. 
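 * rt6_flush() tears down the cached routes that still name this
 * (former) router as their gateway, forcing a fresh next-hop
 * selection for the affected destinations.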
*/ rt6_flush(&ip6->ip6_src, ifp); } splx(s); } ln->ln_router = is_router; } rt->rt_flags &= ~RTF_REJECT; ln->ln_asked = 0; if (ln->ln_hold) { struct mbuf *m_hold, *m_hold_next; /* * reset the ln_hold in advance, to explicitly * prevent a ln_hold lookup in nd6_output() * (wouldn't happen, though...) */ for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* * we assume ifp is not a loopback here, so just set * the 2nd argument as the 1st one. */ nd6_output(ifp, ifp, m_hold, (struct sockaddr_in6 *)rt_key(rt), rt); } ln->ln_hold = NULL; } freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badna++; m_freem(m); } /* * Neighbor advertisement output handling. * * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) * * tlladdr - 1 if include target link-layer address * sdl0 - sockaddr_dl (= proxy NA) or NULL */ void nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0) { INIT_VNET_INET6(ifp->if_vnet); struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct ip6_moptions im6o; struct in6_addr *src, daddr6; struct sockaddr_in6 dst_sa; int icmp6len, maxlen, error; caddr_t mac = NULL; struct route_in6 ro; bzero(&ro, sizeof(ro)); daddr6 = *daddr6_0; /* make a local copy for modification */ /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_na_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; daddr6.s6_addr16[1] = 0; daddr6.s6_addr32[1] = 0; daddr6.s6_addr32[2] = 0; daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; if (in6_setscope(&daddr6, ifp, NULL)) goto bad; flags &= ~ND_NA_FLAG_SOLICITED; } ip6->ip6_dst = daddr6; bzero(&dst_sa, sizeof(struct sockaddr_in6)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(struct sockaddr_in6); dst_sa.sin6_addr = daddr6; /* * Select a source whose scope is the same as that of the dest. 
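 * For the link-local and solicited-node multicast destinations
 * used by ND this effectively yields a link-local source; the
 * actual choice is delegated to in6_selectsrc() below.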
*/ bcopy(&dst_sa, &ro.ro_dst, sizeof(dst_sa)); src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, NULL, &error); if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "nd6_na_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(ip6buf, &dst_sa.sin6_addr), error)); goto bad; } ip6->ip6_src = *src; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. */ if (sdl0 == NULL) { #ifdef DEV_CARP if (ifp->if_carp) mac = carp_macmatch6(ifp->if_carp, m, taddr6); if (mac == NULL) mac = nd6_ifptomac(ifp); #else mac = nd6_ifptomac(ifp); #endif } else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_short)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); ip6_output(m, NULL, &ro, 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; if (ro.ro_rt) { /* we don't cache this route. */ RTFREE(ro.ro_rt); } return; bad: if (ro.ro_rt) { RTFREE(ro.ro_rt); } m_freem(m); return; } caddr_t nd6_ifptomac(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif #ifdef IFT_CARP case IFT_CARP: #endif case IFT_BRIDGE: case IFT_ISO88025: return IF_LLADDR(ifp); default: return NULL; } } TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; }; #ifdef VIMAGE_GLOBALS static struct dadq_head dadq; int dad_init; #endif static struct dadq * nd6_dad_find(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct dadq *dp; for (dp = V_dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { if (dp->dad_ifa == ifa) return dp; } return NULL; } static void nd6_dad_starttimer(struct dadq *dp, int ticks) { callout_reset(&dp->dad_timer_ch, ticks, (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); } static void nd6_dad_stoptimer(struct dadq *dp) { callout_stop(&dp->dad_timer_ch); } /* * Start Duplicate Address Detection (DAD) for specified interface address. 
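 *
 * While the address is marked IN6_IFF_TENTATIVE we transmit up to
 * ip6_dad_count Neighbor Solicitations, one per 'retrans' interval,
 * each of the form
 *
 *	ip6 src ::, dst ff02::1:ffXX:XXXX (solicited-node of target)
 *	ICMPv6 NS, target = tentative address, no SLLA option
 *
 * Seeing an NS or NA for the target in the meantime marks the
 * address duplicated; otherwise the tentative flag is cleared.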
*/ void nd6_dad_start(struct ifaddr *ifa, int delay) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; if (!V_dad_init) { TAILQ_INIT(&V_dadq); V_dad_init++; } /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled (ip6_dad_count == 0) * - the interface address is anycast */ if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (!V_ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (ifa->ifa_ifp == NULL) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); if (!(ifa->ifa_ifp->if_flags & IFF_UP)) { return; } if (nd6_dad_find(ifa) != NULL) { /* DAD already in progress */ return; } dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); TAILQ_INSERT_TAIL(&V_dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ dp->dad_ifa = ifa; IFAREF(ifa); /* just for safety */ dp->dad_count = V_ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (delay == 0) { nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000); } else { nd6_dad_starttimer(dp, delay); } } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct dadq *dp; if (!V_dad_init) return; dp = nd6_dad_find(ifa); if (!dp) { /* DAD wasn't started yet */ return; } nd6_dad_stoptimer(dp); TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_timer(struct ifaddr *ifa) { CURVNET_SET(dp->dad_vnet); INIT_VNET_INET6(curvnet); int s; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; s = splnet(); /* XXX */ /* Sanity check */ if (ia == NULL) { log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } /* timeouted with IFF_{RUNNING,UP} check */ if (dp->dad_ns_tcount > V_dad_maxtry) { nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); goto done; } /* Need more checks? 
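 * (e.g. whether the interface is still up; for now we rely on
 * nd6_dad_ns_output() below noticing IFF_UP/IFF_DRV_RUNNING being
 * cleared and merely counting the attempt. XXX: also note that the
 * CURVNET_SET(dp->dad_vnet) at the top of this function
 * dereferences dp before it has been looked up.)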
*/ if (dp->dad_ns_ocount < dp->dad_count) { /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000); } else { /* * We have transmitted sufficient number of DAD packets. * See what we've got. */ int duplicate; duplicate = 0; if (dp->dad_na_icount) { /* * the check is in nd6_dad_na_input(), * but just in case */ duplicate++; } if (dp->dad_ns_icount) { /* We've seen NS, means DAD has failed. */ duplicate++; } if (duplicate) { /* (*dp) will be freed in nd6_dad_duplicated() */ dp = NULL; nd6_dad_duplicated(ifa); } else { /* * We are done with DAD. No NA came, no NS came. * No duplicate address found. */ ia->ia6_flags &= ~IN6_IFF_TENTATIVE; nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } } done: splx(s); CURVNET_RESTORE(); } void nd6_dad_duplicated(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_duplicated: DAD structure not found\n"); return; } log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " "NS in/out=%d/%d, NA in=%d\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; /* We are done with DAD, with duplicate address found. (failure) */ nd6_dad_stoptimer(dp); ifp = ifa->ifa_ifp; log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", if_name(ifp)); /* * If the address is a link-local address formed from an interface * identifier based on the hardware address which is supposed to be * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP * operation on the interface SHOULD be disabled. * [rfc2462bis-03 Section 5.4.5] */ if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { struct in6_addr in6; /* * To avoid over-reaction, we only apply this logic when we are * very sure that hardware addresses are supposed to be unique. 
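 * For the interface types below the link-local interface id is the
 * EUI-64 derived from the hardware address (U/L bit inverted, fffe
 * inserted), e.g. MAC 00:11:22:33:44:55 yields the interface id
 * 0211:22ff:fe33:4455. If in6_get_hw_ifid() reproduces the
 * duplicated address, the hardware address itself is probably
 * duplicated on the link.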
*/ switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_ATM: case IFT_IEEE1394: #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif in6 = ia->ia_addr.sin6_addr; if (in6_get_hw_ifid(ifp, &in6) == 0 && IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: possible hardware address " "duplication detected, disable IPv6\n", if_name(ifp)); } break; } } TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { return; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { return; } dp->dad_ns_ocount++; nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1); } static void nd6_dad_ns_input(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct ifnet *ifp; const struct in6_addr *taddr6; struct dadq *dp; int duplicate; if (ifa == NULL) panic("ifa == NULL in nd6_dad_ns_input"); ia = (struct in6_ifaddr *)ifa; ifp = ifa->ifa_ifp; taddr6 = &ia->ia_addr.sin6_addr; duplicate = 0; dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ if (V_dad_ignore_ns) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " "address %s(%s)\n", ip6_sprintf(ip6buf, taddr6), if_name(ifa->ifa_ifp))); return; } /* * if I'm yet to start DAD, someone else started using this address * first. I have a duplicate and you win. */ if (dp == NULL || dp->dad_ns_ocount == 0) duplicate++; /* XXX more checks for loopback situation - see nd6_dad_timer too */ if (duplicate) { dp = NULL; /* will be freed in nd6_dad_duplicated() */ nd6_dad_duplicated(ifa); } else { /* * not sure if I got a duplicate. * increment ns count and see what happens. */ if (dp) dp->dad_ns_icount++; } } static void nd6_dad_na_input(struct ifaddr *ifa) { struct dadq *dp; if (ifa == NULL) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa); if (dp) dp->dad_na_icount++; /* remove the address. */ nd6_dad_duplicated(ifa); } Index: head/sys/netinet6/nd6_rtr.c =================================================================== --- head/sys/netinet6/nd6_rtr.c (revision 185347) +++ head/sys/netinet6/nd6_rtr.c (revision 185348) @@ -1,2114 +1,2114 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SDL(s) ((struct sockaddr_dl *)s) static int rtpref(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static int prelist_update __P((struct nd_prefixctl *, struct nd_defrouter *, struct mbuf *, int)); static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int); static struct nd_pfxrouter *pfxrtr_lookup __P((struct nd_prefix *, struct nd_defrouter *)); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); static struct nd_pfxrouter *find_pfxlist_reachable_router (struct nd_prefix *); static void defrouter_delreq(struct nd_defrouter *); static void nd6_rtmsg(int, struct rtentry *); static int in6_init_prefix_ltimes(struct nd_prefix *); static void in6_init_address_ltimes __P((struct nd_prefix *, struct in6_addrlifetime *)); static int rt6_deleteroute(struct radix_node *, void *); +#ifdef VIMAGE_GLOBALS extern int nd6_recalc_reachtm_interval; -#ifdef VIMAGE_GLOBALS static struct ifnet *nd6_defifp; int nd6_defifindex; int ip6_use_tempaddr; int ip6_desync_factor; u_int32_t ip6_temp_preferred_lifetime; u_int32_t ip6_temp_valid_lifetime; int ip6_temp_regen_advance; #endif /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 #define RTPREF_LOW (-1) #define RTPREF_RESERVED (-2) #define RTPREF_INVALID (-3) /* internal */ /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program * (rtadvd) so here we have no function like nd6_ra_output(). * * Based on RFC 2461 */ void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; char *lladdr = NULL; int lladdrlen = 0; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* If I'm not a router, ignore it. */ if (V_ip6_accept_rtadv != 0 || V_ip6_forwarding != 1) goto freeit; /* Sanity checks */ if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } /* * Don't update the neighbor cache, if src = ::. * This indicates that the src has no IP address assigned yet. 
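 * (Typically a host still performing DAD on all of its addresses;
 * RFC 2461, 6.2.6 tells routers not to create neighbor cache state
 * from such a solicitation.)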
*/ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); if (nd_rs == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_rs_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0); freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badrs++; m_freem(m); } /* * Receive Router Advertisement Message. * * Based on RFC 2461 * TODO: on-link bit on prefix information * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing */ void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = ND_IFINFO(ifp); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; int mcast = 0; union nd_opts ndopts; struct nd_defrouter *dr; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * We only accept RAs only when * the system-wide variable allows the acceptance, and * per-interface variable allows RAs on the receiving interface. */ if (V_ip6_accept_rtadv == 0) goto freeit; if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { nd6log((LOG_ERR, "nd6_ra_input: src %s is not link-local\n", ip6_sprintf(ip6bufs, &saddr6))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); if (nd_ra == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ra_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } { struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; /* remember if this is a multicasted advertisement */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) mcast = 1; bzero(&dr0, sizeof(dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = time_second + dr0.rtlifetime; dr0.ifp = ifp; /* unspecified or not? 
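 * A zero Reachable Time field means the router takes no position;
 * a nonzero value is in milliseconds and re-seeds basereachable,
 * from which a new randomized 'reachable' is computed below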
(RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); if (advreachable <= MAX_REACHABLE_TIME && ndi->basereachable != advreachable) { ndi->basereachable = advreachable; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */ } } if (nd_ra->nd_ra_retransmit) ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); if (nd_ra->nd_ra_curhoplimit) ndi->chlim = nd_ra->nd_ra_curhoplimit; dr = defrtrlist_update(&dr0); } /* * prefix */ if (ndopts.nd_opts_pi) { struct nd_opt_hdr *pt; struct nd_opt_prefix_info *pi = NULL; struct nd_prefixctl pr; for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi; pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end; pt = (struct nd_opt_hdr *)((caddr_t)pt + (pt->nd_opt_len << 3))) { if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION) continue; pi = (struct nd_opt_prefix_info *)pt; if (pi->nd_opt_pi_len != 4) { nd6log((LOG_INFO, "nd6_ra_input: invalid option " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_len)); continue; } if (128 < pi->nd_opt_pi_prefix_len) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_prefix_len)); continue; } if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "%s, ignored\n", ip6_sprintf(ip6bufs, &pi->nd_opt_pi_prefix))); continue; } bzero(&pr, sizeof(pr)); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif; pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_ONLINK) ? 1 : 0; pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_AUTO) ? 1 : 0; pr.ndpr_plen = pi->nd_opt_pi_prefix_len; pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time); pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); (void)prelist_update(&pr, dr, m, mcast); } } /* * MTU */ if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) { u_long mtu; u_long maxmtu; mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); /* lower bound */ if (mtu < IPV6_MMTU) { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " "mtu=%lu sent from %s, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src))); goto skip; } /* upper bound */ maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu) ? ndi->maxmtu : ifp->if_mtu; if (mtu <= maxmtu) { int change = (ndi->linkmtu != mtu); ndi->linkmtu = mtu; if (change) /* in6_maxmtu may change */ in6_setmaxmtu(); } else { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " "mtu=%lu sent from %s; " "exceeds maxmtu %lu, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu)); } } skip: /* * Source link layer address */ { char *lladdr = NULL; int lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_ADVERT, 0); /* * Installing a link-layer address might change the state of the * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. 
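 * pfxlist_onlink_check() re-evaluates, for each known prefix,
 * whether a reachable advertising router exists, and installs or
 * removes the corresponding interface route accordingly.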
*/ pfxlist_onlink_check(); } freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badra++; m_freem(m); } /* * default router list proccessing sub routines */ /* tell the change to user processes watching the routing socket. */ static void nd6_rtmsg(int cmd, struct rtentry *rt) { struct rt_addrinfo info; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = TAILQ_FIRST(&rt->rt_ifp->if_addrlist)->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } rt_missmsg(cmd, &info, rt->rt_flags, 0); } void defrouter_addreq(struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; int s; int error; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; s = splnet(); error = rtrequest(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt); if (newrt) { RT_LOCK(newrt); nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RT_REMREF(newrt); RT_UNLOCK(newrt); } if (error == 0) new->installed = 1; splx(s); return; } struct nd_defrouter * defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) { INIT_VNET_INET6(ifp->if_vnet); struct nd_defrouter *dr; for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) return (dr); } return (NULL); /* search failed */ } /* * Remove the default route for a given router. * This is just a subroutine function for defrouter_select(), and should * not be called from anywhere else. */ static void defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; rtrequest(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt); if (oldrt) { nd6_rtmsg(RTM_DELETE, oldrt); RTFREE(oldrt); } dr->installed = 0; } /* * remove all default routes from default router list */ void defrouter_reset(void) { INIT_VNET_INET6(curvnet); struct nd_defrouter *dr; for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) defrouter_delreq(dr); /* * XXX should we also nuke any default routers in the kernel, by * going through them by rtalloc1()? */ } void defrtrlist_del(struct nd_defrouter *dr) { INIT_VNET_INET6(curvnet); struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; /* * Flush all the routing table entries that use the router * as a next hop. */ if (!V_ip6_forwarding && V_ip6_accept_rtadv) /* XXX: better condition? */ rt6_flush(&dr->rtaddr, dr->ifp); if (dr->installed) { deldr = dr; defrouter_delreq(dr); } TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); /* * Also delete all the pointers to the router in each prefix lists. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { struct nd_pfxrouter *pfxrtr; if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); } pfxlist_onlink_check(); /* * If the router is the primary one, choose a new one. * Note that defrouter_select() will remove the current gateway * from the routing table. 
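 * The entry being deleted has already been unlinked above, so the
 * selection can only pick one of the remaining routers (or none at
 * all, in which case all destinations are assumed on-link).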
*/ if (deldr) defrouter_select(); free(dr, M_IP6NDP); } /* * Default Router Selection according to Section 6.3.6 of RFC 2461 and * draft-ietf-ipngwg-router-selection: * 1) Routers that are reachable or probably reachable should be preferred. * If we have more than one (probably) reachable router, prefer ones * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. * * We assume nd_defrouter is sorted by router preference value. * Since the code below covers both with and without router preference cases, * we do not need to classify the cases by ifdef. * * At this moment, we do not try to install more than one default router, * even when the multipath routing is available, because we're not sure about * the benefits for stub hosts comparing to the risk of making the code * complicated and the possibility of introducing bugs. */ void defrouter_select(void) { INIT_VNET_INET6(curvnet); int s = splnet(); struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; /* * This function should be called only when acting as an autoconfigured * host. Although the remaining part of this function is not effective * if the node is not an autoconfigured host, we explicitly exclude * such cases here for safety. */ if (V_ip6_forwarding || !V_ip6_accept_rtadv) { nd6log((LOG_WARNING, "defrouter_select: called unexpectedly (forwarding=%d, " "accept_rtadv=%d)\n", V_ip6_forwarding, V_ip6_accept_rtadv)); splx(s); return; } /* * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ if (!TAILQ_FIRST(&V_nd_defrouter)) { splx(s); return; } /* * Search for a (probably) reachable router from the list. * We just pick up the first reachable one (if any), assuming that * the ordering rule of the list described in defrtrlist_update(). */ for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (selected_dr == NULL && (rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln)) { selected_dr = dr; } if (dr->installed && installed_dr == NULL) installed_dr = dr; else if (dr->installed && installed_dr) { /* this should not happen. warn for diagnosis. */ log(LOG_ERR, "defrouter_select: more than one router" " is installed\n"); } } /* * If none of the default routers was found to be reachable, * round-robin the list regardless of preference. * Otherwise, if we have an installed router, check if the selected * (reachable) router should really be preferred to the installed one. * We only prefer the new router when the old one is not reachable * or when the new one has a really higher preference value. */ if (selected_dr == NULL) { if (installed_dr == NULL || !TAILQ_NEXT(installed_dr, dr_entry)) selected_dr = TAILQ_FIRST(&V_nd_defrouter); else selected_dr = TAILQ_NEXT(installed_dr, dr_entry); } else if (installed_dr && (rt = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln) && rtpref(selected_dr) <= rtpref(installed_dr)) { selected_dr = installed_dr; } /* * If the selected router is different than the installed one, * remove the installed router and install the selected one. 
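 * The swap is performed as a defrouter_delreq()/defrouter_addreq()
 * pair rather than an in-place change of the default route.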
* Note that the selected router is never NULL here. */ if (installed_dr != selected_dr) { if (installed_dr) defrouter_delreq(installed_dr); defrouter_addreq(selected_dr); } splx(s); return; } /* * for default router selection * regards router-preference field as a 2-bit signed integer */ static int rtpref(struct nd_defrouter *dr) { switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) { case ND_RA_FLAG_RTPREF_HIGH: return (RTPREF_HIGH); case ND_RA_FLAG_RTPREF_MEDIUM: case ND_RA_FLAG_RTPREF_RSV: return (RTPREF_MEDIUM); case ND_RA_FLAG_RTPREF_LOW: return (RTPREF_LOW); default: /* * This case should never happen. If it did, it would mean a * serious bug in the kernel internals. We thus always bark here. * Or, can we even panic? */ log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags); return (RTPREF_INVALID); } /* NOTREACHED */ } static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { INIT_VNET_INET6(curvnet); struct nd_defrouter *dr, *n; int s = splnet(); if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) { /* entry exists */ if (new->rtlifetime == 0) { defrtrlist_del(dr); dr = NULL; } else { int oldpref = rtpref(dr); /* override */ dr->flags = new->flags; /* xxx flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; /* * If the preference does not change, there's no need * to sort the entries. */ if (rtpref(new) == oldpref) { splx(s); return (dr); } /* * The preferred router may have changed, so relocate * this router. * XXX: calling TAILQ_REMOVE directly is bad manners. * However, since defrtrlist_del() has many side * effects, we intentionally do so here. * defrouter_select() below will handle routing * changes later. */ TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); n = dr; goto insert; } splx(s); return (dr); } /* entry does not exist */ if (new->rtlifetime == 0) { splx(s); return (NULL); } n = (struct nd_defrouter *)malloc(sizeof(*n), M_IP6NDP, M_NOWAIT); if (n == NULL) { splx(s); return (NULL); } bzero(n, sizeof(*n)); *n = *new; insert: /* * Insert the new router in the Default Router List; * The Default Router List should be in the descending order * of router preference. Routers with the same preference are * sorted in arrival-time order.
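* For example, if RAs arrive from routers A (high), B (medium), C (high) and D (low) in that order, the resulting list is A, C, B, D: C is placed after A because it arrived later within the high-preference group.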
*/ /* insert at the end of the group */ for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (rtpref(n) > rtpref(dr)) break; } if (dr) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); defrouter_select(); splx(s); return (n); } static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; for (search = pr->ndpr_advrtrs.lh_first; search; search = search->pfr_next) { if (search->router == dr) break; } return (search); } static void pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; new = (struct nd_pfxrouter *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return; bzero(new, sizeof(*new)); new->router = dr; LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); pfxlist_onlink_check(); } static void pfxrtr_del(struct nd_pfxrouter *pfr) { LIST_REMOVE(pfr, pfr_entry); free(pfr, M_IP6NDP); } struct nd_prefix * nd6_prefix_lookup(struct nd_prefixctl *key) { INIT_VNET_INET6(curvnet); struct nd_prefix *search; for (search = V_nd_prefix.lh_first; search; search = search->ndpr_next) { if (key->ndpr_ifp == search->ndpr_ifp && key->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr, &search->ndpr_prefix.sin6_addr, key->ndpr_plen)) { break; } } return (search); } int nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, struct nd_prefix **newp) { INIT_VNET_INET6(curvnet); struct nd_prefix *new = NULL; int error = 0; int i, s; char ip6buf[INET6_ADDRSTRLEN]; new = (struct nd_prefix *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return(ENOMEM); bzero(new, sizeof(*new)); new->ndpr_ifp = pr->ndpr_ifp; new->ndpr_prefix = pr->ndpr_prefix; new->ndpr_plen = pr->ndpr_plen; new->ndpr_vltime = pr->ndpr_vltime; new->ndpr_pltime = pr->ndpr_pltime; new->ndpr_flags = pr->ndpr_flags; if ((error = in6_init_prefix_ltimes(new)) != 0) { free(new, M_IP6NDP); return(error); } new->ndpr_lastupdate = time_second; if (newp != NULL) *newp = new; /* initialization */ LIST_INIT(&new->ndpr_advrtrs); in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ for (i = 0; i < 4; i++) new->ndpr_prefix.sin6_addr.s6_addr32[i] &= new->ndpr_mask.s6_addr32[i]; s = splnet(); /* link ndpr_entry to nd_prefix list */ LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry); splx(s); /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; if ((e = nd6_prefix_onlink(new)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link on %s (errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } } if (dr) pfxrtr_add(new, dr); return 0; } void prelist_remove(struct nd_prefix *pr) { INIT_VNET_INET6(curvnet); struct nd_pfxrouter *pfr, *next; int e, s; char ip6buf[INET6_ADDRSTRLEN]; /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; /* * Though these flags are now meaningless, we'd rather keep the value * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users * when executing "ndp -p". */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 && (e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink " "on %s, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* what should we do? 
*/ } if (pr->ndpr_refcnt > 0) return; /* notice here? */ s = splnet(); /* unlink ndpr_entry from nd_prefix list */ LIST_REMOVE(pr, ndpr_entry); /* free the list of routers that advertised the prefix */ for (pfr = pr->ndpr_advrtrs.lh_first; pfr; pfr = next) { next = pfr->pfr_next; free(pfr, M_IP6NDP); } splx(s); free(pr, M_IP6NDP); pfxlist_onlink_check(); } /* * dr - may be NULL */ static int prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, struct mbuf *m, int mcast) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; struct nd_prefix *pr; int s = splnet(); int error = 0; int newprefix = 0; int auth; struct in6_addrlifetime lt6_tmp; char ip6buf[INET6_ADDRSTRLEN]; auth = 0; if (m) { /* * Authenticity for NA consists of authentication for * both the IP header and the IP datagram, doesn't it? */ #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM) auth = ((m->m_flags & M_AUTHIPHDR) && (m->m_flags & M_AUTHIPDGM)); #endif } if ((pr = nd6_prefix_lookup(new)) != NULL) { /* * nd6_prefix_lookup() ensures that pr and new have the same * prefix on the same interface. */ /* * Update prefix information. Note that the on-link (L) bit * and the autonomous (A) bit should NOT be changed from 1 * to 0. */ if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) pr->ndpr_raf_auto = 1; if (new->ndpr_raf_onlink) { pr->ndpr_vltime = new->ndpr_vltime; pr->ndpr_pltime = new->ndpr_pltime; (void)in6_init_prefix_ltimes(pr); /* XXX error case? */ pr->ndpr_lastupdate = time_second; } if (new->ndpr_raf_onlink && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { int e; if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } } if (dr && pfxrtr_lookup(pr, dr) == NULL) pfxrtr_add(pr, dr); } else { struct nd_prefix *newpr = NULL; newprefix = 1; if (new->ndpr_vltime == 0) goto end; if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0) goto end; error = nd6_prelist_add(new, dr, &newpr); if (error != 0 || newpr == NULL) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s " "errno=%d, returnpr=%p\n", ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), error, newpr)); goto end; /* we should just give up in this case. */ } /* * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ if (newpr->ndpr_raf_onlink == 0) { newpr->ndpr_vltime = 0; newpr->ndpr_pltime = 0; in6_init_prefix_ltimes(newpr); } pr = newpr; } /* * Address autoconfiguration based on Section 5.5.3 of RFC 2462. * Note that pr must be non NULL at this point. */ /* 5.5.3 (a). Ignore the prefix without the A bit set. */ if (!new->ndpr_raf_auto) goto end; /* * 5.5.3 (b). the link-local prefix should have been ignored in * nd6_ra_input. */ /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ if (new->ndpr_pltime > new->ndpr_vltime) { error = EINVAL; /* XXX: won't be used */ goto end; } /* * 5.5.3 (d).
If the prefix advertised is not equal to the prefix of * an address configured by stateless autoconfiguration already in the * list of addresses associated with the interface, and the Valid * Lifetime is not 0, form an address. We first check if we have * a matching prefix. * Note: we apply a clarification in rfc2462bis-02 here. We only * consider autoconfigured addresses while RFC2462 simply said * "address". */ TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { struct in6_ifaddr *ifa6; u_int32_t remaininglifetime; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; /* * We only consider autoconfigured addresses as per rfc2462bis. */ if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) continue; /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) continue; /* * Ignore the address if it is not associated with a prefix * or is associated with a prefix that is different from this * one. (pr is never NULL here) */ if (ifa6->ia6_ndpr != pr) continue; if (ia6_match == NULL) /* remember the first one */ ia6_match = ifa6; /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. * We apply some clarifications in rfc2462bis: * - use remaininglifetime instead of storedlifetime as a * variable name * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) remaininglifetime = ND6_INFINITE_LIFETIME; else if (time_second - ifa6->ia6_updatetime > lt6_tmp.ia6t_vltime) { /* * The case of "invalid" address. We should usually * not see this case. */ remaininglifetime = 0; } else remaininglifetime = lt6_tmp.ia6t_vltime - (time_second - ifa6->ia6_updatetime); /* when not updating, keep the current stored lifetime. */ lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; in6_init_address_ltimes(pr, <6_tmp); /* * We need to treat lifetimes for temporary addresses * differently, according to * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1); * we only update the lifetimes when they are in the maximum * intervals. 
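* For example (illustrative values): with the RFC 3041 default TEMP_VALID_LIFETIME of 604800s (7 days), a temporary address created 172800s (2 days) ago, and a desync factor of 300s, the code below caps lt6_tmp.ia6t_vltime at 604800 - 172800 - 300 = 431700 seconds.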
*/ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { u_int32_t maxvltime, maxpltime; if (V_ip6_temp_valid_lifetime > (u_int32_t)((time_second - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxvltime = V_ip6_temp_valid_lifetime - (time_second - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxvltime = 0; if (V_ip6_temp_preferred_lifetime > (u_int32_t)((time_second - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxpltime = V_ip6_temp_preferred_lifetime - (time_second - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxpltime = 0; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_vltime > maxvltime) { lt6_tmp.ia6t_vltime = maxvltime; } if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_pltime > maxpltime) { lt6_tmp.ia6t_pltime = maxpltime; } } ifa6->ia6_lifetime = lt6_tmp; ifa6->ia6_updatetime = time_second; } if (ia6_match == NULL && new->ndpr_vltime) { int ifidlen; /* * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ /* * Prefix Length check: * If the sum of the prefix length and interface identifier * length does not equal 128 bits, the Prefix Information * option MUST be ignored. The length of the interface * identifier is defined in a separate link-type specific * document. */ ifidlen = in6_if2idlen(ifp); if (ifidlen < 0) { /* this should not happen, so we always log it. */ log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", if_name(ifp)); goto end; } if (ifidlen + pr->ndpr_plen != 128) { nd6log((LOG_INFO, "prelist_update: invalid prefixlen " "%d for %s, ignored\n", pr->ndpr_plen, if_name(ifp))); goto end; } if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. */ pr->ndpr_refcnt++; ia6->ia6_ndpr = pr; /* * RFC 3041 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * * RFC 3041 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary * addresses. Thus, we specify 1 as the 2nd arg of * in6_tmpifadd(). */ if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } /* * A newly added address might affect the status * of other addresses, so we check and update it. * XXX: what if address duplication happens? */ pfxlist_onlink_check(); } else { /* just set an error. do not bark here. */ error = EADDRNOTAVAIL; /* XXX: might be unused. */ } } end: splx(s); return error; } /* * A supplementary function used in the on-link detection below; * detect if a given prefix has a (probably) reachable advertising router. * XXX: lengthy function name... */ static struct nd_pfxrouter * find_pfxlist_reachable_router(struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct rtentry *rt; struct llinfo_nd6 *ln; for (pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); pfxrtr; pfxrtr = LIST_NEXT(pfxrtr, pfr_entry)) { if ((rt = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln)) break; /* found */ } return (pfxrtr); } /* * Check if each prefix in the prefix list has at least one available router * that advertised the prefix (a router is "available" if its neighbor cache * entry is reachable or probably reachable).
* If the check fails, the prefix may be off-link, because, for example, * we have moved from the network but the lifetime of the prefix has not * expired yet. So we should not use the prefix if there is another prefix * that has an available router. * But, if there is no prefix that has an available router, we still regard * all the prefixes as on-link. This is because we can't tell if all the * routers are simply dead or if we really moved from the network and there * is no router around us. */ void pfxlist_onlink_check() { INIT_VNET_INET6(curvnet); struct nd_prefix *pr; struct in6_ifaddr *ifa; struct nd_defrouter *dr; struct nd_pfxrouter *pfxrtr = NULL; /* * Check if there is a prefix that has a reachable advertising * router. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; } /* * If we have no such prefix, check whether we still have a router * that does not advertise any prefixes. */ if (pr == NULL) { for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { struct nd_prefix *pr0; for (pr0 = V_nd_prefix.lh_first; pr0; pr0 = pr0->ndpr_next) { if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) break; } if (pfxrtr != NULL) break; } } if (pr != NULL || (TAILQ_FIRST(&V_nd_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise * any prefixes. The latter would be the case when we move * to a new link where we have a router that does not provide * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { /* XXX: a link-local prefix should never be detached */ if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* * we aren't interested in prefixes without the L bit * set. */ if (pr->ndpr_raf_onlink == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && find_pfxlist_reachable_router(pr) == NULL) pr->ndpr_stateflags |= NDPRF_DETACHED; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && find_pfxlist_reachable_router(pr) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } else { /* there is no prefix that has a reachable router */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; if (pr->ndpr_raf_onlink == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } /* * Remove each interface route associated with a (just) detached * prefix, and reinstall the interface route for a (just) attached * prefix. Note that not every reinstallation attempt necessarily * succeeds when the same prefix is shared among multiple * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them.
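* In the loop below, a prefix marked both NDPRF_DETACHED and NDPRF_ONLINK has its interface route removed via nd6_prefix_offlink(), while an attached prefix (neither flag set) with the L bit set has it reinstalled via nd6_prefix_onlink().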
*/ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { int e; char ip6buf[INET6_ADDRSTRLEN]; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; if (pr->ndpr_raf_onlink == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } } if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && pr->ndpr_raf_onlink) { if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d onlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } } } /* * Changes on the prefix status might affect address status as well. * Make sure that all addresses derived from an attached prefix are * attached, and that all addresses derived from a detached prefix are * detached. Note, however, that a manually configured address should * always be attached. * The precise detection logic is the same as the one for prefixes. */ for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF)) continue; if (ifa->ia6_ndpr == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... */ continue; } if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) break; } if (ifa) { for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ continue; if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) { if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; nd6_dad_start((struct ifaddr *)ifa, 0); } } else { ifa->ia6_flags |= IN6_IFF_DETACHED; } } } else { for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; /* Do we need a delay in this case? */ nd6_dad_start((struct ifaddr *)ifa, 0); } } } } int nd6_prefix_onlink(struct nd_prefix *pr) { INIT_VNET_INET6(curvnet); struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; struct sockaddr_in6 mask6; struct nd_prefix *opr; u_long rtflags; int error = 0; struct rtentry *rt = NULL; char ip6buf[INET6_ADDRSTRLEN]; /* sanity check */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { nd6log((LOG_ERR, "nd6_prefix_onlink: %s/%d is already on-link\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); return (EEXIST); } /* * Add the interface route associated with the prefix. Before * installing the route, check whether the same prefix exists on * another interface and has already installed the interface route. * Although such a configuration is expected to be rare, we explicitly * allow it. */ for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) return (0); } /* * We prefer link-local addresses as the associated interface address.
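* (A link-local address is expected to exist on every IPv6-enabled interface and, unlike autoconfigured global addresses, does not expire with prefix lifetimes, so it is a stable choice here.)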
*/ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY | IN6_IFF_ANYCAST); if (ifa == NULL) { /* XXX: freebsd does not have ifa_ifwithaf */ TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET6) break; } /* should we care about ia6_flags? */ } if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA * containing a prefix with the L bit set and the A bit clear, * after removing all IPv6 addresses on the receiving * interface. This should, of course, be rare though. */ nd6log((LOG_NOTICE, "nd6_prefix_onlink: failed to find any ifaddr" " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); return (0); } /* * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs. * ifa->ifa_rtrequest = nd6_rtrequest; */ bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; rtflags = ifa->ifa_flags | RTF_CLONING | RTF_UP; if (nd6_need_cache(ifp)) { /* explicitly set in case ifa_flags does not set the flag. */ rtflags |= RTF_CLONING; } else { /* * explicitly clear the cloning bit in case ifa_flags sets it. */ rtflags &= ~RTF_CLONING; } error = rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt); if (error == 0) { if (rt != NULL) /* this should be non NULL, though */ nd6_rtmsg(RTM_ADD, rt); pr->ndpr_stateflags |= NDPRF_ONLINK; } else { char ip6bufg[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN]; nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a" " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx " "errno = %d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), ip6_sprintf(ip6bufg, &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr), ip6_sprintf(ip6bufm, &mask6.sin6_addr), rtflags, error)); } if (rt != NULL) { RT_LOCK(rt); RT_REMREF(rt); RT_UNLOCK(rt); } return (error); } int nd6_prefix_offlink(struct nd_prefix *pr) { INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6; struct rtentry *rt = NULL; char ip6buf[INET6_ADDRSTRLEN]; /* sanity check */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: %s/%d is already off-link\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); return (EEXIST); } bzero(&sa6, sizeof(sa6)); sa6.sin6_family = AF_INET6; sa6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr, sizeof(struct in6_addr)); bzero(&mask6, sizeof(mask6)); mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); error = rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, (struct sockaddr *)&mask6, 0, &rt); if (error == 0) { pr->ndpr_stateflags &= ~NDPRF_ONLINK; /* report the route deletion to the routing socket. */ if (rt != NULL) nd6_rtmsg(RTM_DELETE, rt); /* * There might be the same prefix on another interface, * the prefix which could not be on-link just because we have * the interface route (see comments in nd6_prefix_onlink). * If there's one, try to make the prefix on-link on the * interface. */ for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) continue; /* * KAME specific: detached prefixes should not be * on-link. 
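* (Such a prefix is expected to be re-attached by pfxlist_onlink_check() first, which then calls nd6_prefix_onlink() itself.)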
*/ if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { int e; if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " "to %s (errno = %d)\n", ip6_sprintf(ip6buf, &opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } } } } else { /* XXX: can we still set the NDPRF_ONLINK flag? */ nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s (errno = %d)\n", ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), error)); } if (rt != NULL) { RTFREE(rt); } return (error); } static struct in6_ifaddr * in6_ifadd(struct nd_prefixctl *pr, int mcast) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; struct in6_addr mask; int prefixlen = pr->ndpr_plen; int updateflags; char ip6buf[INET6_ADDRSTRLEN]; in6_prefixlen2mask(&mask, prefixlen); /* * find a link-local address (it will be the interface ID). * Is it really mandatory? Theoretically, a global or a site-local * address can be configured without a link-local address, if we * have a unique interface identifier... * * it is not mandatory to have a link-local address, since we could * generate an interface identifier on the fly. we use one anyway because: * (1) it should be the easiest way to find an interface identifier. * (2) RFC2462 5.4 suggests the use of the same interface identifier * for multiple addresses on a single interface, and a possible shortcut * of DAD. we omitted DAD for this reason in the past. * (3) a user can prevent autoconfiguration of a global address * by removing the link-local address by hand (this is partly because we * don't have any other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */ if (ifa) ib = (struct in6_ifaddr *)ifa; else return NULL; /* prefixlen + ifidlen must be equal to 128 */ plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); if (prefixlen != plen0) { nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); return NULL; } /* make ifaddr */ bzero(&ifra, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we set it * accurately for safety.
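* For example (addresses are illustrative), with advertised prefix 2001:db8:1:2::/64 and a link-local interface ID of ::211:22ff:fe33:4455, the masking below yields the autoconfigured address 2001:db8:1:2:211:22ff:fe33:4455.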
*/ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); /* prefix */ ifra.ifra_addr.sin6_addr = pr->ndpr_prefix.sin6_addr; ifra.ifra_addr.sin6_addr.s6_addr32[0] &= mask.s6_addr32[0]; ifra.ifra_addr.sin6_addr.s6_addr32[1] &= mask.s6_addr32[1]; ifra.ifra_addr.sin6_addr.s6_addr32[2] &= mask.s6_addr32[2]; ifra.ifra_addr.sin6_addr.s6_addr32[3] &= mask.s6_addr32[3]; /* interface ID */ ifra.ifra_addr.sin6_addr.s6_addr32[0] |= (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); ifra.ifra_addr.sin6_addr.s6_addr32[1] |= (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); /* new prefix mask. */ ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; bcopy(&mask, &ifra.ifra_prefixmask.sin6_addr, sizeof(ifra.ifra_prefixmask.sin6_addr)); /* lifetimes. */ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ /* * Make sure that we do not have this address already. This should * usually not happen, but we can still see this case, e.g., if we * have manually configured the exact address to be configured. */ if (in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr) != NULL) { /* this should be rare enough to make an explicit log */ log(LOG_INFO, "in6_ifadd: %s is already configured\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); return (NULL); } /* * Allocate ifaddr structure, link into chain, etc. * If we are going to create a new address upon receiving a multicasted * RA, we need to impose a random delay before starting DAD. * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ updateflags = 0; if (mcast) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), if_name(ifp), error)); return (NULL); /* ifaddr must not have been allocated. 
*/ } ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); return (ia); /* this is always non-NULL */ } /* * ia0 - corresponding public address */ int in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia, *ia; struct in6_aliasreq ifra; int i, error; int trylimit = 3; /* XXX: adhoc value */ int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; bzero(&ifra, sizeof(ifra)); strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr = ia0->ia_addr; /* copy prefix mask */ ifra.ifra_prefixmask = ia0->ia_prefixmask; /* clear the old IFID */ for (i = 0; i < 4; i++) { ifra.ifra_addr.sin6_addr.s6_addr32[i] &= ifra.ifra_prefixmask.sin6_addr.s6_addr32[i]; } again: if (in6_get_tmpifid(ifp, (u_int8_t *)randid, (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good " "random IFID\n")); return (EINVAL); } ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* * in6_get_tmpifid() quite likely provided a unique interface ID. * However, we may still see a collision, because * there may be a time lag between generation of the ID and generation * of the address. So, we'll do one more sanity check. */ for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &ifra.ifra_addr.sin6_addr)) { if (trylimit-- == 0) { /* * Give up. Something strange must have * happened. */ nd6log((LOG_NOTICE, "in6_tmpifadd: failed to " "find a unique random IFID\n")); return (EEXIST); } forcegen = 1; goto again; } } /* * The Valid Lifetime is the lower of the Valid Lifetime of the * public address and TEMP_VALID_LIFETIME. * The Preferred Lifetime is the lower of the Preferred Lifetime * of the public address and TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { vltime0 = IFA6_IS_INVALID(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_vltime - (time_second - ia0->ia6_updatetime)); if (vltime0 > V_ip6_temp_valid_lifetime) vltime0 = V_ip6_temp_valid_lifetime; } else vltime0 = V_ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_pltime - (time_second - ia0->ia6_updatetime)); if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){ pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; } } else pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. */ if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance) return (0); /* XXX: scope zone ID? */ ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. */ updateflags = 0; if (delay) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen?
*/ nd6log((LOG_ERR, "in6_tmpifadd: ifa update succeeded, but we got " "no ifaddr\n")); return (EINVAL); /* XXX */ } newia->ia6_ndpr = ia0->ia6_ndpr; newia->ia6_ndpr->ndpr_refcnt++; /* * A newly added address might affect the status of other addresses. * XXX: when the temporary address is generated with a new public * address, the onlink check is redundant. However, it would be safe * to do the check explicitly everywhere a new address is generated, * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ pfxlist_onlink_check(); return (0); } static int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_preferred = 0; else ndpr->ndpr_preferred = time_second + ndpr->ndpr_pltime; if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_expire = 0; else ndpr->ndpr_expire = time_second + ndpr->ndpr_vltime; return 0; } static void in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) { /* init ia6t_expire */ if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = time_second; lt6->ia6t_expire += lt6->ia6t_vltime; } /* init ia6t_preferred */ if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = time_second; lt6->ia6t_preferred += lt6->ia6t_pltime; } } /* * Delete all the routing table entries that use the specified gateway. * XXX: this function causes a search through all entries of the routing * table, so it shouldn't be called when acting as a router. */ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh = V_rt_tables[0][AF_INET6]; int s = splnet(); /* We care only about link-local addresses. */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) { splx(s); return; } RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, rt6_deleteroute, (void *)gateway); RADIX_NODE_HEAD_UNLOCK(rnh); splx(s); } static int rt6_deleteroute(struct radix_node *rn, void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) struct rtentry *rt = (struct rtentry *)rn; struct in6_addr *gate = (struct in6_addr *)arg; if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) return (0); if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) { return (0); } /* * Do not delete a static route. * XXX: this seems to be a bit ad-hoc. Should we consider the * 'cloned' bit instead? */ if ((rt->rt_flags & RTF_STATIC) != 0) return (0); /* * We delete only host routes. This means, in particular, that we * don't delete the default route. */ if ((rt->rt_flags & RTF_HOST) == 0) return (0); return (rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0)); #undef SIN6 } int nd6_setdefaultiface(int ifindex) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); int error = 0; if (ifindex < 0 || V_if_index < ifindex) return (EINVAL); if (ifindex != 0 && !ifnet_byindex(ifindex)) return (EINVAL); if (V_nd6_defifindex != ifindex) { V_nd6_defifindex = ifindex; if (V_nd6_defifindex > 0) V_nd6_defifp = ifnet_byindex(V_nd6_defifindex); else V_nd6_defifp = NULL; /* * Our current implementation assumes one-to-one mapping between * interfaces and links, so it would be natural to use the * default interface as the default link.
*/ scope6_setdefault(V_nd6_defifp); } return (error); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 185347) +++ head/sys/netinet6/raw_ip6.c (revision 185348) @@ -1,836 +1,835 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ +#ifdef VIMAGE_GLOBALS extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; -extern u_long rip_sendspace; -extern u_long rip_recvspace; - -#ifdef VIMAGE_GLOBALS struct rip6stat rip6stat; #endif +extern u_long rip_sendspace; +extern u_long rip_recvspace; + /* * Hooks for multicast forwarding. */ struct socket *ip6_mrouter = NULL; int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(int, caddr_t); /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET(curvnet); INIT_VNET_INET6(curvnet); #ifdef IPSEC INIT_VNET_IPSEC(curvnet); #endif struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; V_rip6stat.rip6s_ipackets++; if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX Send icmp6 host/port unreach? */ m_freem(m); return (IPPROTO_DONE); } init_sin6(&fromsa, m); /* general init */ INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { if ((in6p->in6p_vflag & INP_IPV6) == 0) continue; if (in6p->in6p_ip6_nxt && in6p->in6p_ip6_nxt != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); if (in6p->in6p_cksum != -1) { V_rip6stat.rip6s_isum++; if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { INP_RUNLOCK(in6p); V_rip6stat.rip6s_badsum++; continue; } } if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (n && ipsec6_in_reject(n, last)) { m_freem(n); V_ipsec6stat.in_polvio++; /* Do not inject data into pcb. */ } else #endif /* IPSEC */ if (n) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); V_rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (last && ipsec6_in_reject(m, last)) { m_freem(m); V_ipsec6stat.in_polvio++; V_ip6stat.ip6s_delivered--; /* Do not inject data into pcb. 
*/ INP_RUNLOCK(last); } else #endif /* IPSEC */ if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* Strip intermediate headers. */ m_adj(m, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); V_rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); INP_RUNLOCK(last); } else { V_rip6stat.rip6s_nosock++; if (m->m_flags & M_MCAST) V_rip6stat.rip6s_nosockmcast++; if (proto == IPPROTO_NONE) m_freem(m); else { char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } V_ip6stat.ip6s_delivered--; } return (IPPROTO_DONE); } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { INIT_VNET_INET(curvnet); struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* * If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options user * may have setup with control call. */ int #if __STDC__ rip6_output(struct mbuf *m, ...) #else rip6_output(m, va_alist) struct mbuf *m; va_dcl #endif { INIT_VNET_INET6(curvnet); struct mbuf *control; struct socket *so; struct sockaddr_in6 *dstsock; struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; struct in6_addr *in6a; va_list ap; va_start(ap, m); so = va_arg(ap, struct socket *); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoin6pcb(so); INP_WLOCK(in6p); dst = &dstsock->sin6_addr; if (control) { if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (dstsock->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, V_ip6_use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics. 
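* (The ICMPv6 header may not be contiguous in the first mbuf, hence the m_pullup() below before icmp6_type and icmp6_code are read.)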
*/ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. */ if ((in6a = in6_selectsrc(dstsock, optp, in6p, NULL, so->so_cred, &oifp, &error)) == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto bad; } ip6->ip6_src = *in6a; if (oifp && scope_ambiguous) { /* * The application should provide a proper zone ID, or the use * of default zone IDs should be enabled. Unfortunately, some * applications do not behave as they should, so we need a * workaround. Even if an appropriate ID is not determined * (when it's required), if we can determine the outgoing * interface, determine the zone ID based on that interface. */ error = in6_setscope(&dstsock->sin6_addr, oifp, NULL); if (error != 0) goto bad; } ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so we do not fill it here. */ ip6->ip6_nxt = in6p->in6p_ip6_nxt; ip6->ip6_hlim = in6_selecthlim(in6p, oifp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); V_icmp6stat.icp6s_outhist[type]++; } else V_rip6stat.rip6s_opackets++; goto freectl; bad: if (m) m_freem(m); freectl: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(in6p); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ?
ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in6_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_NET(so->so_vnet); INIT_VNET_INET(so->so_vnet); INIT_VNET_INET6(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ia = NULL; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return (EADDRNOTAVAIL); if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { return (EADDRNOTAVAIL); } INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_NET(so->so_vnet); INIT_VNET_INET(so->so_vnet); INIT_VNET_INET6(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; struct ifnet *ifp = NULL; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); 
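/* Validate the destination sockaddr before scope handling and source selection below. */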
if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family != AF_INET6) return (EAFNOSUPPORT); /* * The application should provide a proper zone ID, or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as they should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp, NULL, so->so_cred, &ifp, &error); if (in6a == NULL) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error ? error : EADDRNOTAVAIL); } /* XXX: see above */ if (ifp && scope_ambiguous && (error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = *in6a; soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { - INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } if (nam->sa_len != sizeof(struct sockaddr_in6)) { m_freem(m); return (EINVAL); } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { m_freem(m); return(EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; Index: head/sys/netipsec/ipsec.c =================================================================== --- head/sys/netipsec/ipsec.c (revision 185347) +++ head/sys/netipsec/ipsec.c (revision 185348) @@ -1,2045 +1,2045 @@ /* $FreeBSD$ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include /*XXX*/ #include #include #include #include #include #include #include #ifdef VIMAGE_GLOBALS /* NB: name changed so netstat doesn't use it */ struct ipsecstat ipsec4stat; struct secpolicy ip4_def_policy; int ipsec_debug; int ip4_ah_offsetmask; int ip4_ipsec_dfbit; int ip4_esp_trans_deflev; int ip4_esp_net_deflev; int ip4_ah_trans_deflev; int ip4_ah_net_deflev; int ip4_ipsec_ecn; int ip4_esp_randpad; /* * Crypto support requirements: * * 1 require hardware support * -1 require software support * 0 take anything */ int crypto_support; #endif /* VIMAGE_GLOBALS */ SYSCTL_DECL(_net_inet_ipsec); /* net.inet.ipsec */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, "IPsec default policy."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, ip4_esp_trans_deflev, 0, "Default ESP transport mode level"); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, ip4_esp_net_deflev, 0, "Default ESP tunnel mode level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, ip4_ah_trans_deflev, 0, "AH transfer mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, ip4_ah_net_deflev, 0, "AH tunnel mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, CTLFLAG_RW, ah_cleartos, 0, "If set clear type-of-service field when doing AH computation."); SYSCTL_V_INT(V_NET, 
vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, CTLFLAG_RW, ip4_ah_offsetmask, 0, "If not set clear offset field mask when doing AH computation."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DFBIT, dfbit, CTLFLAG_RW, ip4_ipsec_dfbit, 0, "Do not fragment bit on encap."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_ECN, ecn, CTLFLAG_RW, ip4_ipsec_ecn, 0, "Explicit Congestion Notification handling."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEBUG, debug, CTLFLAG_RW, ipsec_debug, 0, "Enable IPsec debugging output when set."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, crypto_support, CTLFLAG_RW, crypto_support,0, "Crypto driver selection."); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, ipsecstats, CTLFLAG_RD, ipsec4stat, ipsecstat, "IPsec IPv4 statistics."); #ifdef REGRESSION #ifdef VIMAGE_GLOBALS int ipsec_replay; int ipsec_integrity; #endif /* * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_replay, CTLFLAG_RW, ipsec_replay, 0, "Emulate replay attack"); /* * When set 1, IPsec will send packets with corrupted HMAC. * This allows to verify if the other side properly detects modified packets. */ SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_integrity, CTLFLAG_RW, ipsec_integrity, 0, "Emulate man-in-the-middle attack"); #endif #ifdef INET6 #ifdef VIMAGE_GLOBALS struct ipsecstat ipsec6stat; int ip6_esp_trans_deflev; int ip6_esp_net_deflev; int ip6_ah_trans_deflev; int ip6_ah_net_deflev; int ip6_ipsec_ecn; #endif SYSCTL_DECL(_net_inet6_ipsec6); /* net.inet6.ipsec6 */ #ifdef COMPAT_KAME SYSCTL_OID(_net_inet6_ipsec6, IPSECCTL_STATS, stats, CTLFLAG_RD, 0, 0, compat_ipsecstats_sysctl, "S", "IPsec IPv6 statistics."); #endif /* COMPAT_KAME */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, "IPsec default policy."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, ip6_esp_trans_deflev, 0, "Default ESP transport mode level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, ip6_esp_net_deflev, 0, "Default ESP tunnel mode level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, ip6_ah_trans_deflev, 0, "AH transfer mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, ip6_ah_net_deflev, 0, "AH tunnel mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_ECN, ecn, CTLFLAG_RW, ip6_ipsec_ecn, 0, "Explicit Congestion Notification handling."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_RW, ipsec_debug, 0, "Enable IPsec debugging output when set."); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_STATS, ipsecstats, CTLFLAG_RD, ipsec6stat, ipsecstat, "IPsec IPv6 statistics."); #endif /* INET6 */ static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); #ifdef INET6 static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); #endif static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int 
ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #ifdef INET6 static void ipsec6_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #endif static void ipsec_delpcbpolicy __P((struct inpcbpolicy *)); static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src)); static int ipsec_set_policy __P((struct secpolicy **pcb_sp, int optname, caddr_t request, size_t len, struct ucred *cred)); static int ipsec_get_policy __P((struct secpolicy *pcb_sp, struct mbuf **mp)); static void vshiftl __P((unsigned char *, int, int)); static size_t ipsec_hdrsiz __P((struct secpolicy *)); MALLOC_DEFINE(M_IPSEC_INPCB, "inpcbpolicy", "inpcb-resident ipsec policy"); void ipsec_init(void) { INIT_VNET_IPSEC(curvnet); #ifdef IPSEC_DEBUG V_ipsec_debug = 1; #else V_ipsec_debug = 0; #endif V_ip4_ah_offsetmask = 0; /* maybe IP_DF? */ V_ip4_ipsec_dfbit = 0; /* DF bit on encap. 0: clear 1: set 2: copy */ V_ip4_esp_trans_deflev = IPSEC_LEVEL_USE; V_ip4_esp_net_deflev = IPSEC_LEVEL_USE; V_ip4_ah_trans_deflev = IPSEC_LEVEL_USE; V_ip4_ah_net_deflev = IPSEC_LEVEL_USE; V_ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ V_ip4_esp_randpad = -1; V_crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; #ifdef REGRESSION V_ipsec_replay = 0; V_ipsec_integrity = 0; #endif #ifdef INET6 V_ip6_esp_trans_deflev = IPSEC_LEVEL_USE; V_ip6_esp_net_deflev = IPSEC_LEVEL_USE; V_ip6_ah_trans_deflev = IPSEC_LEVEL_USE; V_ip6_ah_net_deflev = IPSEC_LEVEL_USE; V_ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ #endif } /* * Return a held reference to the default SP. */ static struct secpolicy * key_allocsp_default(const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default from %s:%u\n", where, tag)); sp = &V_ip4_def_policy; if (sp->policy != IPSEC_POLICY_DISCARD && sp->policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", sp->policy, IPSEC_POLICY_NONE)); sp->policy = IPSEC_POLICY_NONE; } key_addref(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default returns SP:%p (%u)\n", sp, sp->refcnt)); return sp; } #define KEY_ALLOCSP_DEFAULT() \ key_allocsp_default(__FILE__, __LINE__) /* * For an OUTBOUND packet having a socket: search the SPD for the packet, * and return a pointer to an SP. * OUT: NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. * others: a pointer to SP * * NOTE: the IPv6 mapped address concern is implemented here. */ struct secpolicy * ipsec_getpolicy(struct tdb_ident *tdbi, u_int dir) { struct secpolicy *sp; IPSEC_ASSERT(tdbi != NULL, ("null tdbi")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = KEY_ALLOCSP2(tdbi->spi, &tdbi->dst, tdbi->proto, dir); if (sp == NULL) /*XXX????*/ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return sp; } /* * For an OUTBOUND packet having a socket: search the SPD for the packet, * and return a pointer to an SP. * OUT: NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. * others: a pointer to SP * * NOTE: the IPv6 mapped address concern is implemented here.
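 *
 * A non-NULL SP is returned with a reference held, so the caller must
 * eventually drop it; assumed calling convention (a sketch, not code
 * from this file):
 *
 *	sp = ipsec_getpolicybysock(m, IPSEC_DIR_OUTBOUND, inp, &error);
 *	if (sp == NULL)
 *		react to *error, one of 0/EACCES/ENOENT listed above;
 *	else
 *		use sp, then KEY_FREESP(&sp) to release the reference;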
*/ struct secpolicy * ipsec_getpolicybysock(m, dir, inp, error) struct mbuf *m; u_int dir; struct inpcb *inp; int *error; { INIT_VNET_IPSEC(curvnet); struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(inp != NULL, ("null inpcb")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); /* set spidx in pcb */ if (inp->inp_vflag & INP_IPV6PROTO) { #ifdef INET6 *error = ipsec6_setspidx_in6pcb(m, inp); pcbsp = inp->in6p_sp; #else *error = EINVAL; /* should not happen */ #endif } else { *error = ipsec4_setspidx_inpcb(m, inp); pcbsp = inp->inp_sp; } if (*error) return NULL; IPSEC_ASSERT(pcbsp != NULL, ("null pcbsp")); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; } IPSEC_ASSERT(currsp != NULL, ("null currsp")); if (pcbsp->priv) { /* privileged socket */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) /* no SP found */ sp = KEY_ALLOCSP_DEFAULT(); break; default: ipseclog((LOG_ERR, "%s: Invalid policy for PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return NULL; } } else { /* unpriv, SPD has policy */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) { /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "%s: Illegal policy for " "non-privileged socket %d\n", __func__, currsp->policy)); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: sp = KEY_ALLOCSP_DEFAULT(); break; case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; default: ipseclog((LOG_ERR, "%s: Invalid policy for " "PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return NULL; } } } IPSEC_ASSERT(sp != NULL, ("null SP (priv %u policy %u", pcbsp->priv, currsp->policy)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s (priv %u policy %u) allocate SP:%p (refcnt %u)\n", __func__, pcbsp->priv, currsp->policy, sp, sp->refcnt)); return sp; } /* * For a FORWARDING packet or an OUTBOUND packet without a socket: search the * SPD for the packet, and return a pointer to an SP. * OUT: positive: a pointer to the entry for the matched security policy leaf. * NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. */ struct secpolicy * ipsec_getpolicybyaddr(m, dir, flag, error) struct mbuf *m; u_int dir; int flag; int *error; { INIT_VNET_IPSEC(curvnet); struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = NULL; if (key_havesp(dir)) { /* Make an index to look for a policy. */ *error = ipsec_setspidx(m, &spidx, (flag & IP_FORWARDING) ?
0 : 1); if (*error != 0) { DPRINTF(("%s: setpidx failed, dir %u flag %u\n", __func__, dir, flag)); return NULL; } spidx.dir = dir; sp = KEY_ALLOCSP(&spidx, dir); } if (sp == NULL) /* no SP found, use system default */ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return sp; } struct secpolicy * ipsec4_checkpolicy(m, dir, flag, error, inp) struct mbuf *m; u_int dir, flag; int *error; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; *error = 0; if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, flag, error); else sp = ipsec_getpolicybysock(m, dir, inp, error); if (sp == NULL) { IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error")); V_ipsec4stat.ips_out_inval++; return NULL; } IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error)); switch (sp->policy) { case IPSEC_POLICY_ENTRUST: default: printf("%s: invalid policy %u\n", __func__, sp->policy); /* fall thru... */ case IPSEC_POLICY_DISCARD: V_ipsec4stat.ips_out_polvio++; *error = -EINVAL; /* packet is discarded by caller */ break; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: KEY_FREESP(&sp); sp = NULL; /* NB: force NULL result */ break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) /* acquire an SA */ *error = key_spdacquire(sp); break; } if (*error != 0) { KEY_FREESP(&sp); sp = NULL; } return sp; } static int ipsec4_setspidx_inpcb(m, pcb) struct mbuf *m; struct inpcb *pcb; { int error; IPSEC_ASSERT(pcb != NULL, ("null pcb")); IPSEC_ASSERT(pcb->inp_sp != NULL, ("null inp_sp")); IPSEC_ASSERT(pcb->inp_sp->sp_out != NULL && pcb->inp_sp->sp_in != NULL, ("null sp_in || sp_out")); error = ipsec_setspidx(m, &pcb->inp_sp->sp_in->spidx, 1); if (error == 0) { pcb->inp_sp->sp_in->spidx.dir = IPSEC_DIR_INBOUND; pcb->inp_sp->sp_out->spidx = pcb->inp_sp->sp_in->spidx; pcb->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND; } else { bzero(&pcb->inp_sp->sp_in->spidx, sizeof (pcb->inp_sp->sp_in->spidx)); bzero(&pcb->inp_sp->sp_out->spidx, sizeof (pcb->inp_sp->sp_in->spidx)); } return error; } #ifdef INET6 static int ipsec6_setspidx_in6pcb(m, pcb) struct mbuf *m; struct in6pcb *pcb; { //INIT_VNET_IPSEC(curvnet); struct secpolicyindex *spidx; int error; IPSEC_ASSERT(pcb != NULL, ("null pcb")); IPSEC_ASSERT(pcb->in6p_sp != NULL, ("null inp_sp")); IPSEC_ASSERT(pcb->in6p_sp->sp_out != NULL && pcb->in6p_sp->sp_in != NULL, ("null sp_in || sp_out")); bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx)); spidx = &pcb->in6p_sp->sp_in->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_INBOUND; spidx = &pcb->in6p_sp->sp_out->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_OUTBOUND; return 0; bad: bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx)); return error; } #endif /* * configure security policy index (src/dst/proto/sport/dport) * by looking at the content of mbuf. * the caller is responsible for error recovery (like clearing up spidx). */ static int ipsec_setspidx(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { INIT_VNET_IPSEC(curvnet); struct ip *ip = NULL; struct ip ipbuf; u_int v; struct mbuf *n; int len; int error; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* * validate m->m_pkthdr.len. we see incorrect length if we * mistakenly call this function with inconsistent mbuf chain * (like 4.4BSD tcp/udp processing). XXX should we panic here? 
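 * The needport argument selects whether transport-level ports are
 * filled in: ipsec_getpolicybyaddr() passes
 *
 *	(flag & IP_FORWARDING) ? 0 : 1
 *
 * so forwarded packets get their ports wildcarded to IPSEC_PORT_ANY,
 * while locally terminated traffic is matched on real port numbers.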
*/ len = 0; for (n = m; n; n = n->m_next) len += n->m_len; if (m->m_pkthdr.len != len) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) mismatch (%d), ignored.\n", __func__, len, m->m_pkthdr.len)); return EINVAL; } if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v4), ignored.\n", __func__, m->m_pkthdr.len)); return EINVAL; } if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } #ifdef _IP_VHL v = _IP_VHL_V(ip->ip_vhl); #else v = ip->ip_v; #endif switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return error; ipsec4_get_ulp(m, spidx, needport); return 0; #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v6), " "ignored\n", __func__, m->m_pkthdr.len)); return EINVAL; } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return error; ipsec6_get_ulp(m, spidx, needport); return 0; #endif default: KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: " "unknown IP version %u, ignored.\n", __func__, v)); return EINVAL; } } static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { u_int8_t nxt; int off; /* sanity check */ IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(m->m_pkthdr.len >= sizeof(struct ip),("packet too short")); /* NB: ip_input() flips it into host endian XXX need more checking */ if (m->m_len < sizeof (struct ip)) { struct ip *ip = mtod(m, struct ip *); if (ip->ip_off & (IP_MF | IP_OFFMASK)) goto done; #ifdef _IP_VHL off = _IP_VHL_HL(ip->ip_vhl) << 2; #else off = ip->ip_hl << 2; #endif nxt = ip->ip_p; } else { struct ip ih; m_copydata(m, 0, sizeof (struct ip), (caddr_t) &ih); if (ih.ip_off & (IP_MF | IP_OFFMASK)) goto done; #ifdef _IP_VHL off = _IP_VHL_HL(ih.ip_vhl) << 2; #else off = ih.ip_hl << 2; #endif nxt = ih.ip_p; } while (off < m->m_pkthdr.len) { struct ip6_ext ip6e; struct tcphdr th; struct udphdr uh; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (th), (caddr_t) &th); spidx->src.sin.sin_port = th.th_sport; spidx->dst.sin.sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (uh), (caddr_t) &uh); spidx->src.sin.sin_port = uh.uh_sport; spidx->dst.sin.sin_port = uh.uh_dport; return; case IPPROTO_AH: if (off + sizeof(ip6e) > m->m_pkthdr.len) goto done; /* XXX sigh, this works but is totally bogus */ m_copydata(m, off, sizeof(ip6e), (caddr_t) &ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: default: /* XXX intermediate headers??? 
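 * For reference, the IPPROTO_AH arm above steps over the header with
 *
 *	off += (ip6e.ip6e_len + 2) << 2;
 *
 * because AH's payload-length field counts 32-bit words minus two, so
 * (len + 2) * 4 is the header size in bytes; other intermediate
 * extension headers are simply not walked by this IPv4 routine.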
*/ spidx->ul_proto = nxt; goto done_proto; } } done: spidx->ul_proto = IPSEC_ULPROTO_ANY; done_proto: spidx->src.sin.sin_port = IPSEC_PORT_ANY; spidx->dst.sin.sin_port = IPSEC_PORT_ANY; } /* assumes that m is sane */ static int ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { static const struct sockaddr_in template = { sizeof (struct sockaddr_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; spidx->src.sin = template; spidx->dst.sin = template; if (m->m_len < sizeof (struct ip)) { m_copydata(m, offsetof(struct ip, ip_src), sizeof (struct in_addr), (caddr_t) &spidx->src.sin.sin_addr); m_copydata(m, offsetof(struct ip, ip_dst), sizeof (struct in_addr), (caddr_t) &spidx->dst.sin.sin_addr); } else { struct ip *ip = mtod(m, struct ip *); spidx->src.sin.sin_addr = ip->ip_src; spidx->dst.sin.sin_addr = ip->ip_dst; } spidx->prefs = sizeof(struct in_addr) << 3; spidx->prefd = sizeof(struct in_addr) << 3; return 0; } #ifdef INET6 static void ipsec6_get_ulp(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { INIT_VNET_IPSEC(curvnet); int off, nxt; struct tcphdr th; struct udphdr uh; struct icmp6_hdr ih; /* sanity check */ if (m == NULL) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(m)); /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: spidx->ul_proto = nxt; if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(ih), (caddr_t)&ih); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = htons((uint16_t)ih.icmp6_type); ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = htons((uint16_t)ih.icmp6_code); break; default: /* XXX intermediate headers??? 
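 * Note that the IPPROTO_ICMPV6 arm above overloads the sin6_port
 * fields: the source "port" carries htons(icmp6_type) and the
 * destination "port" htons(icmp6_code), so e.g. an inbound Echo
 * Request yields a source port of htons(128); everything else stays
 * IPSEC_PORT_ANY.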
*/ spidx->ul_proto = nxt; break; } } /* assumes that m is sane */ static int ipsec6_setspidx_ipaddr(m, spidx) struct mbuf *m; struct secpolicyindex *spidx; { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) ip6 = mtod(m, struct ip6_hdr *); else { m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return 0; } #endif static void ipsec_delpcbpolicy(p) struct inpcbpolicy *p; { free(p, M_IPSEC_INPCB); } /* initialize policy in PCB */ int ipsec_init_policy(so, pcb_sp) struct socket *so; struct inpcbpolicy **pcb_sp; { INIT_VNET_IPSEC(curvnet); struct inpcbpolicy *new; /* sanity check. */ if (so == NULL || pcb_sp == NULL) panic("%s: NULL pointer was passed.\n", __func__); new = (struct inpcbpolicy *) malloc(sizeof(struct inpcbpolicy), M_IPSEC_INPCB, M_NOWAIT|M_ZERO); if (new == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } new->priv = IPSEC_IS_PRIVILEGED_SO(so); if ((new->sp_in = KEY_NEWSP()) == NULL) { ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_in->state = IPSEC_SPSTATE_ALIVE; new->sp_in->policy = IPSEC_POLICY_ENTRUST; if ((new->sp_out = KEY_NEWSP()) == NULL) { KEY_FREESP(&new->sp_in); ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_out->state = IPSEC_SPSTATE_ALIVE; new->sp_out->policy = IPSEC_POLICY_ENTRUST; *pcb_sp = new; return 0; } /* copy old ipsec policy into new */ int ipsec_copy_policy(old, new) struct inpcbpolicy *old, *new; { struct secpolicy *sp; sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { KEY_FREESP(&new->sp_in); new->sp_in = sp; } else return ENOBUFS; sp = ipsec_deepcopy_policy(old->sp_out); if (sp) { KEY_FREESP(&new->sp_out); new->sp_out = sp; } else return ENOBUFS; new->priv = old->priv; return 0; } struct ipsecrequest * ipsec_newisr(void) { struct ipsecrequest *p; p = malloc(sizeof(struct ipsecrequest), M_IPSEC_SR, M_NOWAIT|M_ZERO); if (p != NULL) IPSECREQUEST_LOCK_INIT(p); return p; } void ipsec_delisr(struct ipsecrequest *p) { IPSECREQUEST_LOCK_DESTROY(p); free(p, M_IPSEC_SR); } /* deep-copy a policy in PCB */ static struct secpolicy * ipsec_deepcopy_policy(src) struct secpolicy *src; { struct ipsecrequest *newchain = NULL; struct ipsecrequest *p; struct ipsecrequest **q; struct ipsecrequest *r; struct secpolicy *dst; if (src == NULL) return NULL; dst = KEY_NEWSP(); if (dst == NULL) return NULL; /* * deep-copy IPsec request chain. This is required since struct * ipsecrequest is not reference counted. 
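 * A shallow copy such as
 *
 *	dst->req = src->req;
 *
 * would be unsafe here: ipsec_delisr() frees chain entries outright,
 * so tearing down either policy would leave the other holding
 * dangling ipsecrequest pointers.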
*/ q = &newchain; for (p = src->req; p; p = p->next) { *q = ipsec_newisr(); if (*q == NULL) goto fail; (*q)->saidx.proto = p->saidx.proto; (*q)->saidx.mode = p->saidx.mode; (*q)->level = p->level; (*q)->saidx.reqid = p->saidx.reqid; bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src)); bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst)); (*q)->sp = dst; q = &((*q)->next); } dst->req = newchain; dst->state = src->state; dst->policy = src->policy; /* do not touch the refcnt fields */ return dst; fail: for (p = newchain; p; p = r) { r = p->next; ipsec_delisr(p); p = NULL; } return NULL; } /* set policy and ipsec request if present. */ static int ipsec_set_policy(pcb_sp, optname, request, len, cred) struct secpolicy **pcb_sp; int optname; caddr_t request; size_t len; struct ucred *cred; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; /* sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: passed policy\n", __func__); kdebug_sadb_x_policy((struct sadb_ext *)xpl)); /* check policy type */ /* ipsec_set_policy() accepts IPSEC, ENTRUST and BYPASS. */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return EINVAL; /* check privileged socket */ if (cred != NULL && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { error = priv_check_cred(cred, PRIV_NETINET_IPSEC, 0); if (error) return EACCES; } /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return error; newsp->state = IPSEC_SPSTATE_ALIVE; /* clear old SP and set new SP */ KEY_FREESP(pcb_sp); *pcb_sp = newsp; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: new policy\n", __func__); kdebug_secpolicy(newsp)); return 0; } static int ipsec_get_policy(pcb_sp, mp) struct secpolicy *pcb_sp; struct mbuf **mp; { INIT_VNET_IPSEC(curvnet); /* sanity check. */ if (pcb_sp == NULL || mp == NULL) return EINVAL; *mp = key_sp2msg(pcb_sp); if (!*mp) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } (*mp)->m_type = MT_DATA; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(*mp)); return 0; } int ipsec4_set_policy(inp, optname, request, len, cred) struct inpcb *inp; int optname; caddr_t request; size_t len; struct ucred *cred; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (inp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, cred); } int ipsec4_get_policy(inp, request, len, mp) struct inpcb *inp; caddr_t request; size_t len; struct mbuf **mp; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. 
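 * These per-PCB set/get entry points (ipsec4_set_policy() above and
 * this one) back the IP_IPSEC_POLICY socket option; the matching
 * userland side, sketched with the real libipsec(3) helpers, is:
 *
 *	char *pol = "out ipsec esp/transport//require";
 *	char *buf = ipsec_set_policy(pol, strlen(pol));
 *
 *	setsockopt(s, IPPROTO_IP, IP_IPSEC_POLICY, buf,
 *	    ipsec_get_policylen(buf));
 *
 * where buf is exactly the struct sadb_x_policy message parsed here.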
*/ if (inp == NULL || request == NULL || mp == NULL) return EINVAL; IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp")); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } /* delete policy in PCB */ int ipsec4_delete_pcbpolicy(inp) struct inpcb *inp; { IPSEC_ASSERT(inp != NULL, ("null inp")); if (inp->inp_sp == NULL) return 0; if (inp->inp_sp->sp_in != NULL) KEY_FREESP(&inp->inp_sp->sp_in); if (inp->inp_sp->sp_out != NULL) KEY_FREESP(&inp->inp_sp->sp_out); ipsec_delpcbpolicy(inp->inp_sp); inp->inp_sp = NULL; return 0; } #ifdef INET6 int ipsec6_set_policy(in6p, optname, request, len, cred) struct in6pcb *in6p; int optname; caddr_t request; size_t len; struct ucred *cred; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, cred); } int ipsec6_get_policy(in6p, request, len, mp) struct in6pcb *in6p; caddr_t request; size_t len; struct mbuf **mp; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL || mp == NULL) return EINVAL; IPSEC_ASSERT(in6p->in6p_sp != NULL, ("null in6p_sp")); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } int ipsec6_delete_pcbpolicy(in6p) struct in6pcb *in6p; { IPSEC_ASSERT(in6p != NULL, ("null in6p")); if (in6p->in6p_sp == NULL) return 0; if (in6p->in6p_sp->sp_in != NULL) KEY_FREESP(&in6p->in6p_sp->sp_in); if (in6p->in6p_sp->sp_out != NULL) KEY_FREESP(&in6p->in6p_sp->sp_out); ipsec_delpcbpolicy(in6p->in6p_sp); in6p->in6p_sp = NULL; return 0; } #endif /* * return current level. * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned. */ u_int ipsec_get_reqlevel(isr) struct ipsecrequest *isr; { INIT_VNET_IPSEC(curvnet); u_int level = 0; u_int esp_trans_deflev, esp_net_deflev; u_int ah_trans_deflev, ah_net_deflev; IPSEC_ASSERT(isr != NULL && isr->sp != NULL, ("null argument")); IPSEC_ASSERT(isr->sp->spidx.src.sa.sa_family == isr->sp->spidx.dst.sa.sa_family, ("af family mismatch, src %u, dst %u", isr->sp->spidx.src.sa.sa_family, isr->sp->spidx.dst.sa.sa_family)); /* XXX note that we have ipseclog() expanded here - code sync issue */ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) \ ? (V_ipsec_debug \ ? 
log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ : 0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) /* set default level */ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev); break; #endif /* INET6 */ default: panic("%s: unknown af %u", __func__, isr->sp->spidx.src.sa.sa_family); } #undef IPSEC_CHECK_DEFAULT /* set level */ switch (isr->level) { case IPSEC_LEVEL_DEFAULT: switch (isr->saidx.proto) { case IPPROTO_ESP: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = esp_net_deflev; else level = esp_trans_deflev; break; case IPPROTO_AH: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = ah_net_deflev; else level = ah_trans_deflev; break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets */ level = IPSEC_LEVEL_USE; break; default: panic("%s: Illegal protocol defined %u\n", __func__, isr->saidx.proto); } break; case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: level = isr->level; break; case IPSEC_LEVEL_UNIQUE: level = IPSEC_LEVEL_REQUIRE; break; default: panic("%s: Illegal IPsec level %u\n", __func__, isr->level); } return level; } /* * Check security policy requirements against the actual * packet contents. Return one if the packet should be * reject as "invalid"; otherwiser return zero to have the * packet treated as "valid". * * OUT: * 0: valid * 1: invalid */ int ipsec_in_reject(struct secpolicy *sp, struct mbuf *m) { INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; int need_auth; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: return 1; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); /* XXX should compare policy against ipsec header history */ need_auth = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE) continue; switch (isr->saidx.proto) { case IPPROTO_ESP: if ((m->m_flags & M_DECRYPTED) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP m_flags:%x\n", __func__, m->m_flags)); return 1; } if (!need_auth && isr->sav != NULL && isr->sav->tdb_authalgxform != NULL && (m->m_flags & M_AUTHIPDGM) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP/AH m_flags:%x\n", __func__, m->m_flags)); return 1; } break; case IPPROTO_AH: need_auth = 1; if ((m->m_flags & M_AUTHIPHDR) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: AH m_flags:%x\n", __func__, m->m_flags)); return 1; } break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document * says that we shouldn't compress small * packets, IPComp policy should always be * treated as being in "use" level. */ break; } } return 0; /* valid */ } /* * Check AH/ESP integrity. 
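 * i.e. verify that an inbound packet actually received the AH/ESP
 * treatment its policy requires; ipsec_in_reject() above keys on the
 * M_DECRYPTED/M_AUTHIPHDR/M_AUTHIPDGM mbuf flags set by the input
 * transforms, and a caller typically just drops the offending packet
 * (assumed caller sketch, not code from this file):
 *
 *	if (ipsec4_in_reject(m, inp)) {
 *		m_freem(m);
 *		return;
 *	}
 *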
* This function is called from tcp_input(), udp_input(), * and {ah,esp}4_input for tunnel mode */ int ipsec4_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; int result; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_in_reject(sp, m); if (result) V_ipsec4stat.ips_in_polvio++; KEY_FREESP(&sp); } else { result = 0; /* XXX should be panic ? * -> No, there may be error. */ } return result; } #ifdef INET6 /* * Check AH/ESP integrity. * This function is called from tcp6_input(), udp6_input(), * and {ah,esp}6_input for tunnel mode */ int ipsec6_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = NULL; int error; int result; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_in_reject(sp, m); if (result) V_ipsec6stat.ips_in_polvio++; KEY_FREESP(&sp); } else { result = 0; } return result; } #endif /* * compute the byte size to be occupied by IPsec header. * in case it is tunneled, it includes the size of outer IP header. * NOTE: SP passed is free in this function. */ static size_t ipsec_hdrsiz(struct secpolicy *sp) { INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; size_t siz; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); siz = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { size_t clen = 0; switch (isr->saidx.proto) { case IPPROTO_ESP: clen = esp_hdrsiz(isr->sav); break; case IPPROTO_AH: clen = ah_hdrsiz(isr->sav); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (isr->saidx.dst.sa.sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: ipseclog((LOG_ERR, "%s: unknown AF %d in " "IPsec tunnel SA\n", __func__, ((struct sockaddr *)&isr->saidx.dst)->sa_family)); break; } } siz += clen; } return siz; } /* This function is called from ip_forward() and ipsec4_hdrsize_tcp(). */ size_t ipsec4_hdrsiz(m, dir, inp) struct mbuf *m; u_int dir; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; size_t size; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, dir, inp, &error); if (sp != NULL) { size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: size:%lu.\n", __func__, (unsigned long)size)); KEY_FREESP(&sp); } else { size = 0; /* XXX should be panic ? 
* -> No, we are called w/o knowing if * IPsec processing is needed. */ } return size; } #ifdef INET6 /* This function is called from ipsec6_hdrsize_tcp(), * and maybe from ip6_forward.() */ size_t ipsec6_hdrsiz(m, dir, in6p) struct mbuf *m; u_int dir; struct in6pcb *in6p; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; size_t size; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(in6p == NULL || in6p->in6p_socket != NULL, ("socket w/o inpcb")); /* get SP for this packet */ /* XXX Is it right to call with IP_FORWARDING. */ if (in6p == NULL) sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, dir, in6p, &error); if (sp == NULL) return 0; size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: size:%lu.\n", __func__, (unsigned long)size)); KEY_FREESP(&sp); return size; } #endif /*INET6*/ /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * based on RFC 2401. */ int ipsec_chkreplay(seq, sav) u_int32_t seq; struct secasvar *sav; { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) return 1; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 0; /* first time is always okay */ if (replay->count == 0) return 1; if (seq > replay->lastseq) { /* larger sequences are okay */ return 1; } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 0; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 0; /* out of order but good */ return 1; } } /* * check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(seq, sav) u_int32_t seq; struct secasvar *sav; { INIT_VNET_IPSEC(curvnet); struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) goto ok; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 1; /* first time */ if (replay->count == 0) { replay->lastseq = seq; bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* new larger sequence number */ if (diff < wsizeb) { /* In window */ /* set bit for this packet */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* this packet has a "way larger" */ bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* larger is good */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. 
too old or wrapped */ if (diff >= wsizeb) return 1; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 1; /* mark as seen */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* out of order but good */ } ok: if (replay->count == ~0) { /* set overflow flag */ replay->overflow++; /* don't increment, no more packets accepted */ if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) return 1; ipseclog((LOG_WARNING, "%s: replay counter made %d cycle. %s\n", __func__, replay->overflow, ipsec_logsastr(sav))); } replay->count++; return 0; } /* * shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). */ static void vshiftl(bitmap, nbit, wsize) unsigned char *bitmap; int nbit, wsize; { int s, j, i; unsigned char over; for (j = 0; j < nbit; j += 8) { s = (nbit - j < 8) ? (nbit - j): 8; bitmap[0] <<= s; for (i = 1; i < wsize; i++) { over = (bitmap[i] >> (8 - s)); bitmap[i] <<= s; bitmap[i-1] |= over; } } return; } /* Return a printable string for the IPv4 address. */ static char * inet_ntoa4(struct in_addr ina) { static char buf[4][4 * sizeof "123" + 4]; unsigned char *ucp = (unsigned char *) &ina; static int i = 3; /* XXX-BZ returns static buffer. */ i = (i + 1) % 4; sprintf(buf[i], "%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff, ucp[2] & 0xff, ucp[3] & 0xff); return (buf[i]); } /* Return a printable string for the address. */ char * ipsec_address(union sockaddr_union* sa) { #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif switch (sa->sa.sa_family) { #ifdef INET case AF_INET: return inet_ntoa4(sa->sin.sin_addr); #endif /* INET */ #ifdef INET6 case AF_INET6: return ip6_sprintf(ip6buf, &sa->sin6.sin6_addr); #endif /* INET6 */ default: return "(unknown address family)"; } } const char * ipsec_logsastr(sav) struct secasvar *sav; { static char buf[256]; char *p; struct secasindex *saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->src.sa.sa_family == saidx->dst.sa.sa_family, ("address family mismatch")); p = buf; snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi)); while (p && *p) p++; /* NB: only use ipsec_address on one address at a time */ snprintf(p, sizeof (buf) - (p - buf), "src=%s ", ipsec_address(&saidx->src)); while (p && *p) p++; snprintf(p, sizeof (buf) - (p - buf), "dst=%s)", ipsec_address(&saidx->dst)); return buf; } void ipsec_dumpmbuf(m) struct mbuf *m; { int totlen; int i; u_char *p; totlen = 0; printf("---\n"); while (m) { p = mtod(m, u_char *); for (i = 0; i < m->m_len; i++) { printf("%02x ", p[i]); totlen++; if (totlen % 16 == 0) printf("\n"); } m = m->m_next; } if (totlen % 16 != 0) printf("\n"); printf("---\n"); } static void ipsec_attach(void) { SECPOLICY_LOCK_INIT(&V_ip4_def_policy); - ip4_def_policy.refcnt = 1; /* NB: disallow free */ + V_ip4_def_policy.refcnt = 1; /* NB: disallow free */ } SYSINIT(ipsec, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, ipsec_attach, NULL); /* XXX this stuff doesn't belong here... */ static struct xformsw* xforms = NULL; /* * Register a transform; typically at system startup. */ void xform_register(struct xformsw* xsp) { xsp->xf_next = xforms; xforms = xsp; } /* * Initialize transform support in an sav. 
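 * Registration-side sketch (struct xformsw layout abridged from
 * xform.h; the real descriptors live in xform_{ah,esp,ipcomp}.c): a
 * transform supplies a descriptor and hooks it in at attach time,
 *
 *	static struct xformsw esp_xformsw = {
 *		XF_ESP, XFT_CONF|XFT_AUTH, "IPsec ESP",
 *		esp_init, esp_zeroize, esp_input, esp_output
 *	};
 *	...
 *	xform_register(&esp_xformsw);
 *
 * after which xform_init(sav, XF_ESP) below resolves to it.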
*/ int xform_init(struct secasvar *sav, int xftype) { struct xformsw *xsp; if (sav->tdb_xform != NULL) /* previously initialized */ return 0; for (xsp = xforms; xsp; xsp = xsp->xf_next) if (xsp->xf_type == xftype) return (*xsp->xf_init)(sav, xsp); return EINVAL; } Index: head/sys/netipsec/key.c =================================================================== --- head/sys/netipsec/key.c (revision 185347) +++ head/sys/netipsec/key.c (revision 185348) @@ -1,7466 +1,7468 @@ /* $FreeBSD$ */ /* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This code is referd to RFC 2367 */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif /* INET6 */ #ifdef INET #include #endif #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include /* randomness */ #include #include #define FULLMASK 0xff #define _BITS(bytes) ((bytes) << 3) /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) * following value in reference count field. they cannot be freed and are * referenced from SA header. * - SAs that are in DEAD state will have (total external reference) * in reference count field. they are ready to be freed. reference from * SA header will be removed in key_delsav(), when the reference count * field hits 0 (= no external reference other than from SA header. 
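 */

/*
 * Worked example of the counting rule above (an illustration, not a
 * new invariant): an alive SA with two external holders sits at
 * refcnt == 2 + 1; once the SA transitions to DEAD the SA-header
 * reference no longer counts, so the last KEY_FREESAV() from the
 * holders takes the field to 0 and key_delsav() unlinks and frees it.
 */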
*/ #ifdef VIMAGE_GLOBALS u_int32_t key_debug_level; static u_int key_spi_trycnt; static u_int32_t key_spi_minval; static u_int32_t key_spi_maxval; static u_int32_t policy_id; static u_int key_int_random; static u_int key_larval_lifetime; static int key_blockacq_count; static int key_blockacq_lifetime; static int key_preferred_oldsa; static u_int32_t acq_seq; static int ipsec_esp_keymin; static int ipsec_esp_auth; static int ipsec_ah_keymin; static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */ static LIST_HEAD(_spacqtree, secspacq) spacqtree; /* SP acquiring list */ #endif /* VIMAGE_GLOBALS */ static struct mtx sptree_lock; #define SPTREE_LOCK_INIT() \ mtx_init(&sptree_lock, "sptree", \ "fast ipsec security policy database", MTX_DEF) #define SPTREE_LOCK_DESTROY() mtx_destroy(&sptree_lock) #define SPTREE_LOCK() mtx_lock(&sptree_lock) #define SPTREE_UNLOCK() mtx_unlock(&sptree_lock) #define SPTREE_LOCK_ASSERT() mtx_assert(&sptree_lock, MA_OWNED) static struct mtx sahtree_lock; #define SAHTREE_LOCK_INIT() \ mtx_init(&sahtree_lock, "sahtree", \ "fast ipsec security association database", MTX_DEF) #define SAHTREE_LOCK_DESTROY() mtx_destroy(&sahtree_lock) #define SAHTREE_LOCK() mtx_lock(&sahtree_lock) #define SAHTREE_UNLOCK() mtx_unlock(&sahtree_lock) #define SAHTREE_LOCK_ASSERT() mtx_assert(&sahtree_lock, MA_OWNED) /* registed list */ static struct mtx regtree_lock; #define REGTREE_LOCK_INIT() \ mtx_init(®tree_lock, "regtree", "fast ipsec regtree", MTX_DEF) #define REGTREE_LOCK_DESTROY() mtx_destroy(®tree_lock) #define REGTREE_LOCK() mtx_lock(®tree_lock) #define REGTREE_UNLOCK() mtx_unlock(®tree_lock) #define REGTREE_LOCK_ASSERT() mtx_assert(®tree_lock, MA_OWNED) static struct mtx acq_lock; #define ACQ_LOCK_INIT() \ mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF) #define ACQ_LOCK_DESTROY() mtx_destroy(&acq_lock) #define ACQ_LOCK() mtx_lock(&acq_lock) #define ACQ_UNLOCK() mtx_unlock(&acq_lock) #define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED) static struct mtx spacq_lock; #define SPACQ_LOCK_INIT() \ mtx_init(&spacq_lock, "spacqtree", \ "fast ipsec security policy acquire list", MTX_DEF) #define SPACQ_LOCK_DESTROY() mtx_destroy(&spacq_lock) #define SPACQ_LOCK() mtx_lock(&spacq_lock) #define SPACQ_UNLOCK() mtx_unlock(&spacq_lock) #define SPACQ_LOCK_ASSERT() mtx_assert(&spacq_lock, MA_OWNED) /* search order for SAs */ static const u_int saorder_state_valid_prefer_old[] = { SADB_SASTATE_DYING, SADB_SASTATE_MATURE, }; static const u_int saorder_state_valid_prefer_new[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, }; -static u_int saorder_state_alive[] = { +static const u_int saorder_state_alive[] = { /* except DEAD */ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL }; -static u_int saorder_state_any[] = { +static const u_int saorder_state_any[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD }; static const int minsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_SRC */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_DST */ sizeof(struct 
sadb_address), /* SADB_EXT_ADDRESS_PROXY */ sizeof(struct sadb_key), /* SADB_EXT_KEY_AUTH */ sizeof(struct sadb_key), /* SADB_EXT_KEY_ENCRYPT */ sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_SRC */ sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_DST */ sizeof(struct sadb_sens), /* SADB_EXT_SENSITIVITY */ sizeof(struct sadb_prop), /* SADB_EXT_PROPOSAL */ sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_AUTH */ sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_ENCRYPT */ sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */ 0, /* SADB_X_EXT_KMPRIVATE */ sizeof(struct sadb_x_policy), /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ }; static const int maxsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ 0, /* SADB_EXT_ADDRESS_SRC */ 0, /* SADB_EXT_ADDRESS_DST */ 0, /* SADB_EXT_ADDRESS_PROXY */ 0, /* SADB_EXT_KEY_AUTH */ 0, /* SADB_EXT_KEY_ENCRYPT */ 0, /* SADB_EXT_IDENTITY_SRC */ 0, /* SADB_EXT_IDENTITY_DST */ 0, /* SADB_EXT_SENSITIVITY */ 0, /* SADB_EXT_PROPOSAL */ 0, /* SADB_EXT_SUPPORTED_AUTH */ 0, /* SADB_EXT_SUPPORTED_ENCRYPT */ sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */ 0, /* SADB_X_EXT_KMPRIVATE */ 0, /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ }; #ifdef SYSCTL_DECL SYSCTL_DECL(_net_key); #endif SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW, key_debug_level, 0, ""); /* max count of trial for the decision of spi value */ SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW, key_spi_trycnt, 0, ""); /* minimum spi value to allocate automatically. */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW, key_spi_minval, 0, ""); /* maximun spi value to allocate automatically. 
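 * Together with spi_minval above, this bounds the random SPI search in
 * key_do_getnewspi(); both knobs surface as sysctls, e.g. (assuming
 * the usual net.key attachment of _net_key):
 *
 *	sysctl net.key.spi_minval net.key.spi_maxval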
*/ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW, key_spi_maxval, 0, ""); /* interval to initialize randseed */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW, key_int_random, 0, ""); /* lifetime for larval SA */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW, key_larval_lifetime, 0, ""); /* counter for blocking to send SADB_ACQUIRE to IKEd */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW, key_blockacq_count, 0, ""); /* lifetime for blocking to send SADB_ACQUIRE to IKEd */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, key_blockacq_lifetime, 0, ""); /* ESP auth */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, ipsec_esp_auth, 0, ""); /* minimum ESP key length */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, ipsec_esp_keymin, 0, ""); /* minimum AH key length */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, ipsec_ah_keymin, 0, ""); /* perfered old SA rather than new SA */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa, CTLFLAG_RW, key_preferred_oldsa, 0, ""); #define __LIST_CHAINED(elm) \ (!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL)) #define LIST_INSERT_TAIL(head, elm, type, field) \ do {\ struct type *curelm = LIST_FIRST(head); \ if (curelm == NULL) {\ LIST_INSERT_HEAD(head, elm, field); \ } else { \ while (LIST_NEXT(curelm, field)) \ curelm = LIST_NEXT(curelm, field);\ LIST_INSERT_AFTER(curelm, elm, field);\ }\ } while (0) #define KEY_CHKSASTATE(head, sav, name) \ do { \ if ((head) != (sav)) { \ ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \ (name), (head), (sav))); \ continue; \ } \ } while (0) #define KEY_CHKSPDIR(head, sp, name) \ do { \ if ((head) != (sp)) { \ ipseclog((LOG_DEBUG, "%s: direction mismatched (TREE=%d SP=%d), " \ "anyway continue.\n", \ (name), (head), (sp))); \ } \ } while (0) MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association"); MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head"); MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy"); MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request"); MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous"); MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire"); MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire"); /* * set parameters into secpolicyindex buffer. * Must allocate secpolicyindex buffer passed to this function. */ #define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \ do { \ bzero((idx), sizeof(struct secpolicyindex)); \ (idx)->dir = (_dir); \ (idx)->prefs = (ps); \ (idx)->prefd = (pd); \ (idx)->ul_proto = (ulp); \ bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \ bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \ } while (0) /* * set parameters into secasindex buffer. * Must allocate secasindex buffer before calling this function. 
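 * Typical use (a sketch derived from the macro's parameters):
 *
 *	struct secasindex saidx;
 *
 *	KEY_SETSECASIDX(IPPROTO_ESP, IPSEC_MODE_TUNNEL, reqid,
 *	    src, dst, &saidx);
 *
 * where src and dst point at sockaddrs with valid sa_len fields,
 * since the macro bcopy()s sa_len bytes into the index.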
*/ #define KEY_SETSECASIDX(p, m, r, s, d, idx) \ do { \ bzero((idx), sizeof(struct secasindex)); \ (idx)->proto = (p); \ (idx)->mode = (m); \ (idx)->reqid = (r); \ bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \ bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \ } while (0) /* key statistics */ struct _keystat { u_long getspi_count; /* the avarage of count to try to get new SPI */ } keystat; struct sadb_msghdr { struct sadb_msg *msg; struct sadb_ext *ext[SADB_EXT_MAX + 1]; int extoff[SADB_EXT_MAX + 1]; int extlen[SADB_EXT_MAX + 1]; }; static struct secasvar *key_allocsa_policy __P((const struct secasindex *)); static void key_freesp_so __P((struct secpolicy **)); static struct secasvar *key_do_allocsa_policy __P((struct secashead *, u_int)); static void key_delsp __P((struct secpolicy *)); static struct secpolicy *key_getsp __P((struct secpolicyindex *)); static void _key_delsp(struct secpolicy *sp); static struct secpolicy *key_getspbyid __P((u_int32_t)); static u_int32_t key_newreqid __P((void)); static struct mbuf *key_gather_mbuf __P((struct mbuf *, const struct sadb_msghdr *, int, int, ...)); static int key_spdadd __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static u_int32_t key_getnewspid __P((void)); static int key_spddelete __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spddelete2 __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spdget __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spdflush __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spddump __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static struct mbuf *key_setdumpsp __P((struct secpolicy *, u_int8_t, u_int32_t, u_int32_t)); static u_int key_getspreqmsglen __P((struct secpolicy *)); static int key_spdexpire __P((struct secpolicy *)); static struct secashead *key_newsah __P((struct secasindex *)); static void key_delsah __P((struct secashead *)); static struct secasvar *key_newsav __P((struct mbuf *, const struct sadb_msghdr *, struct secashead *, int *, const char*, int)); #define KEY_NEWSAV(m, sadb, sah, e) \ key_newsav(m, sadb, sah, e, __FILE__, __LINE__) static void key_delsav __P((struct secasvar *)); static struct secashead *key_getsah __P((struct secasindex *)); static struct secasvar *key_checkspidup __P((struct secasindex *, u_int32_t)); static struct secasvar *key_getsavbyspi __P((struct secashead *, u_int32_t)); static int key_setsaval __P((struct secasvar *, struct mbuf *, const struct sadb_msghdr *)); static int key_mature __P((struct secasvar *)); static struct mbuf *key_setdumpsa __P((struct secasvar *, u_int8_t, u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbmsg __P((u_int8_t, u_int16_t, u_int8_t, u_int32_t, pid_t, u_int16_t)); static struct mbuf *key_setsadbsa __P((struct secasvar *)); static struct mbuf *key_setsadbaddr __P((u_int16_t, const struct sockaddr *, u_int8_t, u_int16_t)); static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t, u_int32_t)); static struct seckey *key_dup_keymsg(const struct sadb_key *, u_int, struct malloc_type *); static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type); #ifdef INET6 static int key_ismyaddr6 __P((struct sockaddr_in6 *)); #endif /* flags for key_cmpsaidx() */ #define CMP_HEAD 1 /* protocol, addresses. 
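 * (only the protocol and the source/destination addresses are
 * compared at this level; mode and reqid are ignored)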
*/
#define CMP_MODE_REQID	2	/* additionally HEAD, reqid, mode. */
#define CMP_REQID	3	/* additionally HEAD, reqid. */
#define CMP_EXACTLY	4	/* all elements. */
static int key_cmpsaidx
	__P((const struct secasindex *, const struct secasindex *, int));

static int key_cmpspidx_exactly
	__P((struct secpolicyindex *, struct secpolicyindex *));
static int key_cmpspidx_withmask
	__P((struct secpolicyindex *, struct secpolicyindex *));
static int key_sockaddrcmp __P((const struct sockaddr *, const struct sockaddr *, int));
static int key_bbcmp __P((const void *, const void *, u_int));
static u_int16_t key_satype2proto __P((u_int8_t));
static u_int8_t key_proto2satype __P((u_int16_t));

static int key_getspi __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static u_int32_t key_do_getnewspi __P((struct sadb_spirange *,
	struct secasindex *));
static int key_update __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
#ifdef IPSEC_DOSEQCHECK
static struct secasvar *key_getsavbyseq __P((struct secashead *, u_int32_t));
#endif
static int key_add __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_setident __P((struct secashead *, struct mbuf *,
	const struct sadb_msghdr *));
static struct mbuf *key_getmsgbuf_x1 __P((struct mbuf *,
	const struct sadb_msghdr *));
static int key_delete __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_get __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));

static void key_getcomb_setlifetime __P((struct sadb_comb *));
static struct mbuf *key_getcomb_esp __P((void));
static struct mbuf *key_getcomb_ah __P((void));
static struct mbuf *key_getcomb_ipcomp __P((void));
static struct mbuf *key_getprop __P((const struct secasindex *));

static int key_acquire __P((const struct secasindex *, struct secpolicy *));
static struct secacq *key_newacq __P((const struct secasindex *));
static struct secacq *key_getacq __P((const struct secasindex *));
static struct secacq *key_getacqbyseq __P((u_int32_t));
static struct secspacq *key_newspacq __P((struct secpolicyindex *));
static struct secspacq *key_getspacq __P((struct secpolicyindex *));
static int key_acquire2 __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_register __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_expire __P((struct secasvar *));
static int key_flush __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_dump __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_promisc __P((struct socket *, struct mbuf *,
	const struct sadb_msghdr *));
static int key_senderror __P((struct socket *, struct mbuf *, int));
static int key_validate_ext __P((const struct sadb_ext *, int));
static int key_align __P((struct mbuf *, struct sadb_msghdr *));
static struct mbuf *key_setlifetime(struct seclifetime *src,
				     u_int16_t exttype);
static struct mbuf *key_setkey(struct seckey *src, u_int16_t exttype);
#if 0
static const char *key_getfqdn __P((void));
static const char *key_getuserfqdn __P((void));
#endif
static void key_sa_chgstate __P((struct secasvar *, u_int8_t));
static struct mbuf *key_alloc_mbuf __P((int));

static __inline void
sa_initref(struct secasvar *sav)
{

	refcount_init(&sav->refcnt, 1);
}
static __inline void
sa_addref(struct secasvar *sav)
{

	refcount_acquire(&sav->refcnt);
	IPSEC_ASSERT(sav->refcnt != 0, ("SA refcnt overflow"));
}
static __inline int
sa_delref(struct secasvar *sav)
{
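	/*
	 * Drop one reference.  refcount_release() returns non-zero only
	 * when this call released the last reference, telling the caller
	 * it is now responsible for tearing the SA down.
	 */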
IPSEC_ASSERT(sav->refcnt > 0, ("SA refcnt underflow")); return (refcount_release(&sav->refcnt)); } #define SP_ADDREF(p) do { \ (p)->refcnt++; \ IPSEC_ASSERT((p)->refcnt != 0, ("SP refcnt overflow")); \ } while (0) #define SP_DELREF(p) do { \ IPSEC_ASSERT((p)->refcnt > 0, ("SP refcnt underflow")); \ (p)->refcnt--; \ } while (0) /* * Update the refcnt while holding the SPTREE lock. */ void key_addref(struct secpolicy *sp) { SPTREE_LOCK(); SP_ADDREF(sp); SPTREE_UNLOCK(); } /* * Return 0 when there are known to be no SP's for the specified * direction. Otherwise return 1. This is used by IPsec code * to optimize performance. */ int key_havesp(u_int dir) { INIT_VNET_IPSEC(curvnet); return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ? LIST_FIRST(&V_sptree[dir]) != NULL : 1); } /* %%% IPsec policy management */ /* * allocating a SP for OUTBOUND or INBOUND packet. * Must call key_freesp() later. * OUT: NULL: not found * others: found and return the pointer. */ struct secpolicy * key_allocsp(struct secpolicyindex *spidx, u_int dir, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* get a SP entry */ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); kdebug_secpolicyindex(spidx)); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_withmask(&sp->spidx, spidx)) goto found; } sp = NULL; found: if (sp) { /* sanity check */ KEY_CHKSPDIR(sp->spidx.dir, dir, __func__); /* found a SPD entry */ sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } /* * allocating a SP for OUTBOUND or INBOUND packet. * Must call key_freesp() later. * OUT: NULL: not found * others: found and return the pointer. */ struct secpolicy * key_allocsp2(u_int32_t spi, union sockaddr_union *dst, u_int8_t proto, u_int dir, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(dst != NULL, ("null dst")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* get a SP entry */ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); printf("spi %u proto %u dir %u\n", spi, proto, dir); kdebug_sockaddr(&dst->sa)); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); if (sp->state == IPSEC_SPSTATE_DEAD) continue; /* compare simple values, then dst address */ if (sp->spidx.ul_proto != proto) continue; /* NB: spi's must exist and match */ if (!sp->req || !sp->req->sav || sp->req->sav->spi != spi) continue; if (key_sockaddrcmp(&sp->spidx.dst.sa, &dst->sa, 1) == 0) goto found; } sp = NULL; found: if (sp) { /* sanity check */ KEY_CHKSPDIR(sp->spidx.dir, dir, __func__); /* found a SPD entry */ sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? 
sp->refcnt : 0)); return sp; } /* * return a policy that matches this particular inbound packet. * XXX slow */ struct secpolicy * key_gettunnel(const struct sockaddr *osrc, const struct sockaddr *odst, const struct sockaddr *isrc, const struct sockaddr *idst, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; const int dir = IPSEC_DIR_INBOUND; struct ipsecrequest *r1, *r2, *p; struct secpolicyindex spidx; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); if (isrc->sa_family != idst->sa_family) { ipseclog((LOG_ERR, "%s: protocol family mismatched %d != %d\n.", __func__, isrc->sa_family, idst->sa_family)); sp = NULL; goto done; } SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; r1 = r2 = NULL; for (p = sp->req; p; p = p->next) { if (p->saidx.mode != IPSEC_MODE_TUNNEL) continue; r1 = r2; r2 = p; if (!r1) { /* here we look at address matches only */ spidx = sp->spidx; if (isrc->sa_len > sizeof(spidx.src) || idst->sa_len > sizeof(spidx.dst)) continue; bcopy(isrc, &spidx.src, isrc->sa_len); bcopy(idst, &spidx.dst, idst->sa_len); if (!key_cmpspidx_withmask(&sp->spidx, &spidx)) continue; } else { if (key_sockaddrcmp(&r1->saidx.src.sa, isrc, 0) || key_sockaddrcmp(&r1->saidx.dst.sa, idst, 0)) continue; } if (key_sockaddrcmp(&r2->saidx.src.sa, osrc, 0) || key_sockaddrcmp(&r2->saidx.dst.sa, odst, 0)) continue; goto found; } } sp = NULL; found: if (sp) { sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); done: KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } /* * allocating an SA entry for an *OUTBOUND* packet. * checking each request entries in SP, and acquire an SA if need. * OUT: 0: there are valid requests. * ENOENT: policy may be valid, but SA with REQUIRE is on acquiring. */ int key_checkrequest(struct ipsecrequest *isr, const struct secasindex *saidx) { INIT_VNET_IPSEC(curvnet); u_int level; int error; IPSEC_ASSERT(isr != NULL, ("null isr")); IPSEC_ASSERT(saidx != NULL, ("null saidx")); IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT || saidx->mode == IPSEC_MODE_TUNNEL, ("unexpected policy %u", saidx->mode)); /* * XXX guard against protocol callbacks from the crypto * thread as they reference ipsecrequest.sav which we * temporarily null out below. Need to rethink how we * handle bundled SA's in the callback thread. */ IPSECREQUEST_LOCK_ASSERT(isr); /* get current level */ level = ipsec_get_reqlevel(isr); #if 0 /* * We do allocate new SA only if the state of SA in the holder is * SADB_SASTATE_DEAD. The SA for outbound must be the oldest. */ if (isr->sav != NULL) { if (isr->sav->sah == NULL) panic("%s: sah is null.\n", __func__); if (isr->sav == (struct secasvar *)LIST_FIRST( &isr->sav->sah->savtree[SADB_SASTATE_DEAD])) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } } #else /* * we free any SA stashed in the IPsec request because a different * SA may be involved each time this request is checked, either * because new SAs are being configured, or this request is * associated with an unconnected datagram socket, or this request * is associated with a system default policy. * * The operation may have negative impact to performance. We may * want to check cached SA carefully, rather than picking new SA * every time. */ if (isr->sav != NULL) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } #endif /* * new SA allocation if no SA found. 
* key_allocsa_policy should allocate the oldest SA available.
 * See key_do_allocsa_policy(), and draft-jenkins-ipsec-rekeying-03.txt.
 */
	if (isr->sav == NULL)
		isr->sav = key_allocsa_policy(saidx);

	/* When there is SA. */
	if (isr->sav != NULL) {
		if (isr->sav->state != SADB_SASTATE_MATURE &&
		    isr->sav->state != SADB_SASTATE_DYING)
			return EINVAL;
		return 0;
	}

	/* there is no SA */
	error = key_acquire(saidx, isr->sp);
	if (error != 0) {
		/* XXX What should I do ? */
		ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n",
			__func__, error));
		return error;
	}

	if (level != IPSEC_LEVEL_REQUIRE) {
		/* XXX sigh, the interface to this routine is botched */
		IPSEC_ASSERT(isr->sav == NULL, ("unexpected SA"));
		return 0;
	} else {
		return ENOENT;
	}
}

/*
 * Allocate an SA for a policy entry from the SAD.
 * NOTE: only SAs in a live state are searched.
 * OUT:	NULL	: not found.
 *	others	: found and return the pointer.
 */
static struct secasvar *
key_allocsa_policy(const struct secasindex *saidx)
{
#define	N(a)	_ARRAYLEN(a)
	INIT_VNET_IPSEC(curvnet);
	struct secashead *sah;
	struct secasvar *sav;
	u_int stateidx, arraysize;
	const u_int *state_valid;

	SAHTREE_LOCK();
	LIST_FOREACH(sah, &V_sahtree, chain) {
		if (sah->state == SADB_SASTATE_DEAD)
			continue;
		if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) {
			if (V_key_preferred_oldsa) {
				state_valid = saorder_state_valid_prefer_old;
				arraysize = N(saorder_state_valid_prefer_old);
			} else {
				state_valid = saorder_state_valid_prefer_new;
				arraysize = N(saorder_state_valid_prefer_new);
			}
			SAHTREE_UNLOCK();
			goto found;
		}
	}
	SAHTREE_UNLOCK();

	return NULL;

found:
	/* search valid state */
	for (stateidx = 0; stateidx < arraysize; stateidx++) {
		sav = key_do_allocsa_policy(sah, state_valid[stateidx]);
		if (sav != NULL)
			return sav;
	}

	return NULL;
#undef N
}

/*
 * Search the SAD for a given state; called by key_allocsa_policy().
 * OUT:
 *	NULL	: not found
 *	others	: found, pointer to an SA.
 */
static struct secasvar *
key_do_allocsa_policy(struct secashead *sah, u_int state)
{
	INIT_VNET_IPSEC(curvnet);
	struct secasvar *sav, *nextsav, *candidate, *d;

	/* initialize */
	candidate = NULL;

	SAHTREE_LOCK();
	for (sav = LIST_FIRST(&sah->savtree[state]);
	     sav != NULL;
	     sav = nextsav) {

		nextsav = LIST_NEXT(sav, chain);

		/* sanity check */
		KEY_CHKSASTATE(sav->state, state, __func__);

		/* initialize */
		if (candidate == NULL) {
			candidate = sav;
			continue;
		}

		/* Which SA is better? */
		IPSEC_ASSERT(candidate->lft_c != NULL,
			("null candidate lifetime"));
		IPSEC_ASSERT(sav->lft_c != NULL, ("null sav lifetime"));

		/* What is the best way to compare them? */
		if (V_key_preferred_oldsa) {
			if (candidate->lft_c->addtime >
					sav->lft_c->addtime) {
				candidate = sav;
			}
			continue;
		}

		/* prefer the new SA over the old one */
		if (candidate->lft_c->addtime <
				sav->lft_c->addtime) {
			d = candidate;
			candidate = sav;
		} else
			d = sav;

		/*
		 * Delete the losing SA when there is a more suitable
		 * candidate and its lifetime is not permanent.
		 */
		if (d->lft_h->addtime != 0) {
			struct mbuf *m, *result;
			u_int8_t satype;

			key_sa_chgstate(d, SADB_SASTATE_DEAD);

			IPSEC_ASSERT(d->refcnt > 0, ("bogus ref count"));

			satype = key_proto2satype(d->sah->saidx.proto);
			if (satype == 0)
				goto msgfail;

			m = key_setsadbmsg(SADB_DELETE, 0,
			    satype, 0, 0, d->refcnt - 1);
			if (!m)
				goto msgfail;
			result = m;

			/* set sadb_address for the src of saidx. */
			m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
				&d->sah->saidx.src.sa,
				d->sah->saidx.src.sa.sa_len << 3,
				IPSEC_ULPROTO_ANY);
			if (!m)
				goto msgfail;
			m_cat(result, m);

			/* set sadb_address for the dst of saidx.
*/ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &d->sah->saidx.dst.sa, d->sah->saidx.dst.sa.sa_len << 3, IPSEC_ULPROTO_ANY); if (!m) goto msgfail; m_cat(result, m); /* create SA extension */ m = key_setsadbsa(d); if (!m) goto msgfail; m_cat(result, m); if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto msgfail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); if (key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED)) goto msgfail; msgfail: KEY_FREESAV(&d); } } if (candidate) { sa_addref(candidate); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s cause refcnt++:%d SA:%p\n", __func__, candidate->refcnt, candidate)); } SAHTREE_UNLOCK(); return candidate; } /* * allocating a usable SA entry for a *INBOUND* packet. * Must call key_freesav() later. * OUT: positive: pointer to a usable sav (i.e. MATURE or DYING state). * NULL: not found, or error occured. * * In the comparison, no source address is used--for RFC2401 conformance. * To quote, from section 4.1: * A security association is uniquely identified by a triple consisting * of a Security Parameter Index (SPI), an IP Destination Address, and a * security protocol (AH or ESP) identifier. * Note that, however, we do need to keep source address in IPsec SA. * IKE specification and PF_KEY specification do assume that we * keep source address in IPsec SA. We see a tricky situation here. */ struct secasvar * key_allocsa( union sockaddr_union *dst, u_int proto, u_int32_t spi, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int stateidx, arraysize, state; const u_int *saorder_state_valid; IPSEC_ASSERT(dst != NULL, ("null dst address")); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* * searching SAD. * XXX: to be checked internal IP header somewhere. Also when * IPsec tunnel packet is received. But ESP tunnel mode is * encrypted so we can't check internal IP header. */ SAHTREE_LOCK(); if (V_key_preferred_oldsa) { saorder_state_valid = saorder_state_valid_prefer_old; arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); } else { saorder_state_valid = saorder_state_valid_prefer_new; arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); } LIST_FOREACH(sah, &V_sahtree, chain) { /* search valid state */ for (stateidx = 0; stateidx < arraysize; stateidx++) { state = saorder_state_valid[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ KEY_CHKSASTATE(sav->state, state, __func__); /* do not return entries w/ unusable state */ if (sav->state != SADB_SASTATE_MATURE && sav->state != SADB_SASTATE_DYING) continue; if (proto != sav->sah->saidx.proto) continue; if (spi != sav->spi) continue; #if 0 /* don't check src */ /* check src address */ if (key_sockaddrcmp(&src->sa, &sav->sah->saidx.src.sa, 0) != 0) continue; #endif /* check dst address */ if (key_sockaddrcmp(&dst->sa, &sav->sah->saidx.dst.sa, 0) != 0) continue; sa_addref(sav); goto done; } } } sav = NULL; done: SAHTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SA:%p; refcnt %u\n", __func__, sav, sav ? sav->refcnt : 0)); return sav; } /* * Must be called after calling key_allocsp(). * For both the packet without socket and key_freeso(). 
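 *
 * Drops the reference taken at lookup time; once the refcount reaches
 * zero the SP is unlinked from the SPD and freed, and *spp is cleared.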
*/ void _key_freesp(struct secpolicy **spp, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = *spp; IPSEC_ASSERT(sp != NULL, ("null sp")); SPTREE_LOCK(); SP_DELREF(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SP:%p (ID=%u) from %s:%u; refcnt now %u\n", __func__, sp, sp->id, where, tag, sp->refcnt)); if (sp->refcnt == 0) { *spp = NULL; key_delsp(sp); } SPTREE_UNLOCK(); } /* * Must be called after calling key_allocsp(). * For the packet with socket. */ void key_freeso(struct socket *so) { INIT_VNET_IPSEC(curvnet); IPSEC_ASSERT(so != NULL, ("null so")); switch (so->so_proto->pr_domain->dom_family) { #ifdef INET case PF_INET: { struct inpcb *pcb = sotoinpcb(so); /* Does it have a PCB ? */ if (pcb == NULL) return; key_freesp_so(&pcb->inp_sp->sp_in); key_freesp_so(&pcb->inp_sp->sp_out); } break; #endif #ifdef INET6 case PF_INET6: { #ifdef HAVE_NRL_INPCB struct inpcb *pcb = sotoinpcb(so); /* Does it have a PCB ? */ if (pcb == NULL) return; key_freesp_so(&pcb->inp_sp->sp_in); key_freesp_so(&pcb->inp_sp->sp_out); #else struct in6pcb *pcb = sotoin6pcb(so); /* Does it have a PCB ? */ if (pcb == NULL) return; key_freesp_so(&pcb->in6p_sp->sp_in); key_freesp_so(&pcb->in6p_sp->sp_out); #endif } break; #endif /* INET6 */ default: ipseclog((LOG_DEBUG, "%s: unknown address family=%d.\n", __func__, so->so_proto->pr_domain->dom_family)); return; } } static void key_freesp_so(struct secpolicy **sp) { IPSEC_ASSERT(sp != NULL && *sp != NULL, ("null sp")); if ((*sp)->policy == IPSEC_POLICY_ENTRUST || (*sp)->policy == IPSEC_POLICY_BYPASS) return; IPSEC_ASSERT((*sp)->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", (*sp)->policy)); KEY_FREESP(sp); } /* * Must be called after calling key_allocsa(). * This function is called by key_freesp() to free some SA allocated * for a policy. */ void key_freesav(struct secasvar **psav, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secasvar *sav = *psav; IPSEC_ASSERT(sav != NULL, ("null sav")); if (sa_delref(sav)) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n", __func__, sav, ntohl(sav->spi), where, tag, sav->refcnt)); *psav = NULL; key_delsav(sav); } else { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n", __func__, sav, ntohl(sav->spi), where, tag, sav->refcnt)); } } /* %%% SPD management */ /* * free security policy entry. */ static void key_delsp(struct secpolicy *sp) { struct ipsecrequest *isr, *nextisr; IPSEC_ASSERT(sp != NULL, ("null sp")); SPTREE_LOCK_ASSERT(); sp->state = IPSEC_SPSTATE_DEAD; IPSEC_ASSERT(sp->refcnt == 0, ("SP with references deleted (refcnt %u)", sp->refcnt)); /* remove from SP index */ if (__LIST_CHAINED(sp)) LIST_REMOVE(sp, chain); for (isr = sp->req; isr != NULL; isr = nextisr) { if (isr->sav != NULL) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } nextisr = isr->next; ipsec_delisr(isr); } _key_delsp(sp); } /* * search SPD * OUT: NULL : not found * others : found, pointer to a SP. */ static struct secpolicy * key_getsp(struct secpolicyindex *spidx) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[spidx->dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_exactly(spidx, &sp->spidx)) { SP_ADDREF(sp); break; } } SPTREE_UNLOCK(); return sp; } /* * get SP by index. * OUT: NULL : not found * others : found, pointer to a SP. 
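 *
 * The returned SP holds an extra reference (SP_ADDREF), so the caller
 * must release it with KEY_FREESP() when done.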
*/ static struct secpolicy * key_getspbyid(u_int32_t id) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_INBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { SP_ADDREF(sp); goto done; } } LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_OUTBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { SP_ADDREF(sp); goto done; } } done: SPTREE_UNLOCK(); return sp; } struct secpolicy * key_newsp(const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *newsp = NULL; newsp = (struct secpolicy *) malloc(sizeof(struct secpolicy), M_IPSEC_SP, M_NOWAIT|M_ZERO); if (newsp) { SECPOLICY_LOCK_INIT(newsp); newsp->refcnt = 1; newsp->req = NULL; } KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u return SP:%p\n", __func__, where, tag, newsp)); return newsp; } static void _key_delsp(struct secpolicy *sp) { SECPOLICY_LOCK_DESTROY(sp); free(sp, M_IPSEC_SP); } /* * create secpolicy structure from sadb_x_policy structure. * NOTE: `state', `secpolicyindex' in secpolicy structure are not set, * so must be set properly later. */ struct secpolicy * key_msg2sp(xpl0, len, error) struct sadb_x_policy *xpl0; size_t len; int *error; { INIT_VNET_IPSEC(curvnet); struct secpolicy *newsp; IPSEC_ASSERT(xpl0 != NULL, ("null xpl0")); IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len)); if (len != PFKEY_EXTLEN(xpl0)) { ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__)); *error = EINVAL; return NULL; } if ((newsp = KEY_NEWSP()) == NULL) { *error = ENOBUFS; return NULL; } newsp->spidx.dir = xpl0->sadb_x_policy_dir; newsp->policy = xpl0->sadb_x_policy_type; /* check policy */ switch (xpl0->sadb_x_policy_type) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_NONE: case IPSEC_POLICY_ENTRUST: case IPSEC_POLICY_BYPASS: newsp->req = NULL; break; case IPSEC_POLICY_IPSEC: { int tlen; struct sadb_x_ipsecrequest *xisr; struct ipsecrequest **p_isr = &newsp->req; /* validity check */ if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) { ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0); xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1); while (tlen > 0) { /* length check */ if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr)) { ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest " "length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } /* allocate request buffer */ /* NB: data structure is zero'd */ *p_isr = ipsec_newisr(); if ((*p_isr) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); KEY_FREESP(&newsp); *error = ENOBUFS; return NULL; } /* set values */ switch (xisr->sadb_x_ipsecrequest_proto) { case IPPROTO_ESP: case IPPROTO_AH: case IPPROTO_IPCOMP: break; default: ipseclog((LOG_DEBUG, "%s: invalid proto type=%u\n", __func__, xisr->sadb_x_ipsecrequest_proto)); KEY_FREESP(&newsp); *error = EPROTONOSUPPORT; return NULL; } (*p_isr)->saidx.proto = xisr->sadb_x_ipsecrequest_proto; switch (xisr->sadb_x_ipsecrequest_mode) { case IPSEC_MODE_TRANSPORT: case IPSEC_MODE_TUNNEL: break; case IPSEC_MODE_ANY: default: ipseclog((LOG_DEBUG, "%s: invalid mode=%u\n", __func__, xisr->sadb_x_ipsecrequest_mode)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } (*p_isr)->saidx.mode = xisr->sadb_x_ipsecrequest_mode; switch (xisr->sadb_x_ipsecrequest_level) { case IPSEC_LEVEL_DEFAULT: case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: break; case IPSEC_LEVEL_UNIQUE: /* validity check */ /* * If 
range violation of reqid, kernel will * update it, don't refuse it. */ if (xisr->sadb_x_ipsecrequest_reqid > IPSEC_MANUAL_REQID_MAX) { ipseclog((LOG_DEBUG, "%s: reqid=%d range " "violation, updated by kernel.\n", __func__, xisr->sadb_x_ipsecrequest_reqid)); xisr->sadb_x_ipsecrequest_reqid = 0; } /* allocate new reqid id if reqid is zero. */ if (xisr->sadb_x_ipsecrequest_reqid == 0) { u_int32_t reqid; if ((reqid = key_newreqid()) == 0) { KEY_FREESP(&newsp); *error = ENOBUFS; return NULL; } (*p_isr)->saidx.reqid = reqid; xisr->sadb_x_ipsecrequest_reqid = reqid; } else { /* set it for manual keying. */ (*p_isr)->saidx.reqid = xisr->sadb_x_ipsecrequest_reqid; } break; default: ipseclog((LOG_DEBUG, "%s: invalid level=%u\n", __func__, xisr->sadb_x_ipsecrequest_level)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } (*p_isr)->level = xisr->sadb_x_ipsecrequest_level; /* set IP addresses if there */ if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) { struct sockaddr *paddr; paddr = (struct sockaddr *)(xisr + 1); /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.src)) { ipseclog((LOG_DEBUG, "%s: invalid " "request address length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } bcopy(paddr, &(*p_isr)->saidx.src, paddr->sa_len); paddr = (struct sockaddr *)((caddr_t)paddr + paddr->sa_len); /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.dst)) { ipseclog((LOG_DEBUG, "%s: invalid " "request address length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } bcopy(paddr, &(*p_isr)->saidx.dst, paddr->sa_len); } (*p_isr)->sp = newsp; /* initialization for the next. */ p_isr = &(*p_isr)->next; tlen -= xisr->sadb_x_ipsecrequest_len; /* validity check */ if (tlen < 0) { ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr + xisr->sadb_x_ipsecrequest_len); } } break; default: ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } *error = 0; return newsp; } static u_int32_t key_newreqid() { static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; auto_reqid = (auto_reqid == ~0 ? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1); /* XXX should be unique check */ return auto_reqid; } /* * copy secpolicy struct to sadb_x_policy structure indicated. */ struct mbuf * key_sp2msg(sp) struct secpolicy *sp; { struct sadb_x_policy *xpl; int tlen; caddr_t p; struct mbuf *m; IPSEC_ASSERT(sp != NULL, ("null policy")); tlen = key_getspreqmsglen(sp); m = key_alloc_mbuf(tlen); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } m->m_len = tlen; m->m_next = NULL; xpl = mtod(m, struct sadb_x_policy *); bzero(xpl, tlen); xpl->sadb_x_policy_len = PFKEY_UNIT64(tlen); xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY; xpl->sadb_x_policy_type = sp->policy; xpl->sadb_x_policy_dir = sp->spidx.dir; xpl->sadb_x_policy_id = sp->id; p = (caddr_t)xpl + sizeof(*xpl); /* if is the policy for ipsec ? 
*/ if (sp->policy == IPSEC_POLICY_IPSEC) { struct sadb_x_ipsecrequest *xisr; struct ipsecrequest *isr; for (isr = sp->req; isr != NULL; isr = isr->next) { xisr = (struct sadb_x_ipsecrequest *)p; xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto; xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode; xisr->sadb_x_ipsecrequest_level = isr->level; xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid; p += sizeof(*xisr); bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len); p += isr->saidx.src.sa.sa_len; bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len); p += isr->saidx.src.sa.sa_len; xisr->sadb_x_ipsecrequest_len = PFKEY_ALIGN8(sizeof(*xisr) + isr->saidx.src.sa.sa_len + isr->saidx.dst.sa.sa_len); } } return m; } /* m will not be freed nor modified */ static struct mbuf * #ifdef __STDC__ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, int ndeep, int nitem, ...) #else key_gather_mbuf(m, mhp, ndeep, nitem, va_alist) struct mbuf *m; const struct sadb_msghdr *mhp; int ndeep; int nitem; va_dcl #endif { va_list ap; int idx; int i; struct mbuf *result = NULL, *n; int len; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); va_start(ap, nitem); for (i = 0; i < nitem; i++) { idx = va_arg(ap, int); if (idx < 0 || idx > SADB_EXT_MAX) goto fail; /* don't attempt to pull empty extension */ if (idx == SADB_EXT_RESERVED && mhp->msg == NULL) continue; if (idx != SADB_EXT_RESERVED && (mhp->ext[idx] == NULL || mhp->extlen[idx] == 0)) continue; if (idx == SADB_EXT_RESERVED) { len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len)); MGETHDR(n, M_DONTWAIT, MT_DATA); if (!n) goto fail; n->m_len = len; n->m_next = NULL; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t)); } else if (i < ndeep) { len = mhp->extlen[idx]; n = key_alloc_mbuf(len); if (!n || n->m_next) { /*XXX*/ if (n) m_freem(n); goto fail; } m_copydata(m, mhp->extoff[idx], mhp->extlen[idx], mtod(n, caddr_t)); } else { n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx], M_DONTWAIT); } if (n == NULL) goto fail; if (result) m_cat(result, n); else result = n; } va_end(ap); if ((result->m_flags & M_PKTHDR) != 0) { result->m_pkthdr.len = 0; for (n = result; n; n = n->m_next) result->m_pkthdr.len += n->m_len; } return result; fail: m_freem(result); return NULL; } /* * SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing * add an entry to SP database, when received * * from the user(?). * Adding to SP database, * and send * * to the socket which was send. * * SPDADD set a unique policy entry. * SPDSETIDX like SPDADD without a part of policy requests. * SPDUPDATE replace a unique policy entry. * * m will always be freed. 
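 *
 * The message must carry at least the mandatory extensions checked
 * below, i.e. <base, address(SD), policy>, optionally followed by a
 * hard lifetime extension.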
*/ static int key_spdadd(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0, *xpl; struct sadb_lifetime *lft = NULL; struct secpolicyindex spidx; struct secpolicy *newsp; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(struct sadb_lifetime)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; /* make secindex */ /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &spidx); /* checking the direciton. */ switch (xpl0->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: case IPSEC_DIR_OUTBOUND: break; default: ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__)); mhp->msg->sadb_msg_errno = EINVAL; return 0; } /* check policy */ /* key_spdadd() accepts DISCARD, NONE and IPSEC. */ if (xpl0->sadb_x_policy_type == IPSEC_POLICY_ENTRUST || xpl0->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { ipseclog((LOG_DEBUG, "%s: Invalid policy type.\n", __func__)); return key_senderror(so, m, EINVAL); } /* policy requests are mandatory when action is ipsec. */ if (mhp->msg->sadb_msg_type != SADB_X_SPDSETIDX && xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC && mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) { ipseclog((LOG_DEBUG, "%s: some policy requests part required\n", __func__)); return key_senderror(so, m, EINVAL); } /* * checking there is SP already or not. * SPDUPDATE doesn't depend on whether there is a SP or not. * If the type is either SPDADD or SPDSETIDX AND a SP is found, * then error. 
*/ newsp = key_getsp(&spidx); if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { if (newsp) { newsp->state = IPSEC_SPSTATE_DEAD; KEY_FREESP(&newsp); } } else { if (newsp != NULL) { KEY_FREESP(&newsp); ipseclog((LOG_DEBUG, "%s: a SP entry exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } } /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) { return key_senderror(so, m, error); } if ((newsp->id = key_getnewspid()) == 0) { _key_delsp(newsp); return key_senderror(so, m, ENOBUFS); } /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &newsp->spidx); /* sanity check on addr pair */ if (((struct sockaddr *)(src0 + 1))->sa_family != ((struct sockaddr *)(dst0+ 1))->sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } if (((struct sockaddr *)(src0 + 1))->sa_len != ((struct sockaddr *)(dst0+ 1))->sa_len) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } #if 1 if (newsp->req && newsp->req->saidx.src.sa.sa_family) { struct sockaddr *sa; sa = (struct sockaddr *)(src0 + 1); if (sa->sa_family != newsp->req->saidx.src.sa.sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } } if (newsp->req && newsp->req->saidx.dst.sa.sa_family) { struct sockaddr *sa; sa = (struct sockaddr *)(dst0 + 1); if (sa->sa_family != newsp->req->saidx.dst.sa.sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } } #endif newsp->created = time_second; newsp->lastused = newsp->created; newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0; newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0; newsp->refcnt = 1; /* do not reclaim until I say I do */ newsp->state = IPSEC_SPSTATE_ALIVE; LIST_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, secpolicy, chain); /* delete the entry in spacqtree */ if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { struct secspacq *spacq = key_getspacq(&spidx); if (spacq != NULL) { /* reset counter in order to deletion by timehandler. */ spacq->created = time_second; spacq->count = 0; SPACQ_UNLOCK(); } } { struct mbuf *n, *mpolicy; struct sadb_msg *newmsg; int off; /* create new sadb_msg to reply. */ if (lft) { n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); } else { n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); } if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(*newmsg)) { n = m_pullup(n, sizeof(*newmsg)); if (!n) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); off = 0; mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)), sizeof(*xpl), &off); if (mpolicy == NULL) { /* n is already freed */ return key_senderror(so, m, ENOBUFS); } xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off); if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) { m_freem(n); return key_senderror(so, m, EINVAL); } xpl->sadb_x_policy_id = newsp->id; m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * get new policy id. * OUT: * 0: failure. * others: success. 
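 *
 * Candidate ids are drawn from V_policy_id, which wraps around to 1;
 * each candidate is checked against the SPD and at most
 * key_spi_trycnt attempts are made before giving up.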
*/ static u_int32_t key_getnewspid() { INIT_VNET_IPSEC(curvnet); u_int32_t newid = 0; int count = V_key_spi_trycnt; /* XXX */ struct secpolicy *sp; /* when requesting to allocate spi ranged */ while (count--) { newid = (V_policy_id = (V_policy_id == ~0 ? 1 : V_policy_id + 1)); if ((sp = key_getspbyid(newid)) == NULL) break; KEY_FREESP(&sp); } if (count == 0 || newid == 0) { ipseclog((LOG_DEBUG, "%s: to allocate policy id is failed.\n", __func__)); return 0; } return newid; } /* * SADB_SPDDELETE processing * receive * * from the user(?), and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spddelete(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0; struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(so != NULL, ("null so")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; /* make secindex */ /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &spidx); /* checking the direciton. */ switch (xpl0->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: case IPSEC_DIR_OUTBOUND: break; default: ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__)); return key_senderror(so, m, EINVAL); } /* Is there SP in SPD ? */ if ((sp = key_getsp(&spidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__)); return key_senderror(so, m, EINVAL); } /* save policy id to buffer to be returned. */ xpl0->sadb_x_policy_id = sp->id; sp->state = IPSEC_SPSTATE_DEAD; KEY_FREESP(&sp); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_SPDDELETE2 processing * receive * * from the user(?), and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. 
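 *
 * Unlike SADB_SPDDELETE above, which identifies the SP by its policy
 * index, this variant looks the entry up by its unique policy id.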
*/ static int key_spddelete2(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); u_int32_t id; struct secpolicy *sp; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id)); return key_senderror(so, m, EINVAL); } sp->state = IPSEC_SPSTATE_DEAD; KEY_FREESP(&sp); { struct mbuf *n, *nn; struct sadb_msg *newmsg; int off, len; /* create new sadb_msg to reply. */ len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); MGETHDR(n, M_DONTWAIT, MT_DATA); if (n && len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)", off, len)); n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY], mhp->extlen[SADB_X_EXT_POLICY], M_DONTWAIT); if (!n->m_next) { m_freem(n); return key_senderror(so, m, ENOBUFS); } n->m_pkthdr.len = 0; for (nn = n; nn; nn = nn->m_next) n->m_pkthdr.len += nn->m_len; newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_X_GET processing * receive * * from the user(?), * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spdget(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); u_int32_t id; struct secpolicy *sp; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id)); return key_senderror(so, m, ENOENT); } n = key_setdumpsp(sp, SADB_X_SPDGET, 0, mhp->msg->sadb_msg_pid); if (n != NULL) { m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } else return key_senderror(so, m, ENOBUFS); } /* * SADB_X_SPDACQUIRE processing. * Acquire policy and SA(s) for a *OUTBOUND* packet. * send * * to KMD, and expect to receive * with SADB_X_SPDACQUIRE if error occured, * or * * with SADB_X_SPDUPDATE from KMD by PF_KEY. * policy(*) is without policy requests. 
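 *
 * Repeated acquires for the same policy index are throttled through a
 * secspacq entry: the request is re-sent only after key_blockacq_count
 * suppressed attempts.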
*
 *	0	: succeed
 *	others	: error number
 */
int
key_spdacquire(sp)
	struct secpolicy *sp;
{
	INIT_VNET_IPSEC(curvnet);
	struct mbuf *result = NULL, *m;
	struct secspacq *newspacq;

	IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
	IPSEC_ASSERT(sp->req == NULL, ("policy exists"));
	IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
		("policy not IPSEC %u", sp->policy));

	/* Get an entry to check whether a message has been sent already. */
	newspacq = key_getspacq(&sp->spidx);
	if (newspacq != NULL) {
		if (V_key_blockacq_count < newspacq->count) {
			/* reset counter and send the message. */
			newspacq->count = 0;
		} else {
			/* increment counter and do nothing. */
			newspacq->count++;
			return 0;
		}
		SPACQ_UNLOCK();
	} else {
		/* make a new entry to block sending SADB_ACQUIRE. */
		newspacq = key_newspacq(&sp->spidx);
		if (newspacq == NULL)
			return ENOBUFS;
	}

	/* create new sadb_msg to reply. */
	m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0);
	if (!m)
		return ENOBUFS;

	result = m;
	result->m_pkthdr.len = 0;
	for (m = result; m; m = m->m_next)
		result->m_pkthdr.len += m->m_len;

	mtod(result, struct sadb_msg *)->sadb_msg_len =
	    PFKEY_UNIT64(result->m_pkthdr.len);

	return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
}

/*
 * SADB_SPDFLUSH processing
 * receive
 *   <base>
 * from the user, and free all entries in secpctree.
 * and send,
 *   <base>
 * to the user.
 * NOTE: what to do is only marking IPSEC_SPSTATE_DEAD.
 *
 * m will always be freed.
 */
static int
key_spdflush(so, m, mhp)
	struct socket *so;
	struct mbuf *m;
	const struct sadb_msghdr *mhp;
{
	INIT_VNET_IPSEC(curvnet);
	struct sadb_msg *newmsg;
	struct secpolicy *sp;
	u_int dir;

	IPSEC_ASSERT(so != NULL, ("null socket"));
	IPSEC_ASSERT(m != NULL, ("null mbuf"));
	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));

	if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg)))
		return key_senderror(so, m, EINVAL);

	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
		SPTREE_LOCK();
		LIST_FOREACH(sp, &V_sptree[dir], chain)
			sp->state = IPSEC_SPSTATE_DEAD;
		SPTREE_UNLOCK();
	}

	if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
		return key_senderror(so, m, ENOBUFS);
	}

	if (m->m_next)
		m_freem(m->m_next);
	m->m_next = NULL;
	m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
	newmsg = mtod(m, struct sadb_msg *);
	newmsg->sadb_msg_errno = 0;
	newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);

	return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}

/*
 * SADB_SPDDUMP processing
 * receive
 *   <base>
 * from the user, and dump all SP leaves
 * and send,
 *   <base> .....
 * to the ikmpd.
 *
 * m will always be freed.
 */
static int
key_spddump(so, m, mhp)
	struct socket *so;
	struct mbuf *m;
	const struct sadb_msghdr *mhp;
{
	INIT_VNET_IPSEC(curvnet);
	struct secpolicy *sp;
	int cnt;
	u_int dir;
	struct mbuf *n;

	IPSEC_ASSERT(so != NULL, ("null socket"));
	IPSEC_ASSERT(m != NULL, ("null mbuf"));
	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));

	/* First pass: count the SPD entries to dump.
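 * The dump is two-pass: entries are counted first, then one
 * SADB_X_SPDDUMP message per SP is pushed up, with a sequence number
 * counting down to zero.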
*/ cnt = 0; for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { LIST_FOREACH(sp, &V_sptree[dir], chain) { cnt++; } } if (cnt == 0) return key_senderror(so, m, ENOENT); for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { LIST_FOREACH(sp, &V_sptree[dir], chain) { --cnt; n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt, mhp->msg->sadb_msg_pid); if (n) key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } m_freem(m); return 0; } static struct mbuf * key_setdumpsp(sp, type, seq, pid) struct secpolicy *sp; u_int8_t type; u_int32_t seq, pid; { struct mbuf *result = NULL, *m; struct seclifetime lt; m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt); if (!m) goto fail; result = m; m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sp->spidx.src.sa, sp->spidx.prefs, sp->spidx.ul_proto); if (!m) goto fail; m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sp->spidx.dst.sa, sp->spidx.prefd, sp->spidx.ul_proto); if (!m) goto fail; m_cat(result, m); m = key_sp2msg(sp); if (!m) goto fail; m_cat(result, m); if(sp->lifetime){ lt.addtime=sp->created; lt.usetime= sp->lastused; m = key_setlifetime(<, SADB_EXT_LIFETIME_CURRENT); if (!m) goto fail; m_cat(result, m); lt.addtime=sp->lifetime; lt.usetime= sp->validtime; m = key_setlifetime(<, SADB_EXT_LIFETIME_HARD); if (!m) goto fail; m_cat(result, m); } if ((result->m_flags & M_PKTHDR) == 0) goto fail; if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto fail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return result; fail: m_freem(result); return NULL; } /* * get PFKEY message length for security policy and request. */ static u_int key_getspreqmsglen(sp) struct secpolicy *sp; { u_int tlen; tlen = sizeof(struct sadb_x_policy); /* if is the policy for ipsec ? */ if (sp->policy != IPSEC_POLICY_IPSEC) return tlen; /* get length of ipsec requests */ { struct ipsecrequest *isr; int len; for (isr = sp->req; isr != NULL; isr = isr->next) { len = sizeof(struct sadb_x_ipsecrequest) + isr->saidx.src.sa.sa_len + isr->saidx.dst.sa.sa_len; tlen += PFKEY_ALIGN8(len); } } return tlen; } /* * SADB_SPDEXPIRE processing * send * * to KMD by PF_KEY. * * OUT: 0 : succeed * others : error number */ static int key_spdexpire(sp) struct secpolicy *sp; { struct mbuf *result = NULL, *m; int len; int error = -1; struct sadb_lifetime *lt; /* XXX: Why do we lock ? 
*/
	IPSEC_ASSERT(sp != NULL, ("null secpolicy"));

	/* set msg header */
	m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0);
	if (!m) {
		error = ENOBUFS;
		goto fail;
	}
	result = m;

	/* create lifetime extension (current and hard) */
	len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
	m = key_alloc_mbuf(len);
	if (!m || m->m_next) {	/*XXX*/
		if (m)
			m_freem(m);
		error = ENOBUFS;
		goto fail;
	}
	bzero(mtod(m, caddr_t), len);
	lt = mtod(m, struct sadb_lifetime *);
	lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
	lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
	lt->sadb_lifetime_allocations = 0;
	lt->sadb_lifetime_bytes = 0;
	lt->sadb_lifetime_addtime = sp->created;
	lt->sadb_lifetime_usetime = sp->lastused;
	lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
	lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
	lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
	lt->sadb_lifetime_allocations = 0;
	lt->sadb_lifetime_bytes = 0;
	lt->sadb_lifetime_addtime = sp->lifetime;
	lt->sadb_lifetime_usetime = sp->validtime;
	m_cat(result, m);

	/* set sadb_address for source */
	m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
	    &sp->spidx.src.sa,
	    sp->spidx.prefs, sp->spidx.ul_proto);
	if (!m) {
		error = ENOBUFS;
		goto fail;
	}
	m_cat(result, m);

	/* set sadb_address for destination */
	m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
	    &sp->spidx.dst.sa,
	    sp->spidx.prefd, sp->spidx.ul_proto);
	if (!m) {
		error = ENOBUFS;
		goto fail;
	}
	m_cat(result, m);

	/* set secpolicy */
	m = key_sp2msg(sp);
	if (!m) {
		error = ENOBUFS;
		goto fail;
	}
	m_cat(result, m);

	if ((result->m_flags & M_PKTHDR) == 0) {
		error = EINVAL;
		goto fail;
	}

	if (result->m_len < sizeof(struct sadb_msg)) {
		result = m_pullup(result, sizeof(struct sadb_msg));
		if (result == NULL) {
			error = ENOBUFS;
			goto fail;
		}
	}

	result->m_pkthdr.len = 0;
	for (m = result; m; m = m->m_next)
		result->m_pkthdr.len += m->m_len;

	mtod(result, struct sadb_msg *)->sadb_msg_len =
	    PFKEY_UNIT64(result->m_pkthdr.len);

	return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);

 fail:
	if (result)
		m_freem(result);
	return error;
}

/* %%% SAD management */
/*
 * Allocate memory for a new SA head and copy the values from mhp into it.
 * OUT:	NULL	: failure due to the lack of memory.
 *	others	: pointer to the new SA head.
 */
static struct secashead *
key_newsah(saidx)
	struct secasindex *saidx;
{
	INIT_VNET_IPSEC(curvnet);
	struct secashead *newsah;

	IPSEC_ASSERT(saidx != NULL, ("null saidx"));

	newsah = malloc(sizeof(struct secashead), M_IPSEC_SAH, M_NOWAIT|M_ZERO);
	if (newsah != NULL) {
		int i;
		for (i = 0; i < sizeof(newsah->savtree)/sizeof(newsah->savtree[0]); i++)
			LIST_INIT(&newsah->savtree[i]);
		newsah->saidx = *saidx;

		/* add to saidxtree */
		newsah->state = SADB_SASTATE_MATURE;
		SAHTREE_LOCK();
		LIST_INSERT_HEAD(&V_sahtree, newsah, chain);
		SAHTREE_UNLOCK();
	}
	return(newsah);
}

/*
 * Delete the SA index and all SAs registered under it.
 */
static void
key_delsah(sah)
	struct secashead *sah;
{
	INIT_VNET_IPSEC(curvnet);
	struct secasvar *sav, *nextsav;
	u_int stateidx;
	int zombie = 0;

	IPSEC_ASSERT(sah != NULL, ("NULL sah"));
	SAHTREE_LOCK_ASSERT();

	/* search all SAs registered under this secindex.
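 * SAs still holding references are skipped and counted as zombies;
 * the SA header itself is unlinked and freed only when no zombie
 * remains.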
*/ for (stateidx = 0; - stateidx < _ARRAYLEN(V_saorder_state_any); + stateidx < _ARRAYLEN(saorder_state_any); stateidx++) { - u_int state = V_saorder_state_any[stateidx]; + u_int state = saorder_state_any[stateidx]; LIST_FOREACH_SAFE(sav, &sah->savtree[state], chain, nextsav) { if (sav->refcnt == 0) { /* sanity check */ KEY_CHKSASTATE(state, sav->state, __func__); KEY_FREESAV(&sav); } else { /* give up to delete this sa */ zombie++; } } } if (!zombie) { /* delete only if there are savs */ /* remove from tree of SA index */ if (__LIST_CHAINED(sah)) LIST_REMOVE(sah, chain); if (sah->sa_route.ro_rt) { RTFREE(sah->sa_route.ro_rt); sah->sa_route.ro_rt = (struct rtentry *)NULL; } free(sah, M_IPSEC_SAH); } } /* * allocating a new SA with LARVAL state. key_add() and key_getspi() call, * and copy the values of mhp into new buffer. * When SAD message type is GETSPI: * to set sequence number from acq_seq++, * to set zero to SPI. * not to call key_setsava(). * OUT: NULL : fail * others : pointer to new secasvar. * * does not modify mbuf. does not free mbuf on error. */ static struct secasvar * key_newsav(m, mhp, sah, errp, where, tag) struct mbuf *m; const struct sadb_msghdr *mhp; struct secashead *sah; int *errp; const char* where; int tag; { INIT_VNET_IPSEC(curvnet); struct secasvar *newsav; const struct sadb_sa *xsa; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); IPSEC_ASSERT(sah != NULL, ("null secashead")); newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT|M_ZERO); if (newsav == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); *errp = ENOBUFS; goto done; } switch (mhp->msg->sadb_msg_type) { case SADB_GETSPI: newsav->spi = 0; #ifdef IPSEC_DOSEQCHECK /* sync sequence number */ if (mhp->msg->sadb_msg_seq == 0) newsav->seq = (V_acq_seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq)); else #endif newsav->seq = mhp->msg->sadb_msg_seq; break; case SADB_ADD: /* sanity check */ if (mhp->ext[SADB_EXT_SA] == NULL) { free(newsav, M_IPSEC_SA); newsav = NULL; ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); *errp = EINVAL; goto done; } xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; newsav->spi = xsa->sadb_sa_spi; newsav->seq = mhp->msg->sadb_msg_seq; break; default: free(newsav, M_IPSEC_SA); newsav = NULL; *errp = EINVAL; goto done; } /* copy sav values */ if (mhp->msg->sadb_msg_type != SADB_GETSPI) { *errp = key_setsaval(newsav, m, mhp); if (*errp) { free(newsav, M_IPSEC_SA); newsav = NULL; goto done; } } SECASVAR_LOCK_INIT(newsav); /* reset created */ newsav->created = time_second; newsav->pid = mhp->msg->sadb_msg_pid; /* add to satree */ newsav->sah = sah; sa_initref(newsav); newsav->state = SADB_SASTATE_LARVAL; /* XXX locking??? */ LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav, secasvar, chain); done: KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u return SP:%p\n", __func__, where, tag, newsav)); return newsav; } /* * free() SA variable entry. */ static void key_cleansav(struct secasvar *sav) { /* * Cleanup xform state. Note that zeroize'ing causes the * keys to be cleared; otherwise we must do it ourself. 
 */
	if (sav->tdb_xform != NULL) {
		sav->tdb_xform->xf_zeroize(sav);
		sav->tdb_xform = NULL;
	} else {
		KASSERT(sav->iv == NULL, ("iv but no xform"));
		if (sav->key_auth != NULL)
			bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
		if (sav->key_enc != NULL)
			bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
	}
	if (sav->key_auth != NULL) {
		if (sav->key_auth->key_data != NULL)
			free(sav->key_auth->key_data, M_IPSEC_MISC);
		free(sav->key_auth, M_IPSEC_MISC);
		sav->key_auth = NULL;
	}
	if (sav->key_enc != NULL) {
		if (sav->key_enc->key_data != NULL)
			free(sav->key_enc->key_data, M_IPSEC_MISC);
		free(sav->key_enc, M_IPSEC_MISC);
		sav->key_enc = NULL;
	}
	if (sav->sched) {
		bzero(sav->sched, sav->schedlen);
		free(sav->sched, M_IPSEC_MISC);
		sav->sched = NULL;
	}
	if (sav->replay != NULL) {
		free(sav->replay, M_IPSEC_MISC);
		sav->replay = NULL;
	}
	if (sav->lft_c != NULL) {
		free(sav->lft_c, M_IPSEC_MISC);
		sav->lft_c = NULL;
	}
	if (sav->lft_h != NULL) {
		free(sav->lft_h, M_IPSEC_MISC);
		sav->lft_h = NULL;
	}
	if (sav->lft_s != NULL) {
		free(sav->lft_s, M_IPSEC_MISC);
		sav->lft_s = NULL;
	}
}

/*
 * Unlink an SA variable entry and free() it.
 */
static void
key_delsav(sav)
	struct secasvar *sav;
{
	IPSEC_ASSERT(sav != NULL, ("null sav"));
	IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0", sav->refcnt));

	/* remove from SA header */
	if (__LIST_CHAINED(sav))
		LIST_REMOVE(sav, chain);
	key_cleansav(sav);
	SECASVAR_LOCK_DESTROY(sav);
	free(sav, M_IPSEC_SA);
}

/*
 * Search the SAD.
 * OUT:
 *	NULL	: not found
 *	others	: found, pointer to an SA header.
 */
static struct secashead *
key_getsah(saidx)
	struct secasindex *saidx;
{
	INIT_VNET_IPSEC(curvnet);
	struct secashead *sah;

	SAHTREE_LOCK();
	LIST_FOREACH(sah, &V_sahtree, chain) {
		if (sah->state == SADB_SASTATE_DEAD)
			continue;
		if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID))
			break;
	}
	SAHTREE_UNLOCK();

	return sah;
}

/*
 * Check that the given SPI is not a duplicate.
 * NOTE: this function is slow because it searches the whole SAD.
 * OUT:
 *	NULL	: not found
 *	others	: found, pointer to an SA.
 */
static struct secasvar *
key_checkspidup(saidx, spi)
	struct secasindex *saidx;
	u_int32_t spi;
{
	INIT_VNET_IPSEC(curvnet);
	struct secashead *sah;
	struct secasvar *sav;

	/* check address family */
	if (saidx->src.sa.sa_family != saidx->dst.sa.sa_family) {
		ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
			__func__));
		return NULL;
	}

	sav = NULL;
	/* check all SAD */
	SAHTREE_LOCK();
	LIST_FOREACH(sah, &V_sahtree, chain) {
		if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst))
			continue;
		sav = key_getsavbyspi(sah, spi);
		if (sav != NULL)
			break;
	}
	SAHTREE_UNLOCK();

	return sav;
}

/*
 * Search the SA header for an alive SA (one not in DEAD state) with
 * the given SPI.
 * OUT:
 *	NULL	: not found
 *	others	: found, pointer to an SA.
 */
static struct secasvar *
key_getsavbyspi(sah, spi)
	struct secashead *sah;
	u_int32_t spi;
{
	INIT_VNET_IPSEC(curvnet);
	struct secasvar *sav;
	u_int stateidx, state;

	sav = NULL;
	SAHTREE_LOCK_ASSERT();
	/* search all status */
	for (stateidx = 0;
-	     stateidx < _ARRAYLEN(V_saorder_state_alive);
+	     stateidx < _ARRAYLEN(saorder_state_alive);
	     stateidx++) {
-		state = V_saorder_state_alive[stateidx];
+		state = saorder_state_alive[stateidx];
		LIST_FOREACH(sav, &sah->savtree[state], chain) {
			/* sanity check */
			if (sav->state != state) {
				ipseclog((LOG_DEBUG, "%s: "
				    "invalid sav->state (queue: %d SA: %d)\n",
				    __func__, state, sav->state));
				continue;
			}

			if (sav->spi == spi)
				return sav;
		}
	}

	return NULL;
}

/*
 * Copy SA values from a PF_KEY message, except for *SPI, SEQ, PID,
 * STATE and TYPE*; update those separately if needed.
 * OUT:	0:	success.
 *	!0:	failure.
 *
 * does not modify mbuf.
does not free mbuf on error. */ static int key_setsaval(sav, m, mhp) struct secasvar *sav; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); int error = 0; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* initialization */ sav->replay = NULL; sav->key_auth = NULL; sav->key_enc = NULL; sav->sched = NULL; sav->schedlen = 0; sav->iv = NULL; sav->lft_c = NULL; sav->lft_h = NULL; sav->lft_s = NULL; sav->tdb_xform = NULL; /* transform */ sav->tdb_encalgxform = NULL; /* encoding algorithm */ sav->tdb_authalgxform = NULL; /* authentication algorithm */ sav->tdb_compalgxform = NULL; /* compression algorithm */ /* SA */ if (mhp->ext[SADB_EXT_SA] != NULL) { const struct sadb_sa *sa0; sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; if (mhp->extlen[SADB_EXT_SA] < sizeof(*sa0)) { error = EINVAL; goto fail; } sav->alg_auth = sa0->sadb_sa_auth; sav->alg_enc = sa0->sadb_sa_encrypt; sav->flags = sa0->sadb_sa_flags; /* replay window */ if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) { sav->replay = (struct secreplay *) malloc(sizeof(struct secreplay)+sa0->sadb_sa_replay, M_IPSEC_MISC, M_NOWAIT|M_ZERO); if (sav->replay == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } if (sa0->sadb_sa_replay != 0) sav->replay->bitmap = (caddr_t)(sav->replay+1); sav->replay->wsize = sa0->sadb_sa_replay; } } /* Authentication keys */ if (mhp->ext[SADB_EXT_KEY_AUTH] != NULL) { const struct sadb_key *key0; int len; key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH]; len = mhp->extlen[SADB_EXT_KEY_AUTH]; error = 0; if (len < sizeof(*key0)) { error = EINVAL; goto fail; } switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_TCPSIGNATURE: if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) && sav->alg_auth != SADB_X_AALG_NULL) error = EINVAL; break; case SADB_X_SATYPE_IPCOMP: default: error = EINVAL; break; } if (error) { ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n", __func__)); goto fail; } sav->key_auth = (struct seckey *)key_dup_keymsg(key0, len, M_IPSEC_MISC); if (sav->key_auth == NULL ) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } } /* Encryption key */ if (mhp->ext[SADB_EXT_KEY_ENCRYPT] != NULL) { const struct sadb_key *key0; int len; key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT]; len = mhp->extlen[SADB_EXT_KEY_ENCRYPT]; error = 0; if (len < sizeof(*key0)) { error = EINVAL; goto fail; } switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_ESP: if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) && sav->alg_enc != SADB_EALG_NULL) { error = EINVAL; break; } sav->key_enc = (struct seckey *)key_dup_keymsg(key0, len, M_IPSEC_MISC); if (sav->key_enc == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } break; case SADB_X_SATYPE_IPCOMP: if (len != PFKEY_ALIGN8(sizeof(struct sadb_key))) error = EINVAL; sav->key_enc = NULL; /*just in case*/ break; case SADB_SATYPE_AH: case SADB_X_SATYPE_TCPSIGNATURE: default: error = EINVAL; break; } if (error) { ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n", __func__)); goto fail; } } /* set iv */ sav->ivlen = 0; switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_AH: error = xform_init(sav, XF_AH); break; case SADB_SATYPE_ESP: error = xform_init(sav, XF_ESP); break; case SADB_X_SATYPE_IPCOMP: error = xform_init(sav, XF_IPCOMP); break; case SADB_X_SATYPE_TCPSIGNATURE: 
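		/* TCP-MD5 (RFC 2385) signatures use the same xform framework */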
		error = xform_init(sav, XF_TCPSIGNATURE);
		break;
	}
	if (error) {
		ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n",
		        __func__, mhp->msg->sadb_msg_satype));
		goto fail;
	}

	/* reset created */
	sav->created = time_second;

	/* make lifetime for CURRENT */
	sav->lft_c = malloc(sizeof(struct seclifetime), M_IPSEC_MISC, M_NOWAIT);
	if (sav->lft_c == NULL) {
		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
		error = ENOBUFS;
		goto fail;
	}

	sav->lft_c->allocations = 0;
	sav->lft_c->bytes = 0;
	sav->lft_c->addtime = time_second;
	sav->lft_c->usetime = 0;

	/* lifetimes for HARD and SOFT */
    {
	const struct sadb_lifetime *lft0;

	lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
	if (lft0 != NULL) {
		if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(*lft0)) {
			error = EINVAL;
			goto fail;
		}
		sav->lft_h = key_dup_lifemsg(lft0, M_IPSEC_MISC);
		if (sav->lft_h == NULL) {
			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
			error = ENOBUFS;
			goto fail;
		}
		/* XXX: to be initialized? */
	}

	lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_SOFT];
	if (lft0 != NULL) {
		if (mhp->extlen[SADB_EXT_LIFETIME_SOFT] < sizeof(*lft0)) {
			error = EINVAL;
			goto fail;
		}
		sav->lft_s = key_dup_lifemsg(lft0, M_IPSEC_MISC);
		if (sav->lft_s == NULL) {
			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
			error = ENOBUFS;
			goto fail;
		}
		/* XXX: to be initialized? */
	}
    }

	return 0;

 fail:
	/* initialization */
	key_cleansav(sav);

	return error;
}

/*
 * Validate a secasvar entry and, on success, advance its state to
 * SADB_SASTATE_MATURE.
 * OUT:	0:	valid
 *	other:	errno
 */
static int
key_mature(struct secasvar *sav)
{
	INIT_VNET_IPSEC(curvnet);
	int error;

	/* check SPI value */
	switch (sav->sah->saidx.proto) {
	case IPPROTO_ESP:
	case IPPROTO_AH:
		/*
		 * RFC 4302, 2.4. Security Parameters Index (SPI): SPI values
		 * 1-255 are reserved by IANA for future use;
		 * 0 is for implementation-specific, local use.
		 */
		if (ntohl(sav->spi) <= 255) {
			ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n",
			    __func__, (u_int32_t)ntohl(sav->spi)));
			return EINVAL;
		}
		break;
	}

	/* check satype */
	switch (sav->sah->saidx.proto) {
	case IPPROTO_ESP:
		/* check flags */
		if ((sav->flags & (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) ==
		    (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) {
			ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
				"given to old-esp.\n", __func__));
			return EINVAL;
		}
		error = xform_init(sav, XF_ESP);
		break;
	case IPPROTO_AH:
		/* check flags */
		if (sav->flags & SADB_X_EXT_DERIV) {
			ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
				"given to AH SA.\n", __func__));
			return EINVAL;
		}
		if (sav->alg_enc != SADB_EALG_NONE) {
			ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
				"mismatched.\n", __func__));
			return(EINVAL);
		}
		error = xform_init(sav, XF_AH);
		break;
	case IPPROTO_IPCOMP:
		if (sav->alg_auth != SADB_AALG_NONE) {
			ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
				"mismatched.\n", __func__));
			return(EINVAL);
		}
		if ((sav->flags & SADB_X_EXT_RAWCPI) == 0
		 && ntohl(sav->spi) >= 0x10000) {
			ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n",
				__func__));
			return(EINVAL);
		}
		error = xform_init(sav, XF_IPCOMP);
		break;
	case IPPROTO_TCP:
		if (sav->alg_enc != SADB_EALG_NONE) {
			ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
				"mismatched.\n", __func__));
			return(EINVAL);
		}
		error = xform_init(sav, XF_TCPSIGNATURE);
		break;
	default:
		ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__));
		error = EPROTONOSUPPORT;
		break;
	}
	if (error == 0) {
		SAHTREE_LOCK();
		key_sa_chgstate(sav, SADB_SASTATE_MATURE);
		SAHTREE_UNLOCK();
	}
	return (error);
}

/*
 * Common subroutine for SADB_GET and SADB_DUMP.
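 * The assembled reply carries its total length in sadb_msg_len,
 * expressed in the 8-byte units RFC 2367 uses for all PF_KEY
 * lengths (e.g. PFKEY_UNIT64(16) == 2).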
 */
static struct mbuf *
key_setdumpsa(sav, type, satype, seq, pid)
	struct secasvar *sav;
	u_int8_t type, satype;
	u_int32_t seq, pid;
{
	struct mbuf *result = NULL, *tres = NULL, *m;
	int i;
	int dumporder[] = {
		SADB_EXT_SA, SADB_X_EXT_SA2,
		SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
		SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC,
		SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY, SADB_EXT_KEY_AUTH,
		SADB_EXT_KEY_ENCRYPT, SADB_EXT_IDENTITY_SRC,
		SADB_EXT_IDENTITY_DST, SADB_EXT_SENSITIVITY,
	};

	m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt);
	if (m == NULL)
		goto fail;
	result = m;

	for (i = sizeof(dumporder)/sizeof(dumporder[0]) - 1; i >= 0; i--) {
		m = NULL;
		switch (dumporder[i]) {
		case SADB_EXT_SA:
			m = key_setsadbsa(sav);
			if (!m)
				goto fail;
			break;

		case SADB_X_EXT_SA2:
			m = key_setsadbxsa2(sav->sah->saidx.mode,
					sav->replay ? sav->replay->count : 0,
					sav->sah->saidx.reqid);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_ADDRESS_SRC:
			m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
			    &sav->sah->saidx.src.sa,
			    FULLMASK, IPSEC_ULPROTO_ANY);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_ADDRESS_DST:
			m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
			    &sav->sah->saidx.dst.sa,
			    FULLMASK, IPSEC_ULPROTO_ANY);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_KEY_AUTH:
			if (!sav->key_auth)
				continue;
			m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_KEY_ENCRYPT:
			if (!sav->key_enc)
				continue;
			m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_LIFETIME_CURRENT:
			if (!sav->lft_c)
				continue;
			m = key_setlifetime(sav->lft_c,
					    SADB_EXT_LIFETIME_CURRENT);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_LIFETIME_HARD:
			if (!sav->lft_h)
				continue;
			m = key_setlifetime(sav->lft_h,
					    SADB_EXT_LIFETIME_HARD);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_LIFETIME_SOFT:
			if (!sav->lft_s)
				continue;
			m = key_setlifetime(sav->lft_s,
					    SADB_EXT_LIFETIME_SOFT);
			if (!m)
				goto fail;
			break;

		case SADB_EXT_ADDRESS_PROXY:
		case SADB_EXT_IDENTITY_SRC:
		case SADB_EXT_IDENTITY_DST:
			/* XXX: should they be brought in from the SPD? */
		case SADB_EXT_SENSITIVITY:
		default:
			continue;
		}

		if (!m)
			goto fail;
		if (tres)
			m_cat(m, tres);
		tres = m;
	}

	m_cat(result, tres);
	if (result->m_len < sizeof(struct sadb_msg)) {
		result = m_pullup(result, sizeof(struct sadb_msg));
		if (result == NULL)
			goto fail;
	}

	result->m_pkthdr.len = 0;
	for (m = result; m; m = m->m_next)
		result->m_pkthdr.len += m->m_len;

	mtod(result, struct sadb_msg *)->sadb_msg_len =
	    PFKEY_UNIT64(result->m_pkthdr.len);

	return result;

fail:
	m_freem(result);
	m_freem(tres);
	return NULL;
}

/*
 * set data into sadb_msg.
 */
static struct mbuf *
key_setsadbmsg(type, tlen, satype, seq, pid, reserved)
	u_int8_t type, satype;
	u_int16_t tlen;
	u_int32_t seq;
	pid_t pid;
	u_int16_t reserved;
{
	struct mbuf *m;
	struct sadb_msg *p;
	int len;

	len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
	if (len > MCLBYTES)
		return NULL;
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && len > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (!m)
		return NULL;
	m->m_pkthdr.len = m->m_len = len;
	m->m_next = NULL;

	p = mtod(m, struct sadb_msg *);

	bzero(p, len);
	p->sadb_msg_version = PF_KEY_V2;
	p->sadb_msg_type = type;
	p->sadb_msg_errno = 0;
	p->sadb_msg_satype = satype;
	p->sadb_msg_len = PFKEY_UNIT64(tlen);
	p->sadb_msg_reserved = reserved;
	p->sadb_msg_seq = seq;
	p->sadb_msg_pid = (u_int32_t)pid;

	return m;
}

/*
 * Copy secasvar data into sadb_sa.
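 * (the SPI, replay window size, state, auth/encrypt algorithm ids
 * and flags)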
*/ static struct mbuf * key_setsadbsa(sav) struct secasvar *sav; { struct mbuf *m; struct sadb_sa *p; int len; len = PFKEY_ALIGN8(sizeof(struct sadb_sa)); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_sa *); bzero(p, len); p->sadb_sa_len = PFKEY_UNIT64(len); p->sadb_sa_exttype = SADB_EXT_SA; p->sadb_sa_spi = sav->spi; p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0); p->sadb_sa_state = sav->state; p->sadb_sa_auth = sav->alg_auth; p->sadb_sa_encrypt = sav->alg_enc; p->sadb_sa_flags = sav->flags; return m; } /* * set data into sadb_address. */ static struct mbuf * key_setsadbaddr(exttype, saddr, prefixlen, ul_proto) u_int16_t exttype; const struct sockaddr *saddr; u_int8_t prefixlen; u_int16_t ul_proto; { struct mbuf *m; struct sadb_address *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_address)) + PFKEY_ALIGN8(saddr->sa_len); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_address *); bzero(p, len); p->sadb_address_len = PFKEY_UNIT64(len); p->sadb_address_exttype = exttype; p->sadb_address_proto = ul_proto; if (prefixlen == FULLMASK) { switch (saddr->sa_family) { case AF_INET: prefixlen = sizeof(struct in_addr) << 3; break; case AF_INET6: prefixlen = sizeof(struct in6_addr) << 3; break; default: ; /*XXX*/ } } p->sadb_address_prefixlen = prefixlen; p->sadb_address_reserved = 0; bcopy(saddr, mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)), saddr->sa_len); return m; } /* * set data into sadb_x_sa2. */ static struct mbuf * key_setsadbxsa2(mode, seq, reqid) u_int8_t mode; u_int32_t seq, reqid; { struct mbuf *m; struct sadb_x_sa2 *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2)); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_x_sa2 *); bzero(p, len); p->sadb_x_sa2_len = PFKEY_UNIT64(len); p->sadb_x_sa2_exttype = SADB_X_EXT_SA2; p->sadb_x_sa2_mode = mode; p->sadb_x_sa2_reserved1 = 0; p->sadb_x_sa2_reserved2 = 0; p->sadb_x_sa2_sequence = seq; p->sadb_x_sa2_reqid = reqid; return m; } /* * set data into sadb_x_policy */ static struct mbuf * key_setsadbxpolicy(type, dir, id) u_int16_t type; u_int8_t dir; u_int32_t id; { struct mbuf *m; struct sadb_x_policy *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy)); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_x_policy *); bzero(p, len); p->sadb_x_policy_len = PFKEY_UNIT64(len); p->sadb_x_policy_exttype = SADB_X_EXT_POLICY; p->sadb_x_policy_type = type; p->sadb_x_policy_dir = dir; p->sadb_x_policy_id = id; return m; } /* %%% utilities */ /* Take a key message (sadb_key) from the socket and turn it into one * of the kernel's key structures (seckey). 
 *
 * IN: pointer to the source key message, the key length and the
 *     malloc type
 * OUT: NULL when no more memory is available
 */
struct seckey *
key_dup_keymsg(const struct sadb_key *src, u_int len,
	       struct malloc_type *type)
{
	INIT_VNET_IPSEC(curvnet);
	struct seckey *dst;
	dst = (struct seckey *)malloc(sizeof(struct seckey), type, M_NOWAIT);
	if (dst != NULL) {
		dst->bits = src->sadb_key_bits;
		dst->key_data = (char *)malloc(len, type, M_NOWAIT);
		if (dst->key_data != NULL) {
			bcopy((const char *)src + sizeof(struct sadb_key),
			      dst->key_data, len);
		} else {
			ipseclog((LOG_DEBUG, "%s: No more memory.\n",
				  __func__));
			free(dst, type);
			dst = NULL;
		}
	} else {
		ipseclog((LOG_DEBUG, "%s: No more memory.\n",
			  __func__));
	}
	return dst;
}

/* Take a lifetime message (sadb_lifetime) passed in on a socket and
 * turn it into one of the kernel's lifetime structures (seclifetime).
 *
 * IN: pointer to the source lifetime message and the malloc type
 * OUT: NULL, no more memory
 */
static struct seclifetime *
key_dup_lifemsg(const struct sadb_lifetime *src,
		 struct malloc_type *type)
{
	INIT_VNET_IPSEC(curvnet);
	struct seclifetime *dst = NULL;

	dst = (struct seclifetime *)malloc(sizeof(struct seclifetime),
					   type, M_NOWAIT);
	if (dst == NULL) {
		/* XXX counter */
		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
	} else {
		dst->allocations = src->sadb_lifetime_allocations;
		dst->bytes = src->sadb_lifetime_bytes;
		dst->addtime = src->sadb_lifetime_addtime;
		dst->usetime = src->sadb_lifetime_usetime;
	}
	return dst;
}

/*
 * Check whether a sockaddr is one of our own addresses.
 * OUT:	1: true, i.e. my address.
 *	0: false
 */
int
key_ismyaddr(sa)
	struct sockaddr *sa;
{
#ifdef INET
	INIT_VNET_INET(curvnet);
	struct sockaddr_in *sin;
	struct in_ifaddr *ia;
#endif

	IPSEC_ASSERT(sa != NULL, ("null sockaddr"));
	switch (sa->sa_family) {
#ifdef INET
	case AF_INET:
		sin = (struct sockaddr_in *)sa;
		for (ia = V_in_ifaddrhead.tqh_first; ia;
		     ia = ia->ia_link.tqe_next)
		{
			if (sin->sin_family == ia->ia_addr.sin_family &&
			    sin->sin_len == ia->ia_addr.sin_len &&
			    sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
			{
				return 1;
			}
		}
		break;
#endif
#ifdef INET6
	case AF_INET6:
		return key_ismyaddr6((struct sockaddr_in6 *)sa);
#endif
	}

	return 0;
}

#ifdef INET6
/*
 * Check whether an IPv6 sockaddr is one of our own addresses.
 * 1: ours
 * 0: other
 * NOTE: derived from ip6_input() in KAME; more work is needed here.
 */
#include <netinet6/in6_var.h>

static int
key_ismyaddr6(sin6)
	struct sockaddr_in6 *sin6;
{
	INIT_VNET_INET6(curvnet);
	struct in6_ifaddr *ia;
	struct in6_multi *in6m;

	for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
		if (key_sockaddrcmp((struct sockaddr *)sin6,
		    (struct sockaddr *)&ia->ia_addr, 0) == 0)
			return 1;

		/*
		 * XXX Multicast
		 * XXX why do we care about multicast here while we don't care
		 * about IPv4 multicast??
		 * XXX scope
		 */
		in6m = NULL;
		IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m);
		if (in6m)
			return 1;
	}

	/* loopback, just for safety */
	if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))
		return 1;

	return 0;
}
#endif /*INET6*/

/*
 * Compare two secasindex structures.  The flag selects how strictly
 * the two saidxes are compared; ports are only significant for
 * CMP_EXACTLY.
 * IN:
 *	saidx0: source, it can be in SAD.
 *	saidx1: object.
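 *	flag: (a summary of the semantics implemented below)
 *	    CMP_EXACTLY	- mode, reqid and the raw addresses,
 *			  ports included, must all match
 *	    CMP_MODE_REQID - mode and any non-zero reqid must match;
 *			  addresses are compared without ports
 *	    CMP_REQID	- any non-zero reqid must match; addresses
 *			  are compared without ports
 *	    CMP_HEAD	- only proto and the portless addresses
 *			  are compared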
* OUT: * 1 : equal * 0 : not equal */ static int key_cmpsaidx( const struct secasindex *saidx0, const struct secasindex *saidx1, int flag) { /* sanity */ if (saidx0 == NULL && saidx1 == NULL) return 1; if (saidx0 == NULL || saidx1 == NULL) return 0; if (saidx0->proto != saidx1->proto) return 0; if (flag == CMP_EXACTLY) { if (saidx0->mode != saidx1->mode) return 0; if (saidx0->reqid != saidx1->reqid) return 0; if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.sa.sa_len) != 0 || bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.sa.sa_len) != 0) return 0; } else { /* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */ if (flag == CMP_MODE_REQID ||flag == CMP_REQID) { /* * If reqid of SPD is non-zero, unique SA is required. * The result must be of same reqid in this case. */ if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid) return 0; } if (flag == CMP_MODE_REQID) { if (saidx0->mode != IPSEC_MODE_ANY && saidx0->mode != saidx1->mode) return 0; } if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, 0) != 0) { return 0; } if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, 0) != 0) { return 0; } } return 1; } /* * compare two secindex structure exactly. * IN: * spidx0: source, it is often in SPD. * spidx1: object, it is often from PFKEY message. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpspidx_exactly( struct secpolicyindex *spidx0, struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) return 1; if (spidx0 == NULL || spidx1 == NULL) return 0; if (spidx0->prefs != spidx1->prefs || spidx0->prefd != spidx1->prefd || spidx0->ul_proto != spidx1->ul_proto) return 0; return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 && key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0; } /* * compare two secindex structure with mask. * IN: * spidx0: source, it is often in SPD. * spidx1: object, it is often from IP header. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpspidx_withmask( struct secpolicyindex *spidx0, struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) return 1; if (spidx0 == NULL || spidx1 == NULL) return 0; if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family || spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family || spidx0->src.sa.sa_len != spidx1->src.sa.sa_len || spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len) return 0; /* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */ if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY && spidx0->ul_proto != spidx1->ul_proto) return 0; switch (spidx0->src.sa.sa_family) { case AF_INET: if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY && spidx0->src.sin.sin_port != spidx1->src.sin.sin_port) return 0; if (!key_bbcmp(&spidx0->src.sin.sin_addr, &spidx1->src.sin.sin_addr, spidx0->prefs)) return 0; break; case AF_INET6: if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY && spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port) return 0; /* * scope_id check. if sin6_scope_id is 0, we regard it * as a wildcard scope, which matches any scope zone ID. 
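 * (e.g. a policy naming fe80::1 with sin6_scope_id 0 matches
 * fe80::1 in every scope zone, while two non-zero ids must be
 * identical)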
*/ if (spidx0->src.sin6.sin6_scope_id && spidx1->src.sin6.sin6_scope_id && spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id) return 0; if (!key_bbcmp(&spidx0->src.sin6.sin6_addr, &spidx1->src.sin6.sin6_addr, spidx0->prefs)) return 0; break; default: /* XXX */ if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0) return 0; break; } switch (spidx0->dst.sa.sa_family) { case AF_INET: if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY && spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port) return 0; if (!key_bbcmp(&spidx0->dst.sin.sin_addr, &spidx1->dst.sin.sin_addr, spidx0->prefd)) return 0; break; case AF_INET6: if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY && spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port) return 0; /* * scope_id check. if sin6_scope_id is 0, we regard it * as a wildcard scope, which matches any scope zone ID. */ if (spidx0->dst.sin6.sin6_scope_id && spidx1->dst.sin6.sin6_scope_id && spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id) return 0; if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr, &spidx1->dst.sin6.sin6_addr, spidx0->prefd)) return 0; break; default: /* XXX */ if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0) return 0; break; } /* XXX Do we check other field ? e.g. flowinfo */ return 1; } /* returns 0 on match */ static int key_sockaddrcmp( const struct sockaddr *sa1, const struct sockaddr *sa2, int port) { #ifdef satosin #undef satosin #endif #define satosin(s) ((const struct sockaddr_in *)s) #ifdef satosin6 #undef satosin6 #endif #define satosin6(s) ((const struct sockaddr_in6 *)s) if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len) return 1; switch (sa1->sa_family) { case AF_INET: if (sa1->sa_len != sizeof(struct sockaddr_in)) return 1; if (satosin(sa1)->sin_addr.s_addr != satosin(sa2)->sin_addr.s_addr) { return 1; } if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port) return 1; break; case AF_INET6: if (sa1->sa_len != sizeof(struct sockaddr_in6)) return 1; /*EINVAL*/ if (satosin6(sa1)->sin6_scope_id != satosin6(sa2)->sin6_scope_id) { return 1; } if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr, &satosin6(sa2)->sin6_addr)) { return 1; } if (port && satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) { return 1; } break; default: if (bcmp(sa1, sa2, sa1->sa_len) != 0) return 1; break; } return 0; #undef satosin #undef satosin6 } /* * compare two buffers with mask. * IN: * addr1: source * addr2: object * bits: Number of bits to compare * OUT: * 1 : equal * 0 : not equal */ static int key_bbcmp(const void *a1, const void *a2, u_int bits) { const unsigned char *p1 = a1; const unsigned char *p2 = a2; /* XXX: This could be considerably faster if we compare a word * at a time, but it is complicated on LSB Endian machines */ /* Handle null pointers */ if (p1 == NULL || p2 == NULL) return (p1 == p2); while (bits >= 8) { if (*p1++ != *p2++) return 0; bits -= 8; } if (bits > 0) { u_int8_t mask = ~((1<<(8-bits))-1); if ((*p1 & mask) != (*p2 & mask)) return 0; } return 1; /* Match! 
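 *
 * For example, with a1 = 192.168.1.77 and a2 = 192.168.1.200,
 * key_bbcmp(a1, a2, 24) compares three whole octets and returns 1,
 * while key_bbcmp(a1, a2, 32) returns 0; with bits = 26 the final
 * octet is masked with 0xc0 (~((1 << 6) - 1)) before the compare.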
 */
}

static void
key_flush_spd(time_t now)
{
	INIT_VNET_IPSEC(curvnet);
	static u_int16_t sptree_scangen = 0;
	u_int16_t gen = sptree_scangen++;
	struct secpolicy *sp;
	u_int dir;

	/* SPD */
	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
restart:
		SPTREE_LOCK();
		LIST_FOREACH(sp, &V_sptree[dir], chain) {
			if (sp->scangen == gen)		/* previously handled */
				continue;
			sp->scangen = gen;
			if (sp->state == IPSEC_SPSTATE_DEAD) {
				/* NB: clean entries created by key_spdflush */
				SPTREE_UNLOCK();
				KEY_FREESP(&sp);
				goto restart;
			}
			if (sp->lifetime == 0 && sp->validtime == 0)
				continue;
			if ((sp->lifetime && now - sp->created > sp->lifetime) ||
			    (sp->validtime && now - sp->lastused > sp->validtime)) {
				sp->state = IPSEC_SPSTATE_DEAD;
				SPTREE_UNLOCK();
				key_spdexpire(sp);
				KEY_FREESP(&sp);
				goto restart;
			}
		}
		SPTREE_UNLOCK();
	}
}

static void
key_flush_sad(time_t now)
{
	INIT_VNET_IPSEC(curvnet);
	struct secashead *sah, *nextsah;
	struct secasvar *sav, *nextsav;

	/* SAD */
	SAHTREE_LOCK();
	LIST_FOREACH_SAFE(sah, &V_sahtree, chain, nextsah) {
		/* if the sah is dead, delete it and move on to the next. */
		if (sah->state == SADB_SASTATE_DEAD) {
			key_delsah(sah);
			continue;
		}

		/* delete LARVAL entries that never became MATURE. */
		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_LARVAL],
		    chain, nextsav) {
			if (now - sav->created > V_key_larval_lifetime)
				KEY_FREESAV(&sav);
		}

		/*
		 * Check whether a MATURE entry should start sending
		 * expire messages.
		 */
		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_MATURE],
		    chain, nextsav) {
			/* we don't need to check. */
			if (sav->lft_s == NULL)
				continue;

			/* sanity check */
			if (sav->lft_c == NULL) {
				ipseclog((LOG_DEBUG,"%s: there is no CURRENT "
					"time, why?\n", __func__));
				continue;
			}

			/* check SOFT lifetime */
			if (sav->lft_s->addtime != 0 &&
			    now - sav->created > sav->lft_s->addtime) {
				/*
				 * Check whether the SA has ever been used;
				 * if not, simply delete it.
				 */
				if (sav->lft_c->usetime == 0) {
					key_sa_chgstate(sav, SADB_SASTATE_DEAD);
					KEY_FREESAV(&sav);
				} else {
					key_sa_chgstate(sav, SADB_SASTATE_DYING);
					/*
					 * XXX If we keep sending expire
					 * messages while in DYING, remove
					 * the code below.
					 */
					key_expire(sav);
				}
			}
			/* check SOFT lifetime by bytes */
			/*
			 * XXX There is no way to delete this SA once a
			 * new SA is installed; beware when an overly
			 * large time-based lifetime has been configured.
			 */
			else if (sav->lft_s->bytes != 0 &&
			    sav->lft_s->bytes < sav->lft_c->bytes) {

				key_sa_chgstate(sav, SADB_SASTATE_DYING);
				/*
				 * XXX If we keep sending expire
				 * messages while in DYING, remove
				 * the code below.
				 */
				key_expire(sav);
			}
		}

		/* check whether DYING entries should become DEAD. */
		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DYING],
		    chain, nextsav) {
			/* we don't need to check. */
			if (sav->lft_h == NULL)
				continue;

			/* sanity check */
			if (sav->lft_c == NULL) {
				ipseclog((LOG_DEBUG, "%s: there is no CURRENT "
					"time, why?\n", __func__));
				continue;
			}

			if (sav->lft_h->addtime != 0 &&
			    now - sav->created > sav->lft_h->addtime) {
				key_sa_chgstate(sav, SADB_SASTATE_DEAD);
				KEY_FREESAV(&sav);
			}
#if 0	/* XXX Should we keep sending expire messages until the HARD lifetime? */
			else if (sav->lft_s != NULL
			      && sav->lft_s->addtime != 0
			      && now - sav->created > sav->lft_s->addtime) {
				/*
				 * XXX: should check that a valid SA
				 * was installed.
				 */
				/*
				 * If no SA was installed, keep sending
				 * the expire message.
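				 * (note that this branch is compiled out:
				 * as committed, a DYING SA is not
				 * re-announced and is reclaimed once its
				 * HARD addtime passes)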
				 */
				key_expire(sav);
			}
#endif
			/* check HARD lifetime by bytes */
			else if (sav->lft_h->bytes != 0 &&
			    sav->lft_h->bytes < sav->lft_c->bytes) {
				key_sa_chgstate(sav, SADB_SASTATE_DEAD);
				KEY_FREESAV(&sav);
			}
		}

		/* delete entries in the DEAD state */
		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DEAD],
		    chain, nextsav) {
			/* sanity check */
			if (sav->state != SADB_SASTATE_DEAD) {
				ipseclog((LOG_DEBUG, "%s: invalid sav->state "
					"(queue: %d SA: %d): kill it anyway\n",
					__func__,
					SADB_SASTATE_DEAD, sav->state));
			}
			/*
			 * do not call key_freesav() here.
			 * sav should already be freed, and sav->refcnt
			 * shows other references to sav
			 * (such as from SPD).
			 */
		}
	}
	SAHTREE_UNLOCK();
}

static void
key_flush_acq(time_t now)
{
	INIT_VNET_IPSEC(curvnet);
	struct secacq *acq, *nextacq;

	/* ACQ tree */
	ACQ_LOCK();
	for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) {
		nextacq = LIST_NEXT(acq, chain);
		if (now - acq->created > V_key_blockacq_lifetime
		 && __LIST_CHAINED(acq)) {
			LIST_REMOVE(acq, chain);
			free(acq, M_IPSEC_SAQ);
		}
	}
	ACQ_UNLOCK();
}

static void
key_flush_spacq(time_t now)
{
	INIT_VNET_IPSEC(curvnet);
	struct secspacq *acq, *nextacq;

	/* SP ACQ tree */
	SPACQ_LOCK();
	for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) {
		nextacq = LIST_NEXT(acq, chain);
		if (now - acq->created > V_key_blockacq_lifetime
		 && __LIST_CHAINED(acq)) {
			LIST_REMOVE(acq, chain);
			free(acq, M_IPSEC_SAQ);
		}
	}
	SPACQ_UNLOCK();
}

/*
 * Time handler: scan the SPD and SAD, check the status of every
 * entry, and remove or expire entries as needed.
 * XXX: year 2038 problem may remain.
 */
void
key_timehandler(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	time_t now = time_second;

+	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		key_flush_spd(now);
		key_flush_sad(now);
		key_flush_acq(now);
		key_flush_spacq(now);
		CURVNET_RESTORE();
	}
+	VNET_LIST_RUNLOCK();

#ifndef IPSEC_DEBUG2
	/* reschedule ourselves for the next tick */
	(void)timeout((void *)key_timehandler, (void *)0, hz);
#endif /* IPSEC_DEBUG2 */
}

u_long
key_random()
{
	u_long value;

	key_randomfill(&value, sizeof(value));
	return value;
}

void
key_randomfill(p, l)
	void *p;
	size_t l;
{
	size_t n;
	u_long v;
	static int warn = 1;

	n = 0;
	n = (size_t)read_random(p, (u_int)l);
	/* last resort */
	while (n < l) {
		v = random();
		bcopy(&v, (u_int8_t *)p + n,
		    l - n < sizeof(v) ? l - n : sizeof(v));
		n += sizeof(v);

		if (warn) {
			printf("WARNING: pseudo-random number generator "
			    "used for IPsec processing\n");
			warn = 0;
		}
	}
}

/*
 * Map SADB_SATYPE_* to IPPROTO_*.
 * If satype == SADB_SATYPE_UNSPEC it is mapped to IPSEC_PROTO_ANY.
 * OUT:
 *	0: invalid satype.
 */
static u_int16_t
key_satype2proto(satype)
	u_int8_t satype;
{
	switch (satype) {
	case SADB_SATYPE_UNSPEC:
		return IPSEC_PROTO_ANY;
	case SADB_SATYPE_AH:
		return IPPROTO_AH;
	case SADB_SATYPE_ESP:
		return IPPROTO_ESP;
	case SADB_X_SATYPE_IPCOMP:
		return IPPROTO_IPCOMP;
	case SADB_X_SATYPE_TCPSIGNATURE:
		return IPPROTO_TCP;
	default:
		return 0;
	}
	/* NOTREACHED */
}

/*
 * Map IPPROTO_* to SADB_SATYPE_*.
 * OUT:
 *	0: invalid protocol type.
 */
static u_int8_t
key_proto2satype(proto)
	u_int16_t proto;
{
	switch (proto) {
	case IPPROTO_AH:
		return SADB_SATYPE_AH;
	case IPPROTO_ESP:
		return SADB_SATYPE_ESP;
	case IPPROTO_IPCOMP:
		return SADB_X_SATYPE_IPCOMP;
	case IPPROTO_TCP:
		return SADB_X_SATYPE_TCPSIGNATURE;
	default:
		return 0;
	}
	/* NOTREACHED */
}

/* %%% PF_KEY */
/*
 * SADB_GETSPI processing is to receive
 *	<base, (SA2), address(SD), (SPI range)>
 * from the IKMPd, to assign a unique spi value, to hang on the INBOUND
 * tree with the status of LARVAL, and send
 *	<base, SA(*), address(SD)>
 * to the IKMPd.
 *
 * IN:	mhp: pointer to the pointer to each header.
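 *	(illustrative flow: the IKE daemon sends SADB_GETSPI, the
 *	kernel installs a LARVAL SA and replies with the chosen SPI,
 *	and the daemon later completes the SA with SADB_UPDATE)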
* OUT: NULL if fail. * other if success, return pointer to the message to send. */ static int key_getspi(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *newsah; struct secasvar *newsav; u_int8_t proto; u_int32_t spi; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* make sure if port number is zero. */ switch (((struct sockaddr *)(src0 + 1))->sa_family) { case AF_INET: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in *)(src0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in6 *)(src0 + 1))->sin6_port = 0; break; default: ; /*???*/ } switch (((struct sockaddr *)(dst0 + 1))->sa_family) { case AF_INET: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in *)(dst0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in6 *)(dst0 + 1))->sin6_port = 0; break; default: ; /*???*/ } /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* SPI allocation */ spi = key_do_getnewspi((struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE], &saidx); if (spi == 0) return key_senderror(so, m, EINVAL); /* get a SA index */ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA index */ if ((newsah = key_newsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); return key_senderror(so, m, ENOBUFS); } } /* get a new SA */ /* XXX rewrite */ newsav = KEY_NEWSAV(m, mhp, newsah, &error); if (newsav == NULL) { /* XXX don't free new SA index allocated in above. */ return key_senderror(so, m, error); } /* set spi */ newsav->spi = htonl(spi); /* delete the entry in acqtree */ if (mhp->msg->sadb_msg_seq != 0) { struct secacq *acq; if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) != NULL) { /* reset counter in order to deletion by timehandler. 
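 * (zeroing the count and refreshing created lets key_timehandler
 * reap this acq entry once key_blockacq_lifetime seconds elapse)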
 */
			acq->created = time_second;
			acq->count = 0;
		}
	}

    {
	struct mbuf *n, *nn;
	struct sadb_sa *m_sa;
	struct sadb_msg *newmsg;
	int off, len;

	/* create new sadb_msg to reply. */
	len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) +
	    PFKEY_ALIGN8(sizeof(struct sadb_sa));

	MGETHDR(n, M_DONTWAIT, MT_DATA);
	if (len > MHLEN) {
		MCLGET(n, M_DONTWAIT);
		if ((n->m_flags & M_EXT) == 0) {
			m_freem(n);
			n = NULL;
		}
	}
	if (!n)
		return key_senderror(so, m, ENOBUFS);

	n->m_len = len;
	n->m_next = NULL;
	off = 0;

	m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
	off += PFKEY_ALIGN8(sizeof(struct sadb_msg));

	m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off);
	m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa));
	m_sa->sadb_sa_exttype = SADB_EXT_SA;
	m_sa->sadb_sa_spi = htonl(spi);
	off += PFKEY_ALIGN8(sizeof(struct sadb_sa));

	IPSEC_ASSERT(off == len,
		("length inconsistency (off %u len %u)", off, len));

	n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC,
	    SADB_EXT_ADDRESS_DST);
	if (!n->m_next) {
		m_freem(n);
		return key_senderror(so, m, ENOBUFS);
	}

	if (n->m_len < sizeof(struct sadb_msg)) {
		n = m_pullup(n, sizeof(struct sadb_msg));
		if (n == NULL)
			return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
	}

	n->m_pkthdr.len = 0;
	for (nn = n; nn; nn = nn->m_next)
		n->m_pkthdr.len += nn->m_len;

	newmsg = mtod(n, struct sadb_msg *);
	newmsg->sadb_msg_seq = newsav->seq;
	newmsg->sadb_msg_errno = 0;
	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);

	m_freem(m);
	return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
    }
}

/*
 * Allocate a new SPI; called by key_getspi().
 * OUT:
 *	0:	failure.
 *	others:	the newly allocated SPI.
 */
static u_int32_t
key_do_getnewspi(spirange, saidx)
	struct sadb_spirange *spirange;
	struct secasindex *saidx;
{
	INIT_VNET_IPSEC(curvnet);
	u_int32_t newspi;
	u_int32_t min, max;
	int count = V_key_spi_trycnt;

	/* set spi range to allocate */
	if (spirange != NULL) {
		min = spirange->sadb_spirange_min;
		max = spirange->sadb_spirange_max;
	} else {
		min = V_key_spi_minval;
		max = V_key_spi_maxval;
	}
	/* IPCOMP needs 2-byte SPI */
	if (saidx->proto == IPPROTO_IPCOMP) {
		u_int32_t t;
		if (min >= 0x10000)
			min = 0xffff;
		if (max >= 0x10000)
			max = 0xffff;
		if (min > max) {
			t = min; min = max; max = t;
		}
	}

	if (min == max) {
		if (key_checkspidup(saidx, min) != NULL) {
			ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n",
				__func__, min));
			return 0;
		}

		count--; /* one attempt consumed. */
		newspi = min;

	} else {

		/* init SPI */
		newspi = 0;

		/* allocate an SPI from the requested range */
		while (count--) {
			/* generate a pseudo-random SPI value within range. */
			newspi = min + (key_random() % (max - min + 1));

			if (key_checkspidup(saidx, newspi) == NULL)
				break;
		}

		if (count == 0 || newspi == 0) {
			ipseclog((LOG_DEBUG, "%s: SPI allocation failed.\n",
				__func__));
			return 0;
		}
	}

	/* statistics */
	keystat.getspi_count =
		(keystat.getspi_count + V_key_spi_trycnt - count) / 2;

	return newspi;
}

/*
 * SADB_UPDATE processing
 * receive
 *	<base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
 *	    key(AE), (identity(SD),) (sensitivity)>
 * from the ikmpd, and update a secasvar entry whose status is
 * SADB_SASTATE_LARVAL.
 * and send
 *	<base, SA, (SA2), (lifetime(HSC),) address(SD)>
 * to the ikmpd.
 *
 * m will always be freed.
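 * (the update is only accepted once the message's proto, SPI and
 * pid match those recorded in the LARVAL entry; see the checks
 * below)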
*/ static int key_update(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav; u_int16_t proto; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) || (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && mhp->ext[SADB_EXT_KEY_AUTH] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } /* XXX boundary checking for other extensions */ sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ if ((sah = key_getsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SA index found.\n", __func__)); return key_senderror(so, m, ENOENT); } /* set spidx if there */ /* XXX rewrite */ error = key_setident(sah, m, mhp); if (error) return key_senderror(so, m, error); /* find a SA with sequence number. 
*/ #ifdef IPSEC_DOSEQCHECK if (mhp->msg->sadb_msg_seq != 0 && (sav = key_getsavbyseq(sah, mhp->msg->sadb_msg_seq)) == NULL) { ipseclog((LOG_DEBUG, "%s: no larval SA with sequence %u " "exists.\n", __func__, mhp->msg->sadb_msg_seq)); return key_senderror(so, m, ENOENT); } #else SAHTREE_LOCK(); sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); SAHTREE_UNLOCK(); if (sav == NULL) { ipseclog((LOG_DEBUG, "%s: no such a SA found (spi:%u)\n", __func__, (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif /* validity check */ if (sav->sah->saidx.proto != proto) { ipseclog((LOG_DEBUG, "%s: protocol mismatched " "(DB=%u param=%u)\n", __func__, sav->sah->saidx.proto, proto)); return key_senderror(so, m, EINVAL); } #ifdef IPSEC_DOSEQCHECK if (sav->spi != sa0->sadb_sa_spi) { ipseclog((LOG_DEBUG, "%s: SPI mismatched (DB:%u param:%u)\n", __func__, (u_int32_t)ntohl(sav->spi), (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif if (sav->pid != mhp->msg->sadb_msg_pid) { ipseclog((LOG_DEBUG, "%s: pid mismatched (DB:%u param:%u)\n", __func__, sav->pid, mhp->msg->sadb_msg_pid)); return key_senderror(so, m, EINVAL); } /* copy sav values */ error = key_setsaval(sav, m, mhp); if (error) { KEY_FREESAV(&sav); return key_senderror(so, m, error); } /* check SA values to be mature. */ if ((mhp->msg->sadb_msg_errno = key_mature(sav)) != 0) { KEY_FREESAV(&sav); return key_senderror(so, m, 0); } { struct mbuf *n; /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * search SAD with sequence for a SA which state is SADB_SASTATE_LARVAL. * only called by key_update(). * OUT: * NULL : not found * others : found, pointer to a SA. */ #ifdef IPSEC_DOSEQCHECK static struct secasvar * key_getsavbyseq(sah, seq) struct secashead *sah; u_int32_t seq; { struct secasvar *sav; u_int state; state = SADB_SASTATE_LARVAL; /* search SAD with sequence number ? */ LIST_FOREACH(sav, &sah->savtree[state], chain) { KEY_CHKSASTATE(state, sav->state, __func__); if (sav->seq == seq) { sa_addref(sav); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s cause refcnt++:%d SA:%p\n", __func__, sav->refcnt, sav)); return sav; } } return NULL; } #endif /* * SADB_ADD processing * add an entry to SA database, when received * * from the ikmpd, * and send * * to the ikmpd. * * IGNORE identity and sensitivity messages. * * m will always be freed. 
 */
static int
key_add(so, m, mhp)
	struct socket *so;
	struct mbuf *m;
	const struct sadb_msghdr *mhp;
{
	INIT_VNET_IPSEC(curvnet);
	struct sadb_sa *sa0;
	struct sadb_address *src0, *dst0;
	struct secasindex saidx;
	struct secashead *newsah;
	struct secasvar *newsav;
	u_int16_t proto;
	u_int8_t mode;
	u_int32_t reqid;
	int error;

	IPSEC_ASSERT(so != NULL, ("null socket"));
	IPSEC_ASSERT(m != NULL, ("null mbuf"));
	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));

	/* map satype to proto */
	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
			__func__));
		return key_senderror(so, m, EINVAL);
	}

	if (mhp->ext[SADB_EXT_SA] == NULL ||
	    mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
	    (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
	     mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) ||
	    (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
	     mhp->ext[SADB_EXT_KEY_AUTH] == NULL) ||
	    (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL &&
	     mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) ||
	    (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL &&
	     mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) {
		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
			__func__));
		return key_senderror(so, m, EINVAL);
	}
	if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
	    mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
		/* XXX need more */
		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
			__func__));
		return key_senderror(so, m, EINVAL);
	}
	if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
		mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
		reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
	} else {
		mode = IPSEC_MODE_ANY;
		reqid = 0;
	}

	sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
	src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
	dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];

	/* XXX boundary check against sa_len */
	KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);

	/* get a SA header */
	if ((newsah = key_getsah(&saidx)) == NULL) {
		/* create a new SA header */
		if ((newsah = key_newsah(&saidx)) == NULL) {
			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
			return key_senderror(so, m, ENOBUFS);
		}
	}

	/* set spidx if there */
	/* XXX rewrite */
	error = key_setident(newsah, m, mhp);
	if (error) {
		return key_senderror(so, m, error);
	}

	/* create new SA entry. */
	/* We can only create a new SA if its SPI is different. */
	SAHTREE_LOCK();
	newsav = key_getsavbyspi(newsah, sa0->sadb_sa_spi);
	SAHTREE_UNLOCK();
	if (newsav != NULL) {
		ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__));
		return key_senderror(so, m, EEXIST);
	}
	newsav = KEY_NEWSAV(m, mhp, newsah, &error);
	if (newsav == NULL) {
		return key_senderror(so, m, error);
	}

	/* check SA values to be mature. */
	if ((error = key_mature(newsav)) != 0) {
		KEY_FREESAV(&newsav);
		return key_senderror(so, m, error);
	}

	/*
	 * don't call key_freesav() here, as we would like to keep the SA
	 * in the database on success.
*/ { struct mbuf *n; /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* m is retained */ static int key_setident(sah, m, mhp) struct secashead *sah; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; IPSEC_ASSERT(sah != NULL, ("null secashead")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* don't make buffer if not there */ if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL && mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { sah->idents = NULL; sah->identd = NULL; return 0; } if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL || mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__)); return EINVAL; } idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC]; iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST]; idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC]; iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST]; /* validity check */ if (idsrc->sadb_ident_type != iddst->sadb_ident_type) { ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__)); return EINVAL; } switch (idsrc->sadb_ident_type) { case SADB_IDENTTYPE_PREFIX: case SADB_IDENTTYPE_FQDN: case SADB_IDENTTYPE_USERFQDN: default: /* XXX do nothing */ sah->idents = NULL; sah->identd = NULL; return 0; } /* make structure */ sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT); if (sah->idents == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT); if (sah->identd == NULL) { free(sah->idents, M_IPSEC_MISC); sah->idents = NULL; ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } sah->idents->type = idsrc->sadb_ident_type; sah->idents->id = idsrc->sadb_ident_id; sah->identd->type = iddst->sadb_ident_type; sah->identd->id = iddst->sadb_ident_id; return 0; } /* * m will not be freed on return. * it is caller's responsibility to free the result. */ static struct mbuf * key_getmsgbuf_x1(m, mhp) struct mbuf *m; const struct sadb_msghdr *mhp; { struct mbuf *n; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 9, SADB_EXT_RESERVED, SADB_EXT_SA, SADB_X_EXT_SA2, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST, SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT, SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST); if (!n) return NULL; if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return NULL; } mtod(n, struct sadb_msg *)->sadb_msg_errno = 0; mtod(n, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); return n; } static int key_delete_all __P((struct socket *, struct mbuf *, const struct sadb_msghdr *, u_int16_t)); /* * SADB_DELETE processing * receive * * from the ikmpd, and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * * m will always be freed. 
*/ static int key_delete(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav = NULL; u_int16_t proto; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL) { /* * Caller wants us to delete all non-LARVAL SAs * that match the src/dst. This is used during * IKE INITIAL-CONTACT. */ ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__)); return key_delete_all(so, m, mhp, proto); } else if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. */ sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); if (sav) break; } if (sah == NULL) { SAHTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__)); return key_senderror(so, m, ENOENT); } key_sa_chgstate(sav, SADB_SASTATE_DEAD); SAHTREE_UNLOCK(); KEY_FREESAV(&sav); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED, SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * delete all SAs for src/dst. Called from key_delete(). 
*/ static int key_delete_all(so, m, mhp, proto) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; u_int16_t proto; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav, *nextsav; u_int stateidx, state; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* Delete all non-LARVAL SAs. */ for (stateidx = 0; - stateidx < _ARRAYLEN(V_saorder_state_alive); + stateidx < _ARRAYLEN(saorder_state_alive); stateidx++) { - state = V_saorder_state_alive[stateidx]; + state = saorder_state_alive[stateidx]; if (state == SADB_SASTATE_LARVAL) continue; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); /* sanity check */ if (sav->state != state) { ipseclog((LOG_DEBUG, "%s: invalid " "sav->state (queue %d SA %d)\n", __func__, state, sav->state)); continue; } key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } } SAHTREE_UNLOCK(); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_GET processing * receive * * from the ikmpd, and get a SP and a SA to respond, * and send, * * to the ikmpd. * * m will always be freed. 
*/ static int key_get(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav = NULL; u_int16_t proto; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. */ sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); if (sav) break; } SAHTREE_UNLOCK(); if (sah == NULL) { ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__)); return key_senderror(so, m, ENOENT); } { struct mbuf *n; u_int8_t satype; /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n", __func__)); return key_senderror(so, m, EINVAL); } /* create new sadb_msg to reply. */ n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq, mhp->msg->sadb_msg_pid); if (!n) return key_senderror(so, m, ENOBUFS); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } /* XXX make it sysctl-configurable? 
*/ static void key_getcomb_setlifetime(comb) struct sadb_comb *comb; { comb->sadb_comb_soft_allocations = 1; comb->sadb_comb_hard_allocations = 1; comb->sadb_comb_soft_bytes = 0; comb->sadb_comb_hard_bytes = 0; comb->sadb_comb_hard_addtime = 86400; /* 1 day */ comb->sadb_comb_soft_addtime = comb->sadb_comb_hard_addtime * 80 / 100; comb->sadb_comb_hard_usetime = 28800; /* 8 hours */ comb->sadb_comb_soft_usetime = comb->sadb_comb_hard_usetime * 80 / 100; } /* * XXX reorder combinations by preference * XXX no idea if the user wants ESP authentication or not */ static struct mbuf * key_getcomb_esp() { INIT_VNET_IPSEC(curvnet); struct sadb_comb *comb; struct enc_xform *algo; struct mbuf *result = NULL, *m, *n; int encmin; int i, off, o; int totlen; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_EALG_MAX; i++) { algo = esp_algorithm_lookup(i); if (algo == NULL) continue; /* discard algorithms with key size smaller than system min */ if (_BITS(algo->maxkey) < V_ipsec_esp_keymin) continue; if (_BITS(algo->minkey) < V_ipsec_esp_keymin) encmin = V_ipsec_esp_keymin; else encmin = _BITS(algo->minkey); if (V_ipsec_esp_auth) m = key_getcomb_ah(); else { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_DONTWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; bzero(mtod(m, caddr_t), m->m_len); } } if (!m) goto fail; totlen = 0; for (n = m; n; n = n->m_next) totlen += n->m_len; IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l)); for (off = 0; off < totlen; off += l) { n = m_pulldown(m, off, l, &o); if (!n) { /* m is already freed */ goto fail; } comb = (struct sadb_comb *)(mtod(n, caddr_t) + o); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; comb->sadb_comb_encrypt_minbits = encmin; comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey); } if (!result) result = m; else m_cat(result, m); } return result; fail: if (result) m_freem(result); return NULL; } static void key_getsizes_ah( const struct auth_hash *ah, int alg, u_int16_t* min, u_int16_t* max) { INIT_VNET_IPSEC(curvnet); *min = *max = ah->keysize; if (ah->keysize == 0) { /* * Transform takes arbitrary key size but algorithm * key size is restricted. Enforce this here.
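 * (Concretely, in the switch just below: keyed MD5 is pinned to a
 * 16-byte key, keyed SHA-1 to a 20-byte key, and the null transform
 * accepts anything from 1 to 256 bytes.)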
*/ switch (alg) { case SADB_X_AALG_MD5: *min = *max = 16; break; case SADB_X_AALG_SHA: *min = *max = 20; break; case SADB_X_AALG_NULL: *min = 1; *max = 256; break; default: DPRINTF(("%s: unknown AH algorithm %u\n", __func__, alg)); break; } } } /* * XXX reorder combinations by preference */ static struct mbuf * key_getcomb_ah() { INIT_VNET_IPSEC(curvnet); struct sadb_comb *comb; struct auth_hash *algo; struct mbuf *m; u_int16_t minkeysize, maxkeysize; int i; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_AALG_MAX; i++) { #if 1 /* we prefer HMAC algorithms, not old algorithms */ if (i != SADB_AALG_SHA1HMAC && i != SADB_AALG_MD5HMAC) continue; #endif algo = ah_algorithm_lookup(i); if (!algo) continue; key_getsizes_ah(algo, i, &minkeysize, &maxkeysize); /* discard algorithms with key size smaller than system min */ if (_BITS(minkeysize) < V_ipsec_ah_keymin) continue; if (!m) { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_DONTWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else M_PREPEND(m, l, M_DONTWAIT); if (!m) return NULL; comb = mtod(m, struct sadb_comb *); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_auth = i; comb->sadb_comb_auth_minbits = _BITS(minkeysize); comb->sadb_comb_auth_maxbits = _BITS(maxkeysize); } return m; } /* * not really an official behavior. discussed in pf_key@inner.net in Sep2000. * XXX reorder combinations by preference */ static struct mbuf * key_getcomb_ipcomp() { struct sadb_comb *comb; struct comp_algo *algo; struct mbuf *m; int i; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_X_CALG_MAX; i++) { algo = ipcomp_algorithm_lookup(i); if (!algo) continue; if (!m) { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_DONTWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else M_PREPEND(m, l, M_DONTWAIT); if (!m) return NULL; comb = mtod(m, struct sadb_comb *); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; /* what should we set into sadb_comb_*_{min,max}bits? */ } return m; } /* * XXX no way to pass mode (transport/tunnel) to userland * XXX replay checking? * XXX sysctl interface to ipsec_{ah,esp}_keymin */ static struct mbuf * key_getprop(saidx) const struct secasindex *saidx; { struct sadb_prop *prop; struct mbuf *m, *n; const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop)); int totlen; switch (saidx->proto) { case IPPROTO_ESP: m = key_getcomb_esp(); break; case IPPROTO_AH: m = key_getcomb_ah(); break; case IPPROTO_IPCOMP: m = key_getcomb_ipcomp(); break; default: return NULL; } if (!m) return NULL; M_PREPEND(m, l, M_DONTWAIT); if (!m) return NULL; totlen = 0; for (n = m; n; n = n->m_next) totlen += n->m_len; prop = mtod(m, struct sadb_prop *); bzero(prop, sizeof(*prop)); prop->sadb_prop_len = PFKEY_UNIT64(totlen); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; prop->sadb_prop_replay = 32; /* XXX */ return m; } /* * SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2(). * send * * to KMD, and expect to receive * with SADB_ACQUIRE if error occured, * or * with SADB_GETSPI * from KMD by PF_KEY. * * XXX x_policy is outside of RFC2367 (KAME extension). * XXX sensitivity is not supported. * XXX for ipcomp, RFC2367 does not define how to fill in proposal. * see comment for key_getcomb_ipcomp(). 
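 *
 * Before building the message, key_acquire() below throttles itself: an
 * ACQUIRE for a security association that is already pending only goes
 * out once every key_blockacq_count attempts (default 10, per key_init()
 * further below). The counter logic in isolation, as a small sketch over
 * a stand-in for the relevant part of struct secacq:
 */

struct pending_acq {
	unsigned count;
};

static int blockacq_count = 10;	/* mirrors the key_blockacq_count default */

/* Returns nonzero when an SADB_ACQUIRE should actually be sent now;
 * same shape as the count check at the top of key_acquire(). */
static int
should_send_acquire(struct pending_acq *acq)
{
	if (blockacq_count < (int)acq->count) {
		acq->count = 0;		/* threshold passed: reset and send */
		return (1);
	}
	acq->count++;			/* still blocked: suppress this one */
	return (0);
}

/*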
* * OUT: * 0 : succeed * others: error number */ static int key_acquire(const struct secasindex *saidx, struct secpolicy *sp) { INIT_VNET_IPSEC(curvnet); struct mbuf *result = NULL, *m; struct secacq *newacq; u_int8_t satype; int error = -1; u_int32_t seq; IPSEC_ASSERT(saidx != NULL, ("null saidx")); satype = key_proto2satype(saidx->proto); IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto)); /* * We never do anything about acquirng SA. There is anather * solution that kernel blocks to send SADB_ACQUIRE message until * getting something message from IKEd. In later case, to be * managed with ACQUIRING list. */ /* Get an entry to check whether sending message or not. */ if ((newacq = key_getacq(saidx)) != NULL) { if (V_key_blockacq_count < newacq->count) { /* reset counter and do send message. */ newacq->count = 0; } else { /* increment counter and do nothing. */ newacq->count++; return 0; } } else { /* make new entry for blocking to send SADB_ACQUIRE. */ if ((newacq = key_newacq(saidx)) == NULL) return ENOBUFS; } seq = newacq->seq; m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0); if (!m) { error = ENOBUFS; goto fail; } result = m; /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &saidx->src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &saidx->dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* XXX proxy address (optional) */ /* set sadb_x_policy */ if (sp) { m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); } /* XXX identity (optional) */ #if 0 if (idexttype && fqdn) { /* create identity extension (FQDN) */ struct sadb_ident *id; int fqdnlen; fqdnlen = strlen(fqdn) + 1; /* +1 for terminating-NUL */ id = (struct sadb_ident *)p; bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen)); id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen)); id->sadb_ident_exttype = idexttype; id->sadb_ident_type = SADB_IDENTTYPE_FQDN; bcopy(fqdn, id + 1, fqdnlen); p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen); } if (idexttype) { /* create identity extension (USERFQDN) */ struct sadb_ident *id; int userfqdnlen; if (userfqdn) { /* +1 for terminating-NUL */ userfqdnlen = strlen(userfqdn) + 1; } else userfqdnlen = 0; id = (struct sadb_ident *)p; bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen)); id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen)); id->sadb_ident_exttype = idexttype; id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN; /* XXX is it correct? */ if (curproc && curproc->p_cred) id->sadb_ident_id = curproc->p_cred->p_ruid; if (userfqdn && userfqdnlen) bcopy(userfqdn, id + 1, userfqdnlen); p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen); } #endif /* XXX sensitivity (optional) */ /* create proposal/combination extension */ m = key_getprop(saidx); #if 0 /* * spec conformant: always attach proposal/combination extension, * the problem is that we have no way to attach it for ipcomp, * due to the way sadb_comb is declared in RFC2367. */ if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); #else /* * outside of spec; make proposal/combination extension optional. 
*/ if (m) m_cat(result, m); #endif if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); return error; } static struct secacq * key_newacq(const struct secasindex *saidx) { INIT_VNET_IPSEC(curvnet); struct secacq *newacq; /* get new entry */ newacq = malloc(sizeof(struct secacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO); if (newacq == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return NULL; } /* copy secindex */ bcopy(saidx, &newacq->saidx, sizeof(newacq->saidx)); newacq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq); newacq->created = time_second; newacq->count = 0; /* add to acqtree */ ACQ_LOCK(); LIST_INSERT_HEAD(&V_acqtree, newacq, chain); ACQ_UNLOCK(); return newacq; } static struct secacq * key_getacq(const struct secasindex *saidx) { INIT_VNET_IPSEC(curvnet); struct secacq *acq; ACQ_LOCK(); LIST_FOREACH(acq, &V_acqtree, chain) { if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY)) break; } ACQ_UNLOCK(); return acq; } static struct secacq * key_getacqbyseq(seq) u_int32_t seq; { INIT_VNET_IPSEC(curvnet); struct secacq *acq; ACQ_LOCK(); LIST_FOREACH(acq, &V_acqtree, chain) { if (acq->seq == seq) break; } ACQ_UNLOCK(); return acq; } static struct secspacq * key_newspacq(spidx) struct secpolicyindex *spidx; { INIT_VNET_IPSEC(curvnet); struct secspacq *acq; /* get new entry */ acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO); if (acq == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return NULL; } /* copy secindex */ bcopy(spidx, &acq->spidx, sizeof(acq->spidx)); acq->created = time_second; acq->count = 0; /* add to spacqtree */ SPACQ_LOCK(); LIST_INSERT_HEAD(&V_spacqtree, acq, chain); SPACQ_UNLOCK(); return acq; } static struct secspacq * key_getspacq(spidx) struct secpolicyindex *spidx; { INIT_VNET_IPSEC(curvnet); struct secspacq *acq; SPACQ_LOCK(); LIST_FOREACH(acq, &V_spacqtree, chain) { if (key_cmpspidx_exactly(spidx, &acq->spidx)) { /* NB: return holding spacq_lock */ return acq; } } SPACQ_UNLOCK(); return NULL; } /* * SADB_ACQUIRE processing, * in first situation, is receiving * * from the ikmpd, and clear sequence of its secasvar entry. * * In second situation, is receiving * * from a user land process, and return * * to the socket. * * m will always be freed. */ static int key_acquire2(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); const struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; u_int16_t proto; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* * Error message from KMd. * We assume that if error was occured in IKEd, the length of PFKEY * message is equal to the size of sadb_msg structure. * We do not raise error even if error occured in this function. 
*/ if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) { struct secacq *acq; /* check sequence number */ if (mhp->msg->sadb_msg_seq == 0) { ipseclog((LOG_DEBUG, "%s: must specify sequence " "number.\n", __func__)); m_freem(m); return 0; } if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) == NULL) { /* * the specified larval SA is already gone, or we got * a bogus sequence number. we can silently ignore it. */ m_freem(m); return 0; } /* reset acq counter in order to deletion by timehander. */ acq->created = time_second; acq->count = 0; m_freem(m); return 0; } /* * This message is from user land. */ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_EXT_PROPOSAL] == NULL) { /* error */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_PROPOSAL] < sizeof(struct sadb_prop)) { /* error */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* get a SA index */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID)) break; } SAHTREE_UNLOCK(); if (sah != NULL) { ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } error = key_acquire(&saidx, NULL); if (error != 0) { ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n", __func__, mhp->msg->sadb_msg_errno)); return key_senderror(so, m, error); } return key_sendup_mbuf(so, m, KEY_SENDUP_REGISTERED); } /* * SADB_REGISTER processing. * If SATYPE_UNSPEC has been passed as satype, only return sabd_supported. * receive * * from the ikmpd, and register a socket to send PF_KEY messages, * and send * * to KMD by PF_KEY. * If socket is detached, must free from regnode. * * m will always be freed. */ static int key_register(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct secreg *reg, *newreg = 0; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* check for invalid register message */ if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0])) return key_senderror(so, m, EINVAL); /* When SATYPE_UNSPEC is specified, only return sabd_supported. 
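 * (In other words, a register request carrying SADB_SATYPE_UNSPEC is a
 * pure algorithm query: no regnode is created, and control jumps
 * straight to assembling the SADB_EXT_SUPPORTED reply at setmsg.)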
*/ if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC) goto setmsg; /* check whether existing or not */ REGTREE_LOCK(); LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) { if (reg->so == so) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: socket exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } } /* create regnode */ newreg = malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO); if (newreg == NULL) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } newreg->so = so; ((struct keycb *)sotorawcb(so))->kp_registered++; /* add regnode to regtree. */ LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain); REGTREE_UNLOCK(); setmsg: { struct mbuf *n; struct sadb_msg *newmsg; struct sadb_supported *sup; u_int len, alen, elen; int off; int i; struct sadb_alg *alg; /* create new sadb_msg to reply. */ alen = 0; for (i = 1; i <= SADB_AALG_MAX; i++) { if (ah_algorithm_lookup(i)) alen += sizeof(struct sadb_alg); } if (alen) alen += sizeof(struct sadb_supported); elen = 0; for (i = 1; i <= SADB_EALG_MAX; i++) { if (esp_algorithm_lookup(i)) elen += sizeof(struct sadb_alg); } if (elen) elen += sizeof(struct sadb_supported); len = sizeof(struct sadb_msg) + alen + elen; if (len > MCLBYTES) return key_senderror(so, m, ENOBUFS); MGETHDR(n, M_DONTWAIT, MT_DATA); if (len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_pkthdr.len = n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(len); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); /* for authentication algorithm */ if (alen) { sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(alen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; off += PFKEY_ALIGN8(sizeof(*sup)); for (i = 1; i <= SADB_AALG_MAX; i++) { struct auth_hash *aalgo; u_int16_t minkeysize, maxkeysize; aalgo = ah_algorithm_lookup(i); if (!aalgo) continue; alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = 0; key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize); alg->sadb_alg_minbits = _BITS(minkeysize); alg->sadb_alg_maxbits = _BITS(maxkeysize); off += PFKEY_ALIGN8(sizeof(*alg)); } } /* for encryption algorithm */ if (elen) { sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(elen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; off += PFKEY_ALIGN8(sizeof(*sup)); for (i = 1; i <= SADB_EALG_MAX; i++) { struct enc_xform *ealgo; ealgo = esp_algorithm_lookup(i); if (!ealgo) continue; alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = ealgo->blocksize; alg->sadb_alg_minbits = _BITS(ealgo->minkey); alg->sadb_alg_maxbits = _BITS(ealgo->maxkey); off += PFKEY_ALIGN8(sizeof(struct sadb_alg)); } } IPSEC_ASSERT(off == len, ("length assumption failed (off %u len %u)", off, len)); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED); } } /* * free secreg entry registered. * XXX: I want to do free a socket marked done SADB_RESIGER to socket. */ void key_freereg(struct socket *so) { INIT_VNET_IPSEC(curvnet); struct secreg *reg; int i; IPSEC_ASSERT(so != NULL, ("NULL so")); /* * check whether existing or not. 
* check all type of SA, because there is a potential that * one socket is registered to multiple type of SA. */ REGTREE_LOCK(); for (i = 0; i <= SADB_SATYPE_MAX; i++) { LIST_FOREACH(reg, &V_regtree[i], chain) { if (reg->so == so && __LIST_CHAINED(reg)) { LIST_REMOVE(reg, chain); free(reg, M_IPSEC_SAR); break; } } } REGTREE_UNLOCK(); } /* * SADB_EXPIRE processing * send * * to KMD by PF_KEY. * NOTE: We send only soft lifetime extension. * * OUT: 0 : succeed * others : error number */ static int key_expire(struct secasvar *sav) { int s; int satype; struct mbuf *result = NULL, *m; int len; int error = -1; struct sadb_lifetime *lt; /* XXX: Why do we lock ? */ s = splnet(); /*called from softclock()*/ IPSEC_ASSERT (sav != NULL, ("null sav")); IPSEC_ASSERT (sav->sah != NULL, ("null sa header")); /* set msg header */ satype = key_proto2satype(sav->sah->saidx.proto); IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype)); m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt); if (!m) { error = ENOBUFS; goto fail; } result = m; /* create SA extension */ m = key_setsadbsa(sav); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* create SA extension */ m = key_setsadbxsa2(sav->sah->saidx.mode, sav->replay ? sav->replay->count : 0, sav->sah->saidx.reqid); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* create lifetime extension (current and soft) */ len = PFKEY_ALIGN8(sizeof(*lt)) * 2; m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); error = ENOBUFS; goto fail; } bzero(mtod(m, caddr_t), len); lt = mtod(m, struct sadb_lifetime *); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lt->sadb_lifetime_allocations = sav->lft_c->allocations; lt->sadb_lifetime_bytes = sav->lft_c->bytes; lt->sadb_lifetime_addtime = sav->lft_c->addtime; lt->sadb_lifetime_usetime = sav->lft_c->usetime; lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lt->sadb_lifetime_allocations = sav->lft_s->allocations; lt->sadb_lifetime_bytes = sav->lft_s->bytes; lt->sadb_lifetime_addtime = sav->lft_s->addtime; lt->sadb_lifetime_usetime = sav->lft_s->usetime; m_cat(result, m); /* set sadb_address for source */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sav->sah->saidx.src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set sadb_address for destination */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sav->sah->saidx.dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); splx(s); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); splx(s); return error; } /* * SADB_FLUSH processing * receive * * from the ikmpd, and free all entries in secastree. * and send, * * to the ikmpd. * NOTE: to do is only marking SADB_SASTATE_DEAD. * * m will always be freed. 
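 *
 * key_expire() above ends with an idiom repeated by key_acquire() and
 * others: after m_cat()ing the extensions together, the packet-header
 * length is recomputed as the sum of the fragment lengths before
 * sadb_msg_len is stamped. The fix-up in isolation, over a toy chain
 * type that models only the fields the idiom touches:
 */

#include <stddef.h>

struct chain {				/* stand-in for an mbuf chain */
	struct chain *m_next;
	size_t m_len;			/* bytes in this fragment */
	size_t pkthdr_len;		/* meaningful on the head only */
};

static void
recompute_pkthdr_len(struct chain *result)
{
	struct chain *m;

	result->pkthdr_len = 0;
	for (m = result; m != NULL; m = m->m_next)
		result->pkthdr_len += m->m_len;
}

/*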
*/ static int key_flush(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_msg *newmsg; struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; u_int16_t proto; u_int8_t state; u_int stateidx; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* no SATYPE specified, i.e. flushing all SA. */ SAHTREE_LOCK(); for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { nextsah = LIST_NEXT(sah, chain); if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; - stateidx < _ARRAYLEN(V_saorder_state_alive); + stateidx < _ARRAYLEN(saorder_state_alive); stateidx++) { - state = V_saorder_state_any[stateidx]; + state = saorder_state_any[stateidx]; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } sah->state = SADB_SASTATE_DEAD; } SAHTREE_UNLOCK(); if (m->m_len < sizeof(struct sadb_msg) || sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } if (m->m_next) m_freem(m->m_next); m->m_next = NULL; m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg); newmsg = mtod(m, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } /* * SADB_DUMP processing * dump all entries including status of DEAD in SAD. * receive * * from the ikmpd, and dump all secasvar leaves * and send, * ..... * to the ikmpd. * * m will always be freed. */ static int key_dump(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int16_t proto; u_int stateidx; u_int8_t satype; u_int8_t state; int cnt; struct sadb_msg *newmsg; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* count sav entries to be sent to the userland. */ cnt = 0; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; - stateidx < _ARRAYLEN(V_saorder_state_any); + stateidx < _ARRAYLEN(saorder_state_any); stateidx++) { - state = V_saorder_state_any[stateidx]; + state = saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { cnt++; } } } if (cnt == 0) { SAHTREE_UNLOCK(); return key_senderror(so, m, ENOENT); } /* send this to the userland, one at a time. 
*/ newmsg = NULL; LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { SAHTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: there was invalid proto in " "SAD.\n", __func__)); return key_senderror(so, m, EINVAL); } for (stateidx = 0; - stateidx < _ARRAYLEN(V_saorder_state_any); + stateidx < _ARRAYLEN(saorder_state_any); stateidx++) { - state = V_saorder_state_any[stateidx]; + state = saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { n = key_setdumpsa(sav, SADB_DUMP, satype, --cnt, mhp->msg->sadb_msg_pid); if (!n) { SAHTREE_UNLOCK(); return key_senderror(so, m, ENOBUFS); } key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } } SAHTREE_UNLOCK(); m_freem(m); return 0; } /* * SADB_X_PROMISC processing * * m will always be freed. */ static int key_promisc(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { int olen; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len); if (olen < sizeof(struct sadb_msg)) { #if 1 return key_senderror(so, m, EINVAL); #else m_freem(m); return 0; #endif } else if (olen == sizeof(struct sadb_msg)) { /* enable/disable promisc mode */ struct keycb *kp; if ((kp = (struct keycb *)sotorawcb(so)) == NULL) return key_senderror(so, m, EINVAL); mhp->msg->sadb_msg_errno = 0; switch (mhp->msg->sadb_msg_satype) { case 0: case 1: kp->kp_promisc = mhp->msg->sadb_msg_satype; break; default: return key_senderror(so, m, EINVAL); } /* send the original message back to everyone */ mhp->msg->sadb_msg_errno = 0; return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } else { /* send packet as is */ m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg))); /* TODO: if sadb_msg_seq is specified, send to specific pid */ return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } } static int (*key_typesw[]) __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)) = { NULL, /* SADB_RESERVED */ key_getspi, /* SADB_GETSPI */ key_update, /* SADB_UPDATE */ key_add, /* SADB_ADD */ key_delete, /* SADB_DELETE */ key_get, /* SADB_GET */ key_acquire2, /* SADB_ACQUIRE */ key_register, /* SADB_REGISTER */ NULL, /* SADB_EXPIRE */ key_flush, /* SADB_FLUSH */ key_dump, /* SADB_DUMP */ key_promisc, /* SADB_X_PROMISC */ NULL, /* SADB_X_PCHANGE */ key_spdadd, /* SADB_X_SPDUPDATE */ key_spdadd, /* SADB_X_SPDADD */ key_spddelete, /* SADB_X_SPDDELETE */ key_spdget, /* SADB_X_SPDGET */ NULL, /* SADB_X_SPDACQUIRE */ key_spddump, /* SADB_X_SPDDUMP */ key_spdflush, /* SADB_X_SPDFLUSH */ key_spdadd, /* SADB_X_SPDSETIDX */ NULL, /* SADB_X_SPDEXPIRE */ key_spddelete2, /* SADB_X_SPDDELETE2 */ }; /* * parse sadb_msg buffer to process PFKEYv2, * and create a data to response if needed. * I think to be dealed with mbuf directly. * IN: * msgp : pointer to pointer to a received buffer pulluped. * This is rewrited to response. * so : pointer to socket. * OUT: * length for buffer to send to user process. 
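 *
 * The key_typesw[] table above is a plain jump table indexed by
 * sadb_msg_type, with NULL marking types that only travel from kernel to
 * userland (SADB_EXPIRE, SADB_X_PCHANGE, ...). key_parse() bounds-checks
 * the index and rejects NULL slots before dispatching; the pattern in a
 * free-standing sketch with made-up handlers:
 */

#include <stdio.h>

typedef int (*handler_t)(int);

static int handle_flush(int arg) { printf("flush %d\n", arg); return (0); }
static int handle_dump(int arg)  { printf("dump %d\n", arg);  return (0); }

static handler_t typesw[] = {
	NULL,			/* 0: reserved, never dispatched */
	handle_flush,		/* 1 */
	handle_dump,		/* 2 */
};

static int
dispatch(unsigned type, int arg)
{
	/* same order as key_parse(): range check, then NULL check */
	if (type >= sizeof(typesw) / sizeof(typesw[0]) ||
	    typesw[type] == NULL)
		return (-1);	/* the kernel answers EINVAL here */
	return ((*typesw[type])(arg));
}

/*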
*/ int key_parse(m, so) struct mbuf *m; struct socket *so; { INIT_VNET_IPSEC(curvnet); struct sadb_msg *msg; struct sadb_msghdr mh; u_int orglen; int error; int target; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); #if 0 /*kdebug_sadb assumes msg in linear buffer*/ KEYDEBUG(KEYDEBUG_KEY_DUMP, ipseclog((LOG_DEBUG, "%s: passed sadb_msg\n", __func__)); kdebug_sadb(msg)); #endif if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) return ENOBUFS; } msg = mtod(m, struct sadb_msg *); orglen = PFKEY_UNUNIT64(msg->sadb_msg_len); target = KEY_SENDUP_ONE; if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != orglen) { ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__)); V_pfkeystat.out_invlen++; error = EINVAL; goto senderror; } if (msg->sadb_msg_version != PF_KEY_V2) { ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n", __func__, msg->sadb_msg_version)); V_pfkeystat.out_invver++; error = EINVAL; goto senderror; } if (msg->sadb_msg_type > SADB_MAX) { ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_type)); V_pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; } /* for old-fashioned code - should be nuked */ if (m->m_pkthdr.len > MCLBYTES) { m_freem(m); return ENOBUFS; } if (m->m_next) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_DATA); if (n && m->m_pkthdr.len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) { m_freem(m); return ENOBUFS; } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_pkthdr.len = n->m_len = m->m_pkthdr.len; n->m_next = NULL; m_freem(m); m = n; } /* align the mbuf chain so that extensions are in contiguous region. */ error = key_align(m, &mh); if (error) return error; msg = mh.msg; /* check SA type */ switch (msg->sadb_msg_satype) { case SADB_SATYPE_UNSPEC: switch (msg->sadb_msg_type) { case SADB_GETSPI: case SADB_UPDATE: case SADB_ADD: case SADB_DELETE: case SADB_GET: case SADB_ACQUIRE: case SADB_EXPIRE: ipseclog((LOG_DEBUG, "%s: must specify satype " "when msg type=%u.\n", __func__, msg->sadb_msg_type)); V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } break; case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_IPCOMP: case SADB_X_SATYPE_TCPSIGNATURE: switch (msg->sadb_msg_type) { case SADB_X_SPDADD: case SADB_X_SPDDELETE: case SADB_X_SPDGET: case SADB_X_SPDDUMP: case SADB_X_SPDFLUSH: case SADB_X_SPDSETIDX: case SADB_X_SPDUPDATE: case SADB_X_SPDDELETE2: ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n", __func__, msg->sadb_msg_type)); V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } break; case SADB_SATYPE_RSVP: case SADB_SATYPE_OSPFV2: case SADB_SATYPE_RIPV2: case SADB_SATYPE_MIP: ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n", __func__, msg->sadb_msg_satype)); V_pfkeystat.out_invsatype++; error = EOPNOTSUPP; goto senderror; case 1: /* XXX: What does it do?
*/ if (msg->sadb_msg_type == SADB_X_PROMISC) break; /*FALLTHROUGH*/ default: ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_satype)); V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } /* check field of upper layer protocol and address family */ if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL && mh.ext[SADB_EXT_ADDRESS_DST] != NULL) { struct sadb_address *src0, *dst0; u_int plen; src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]); /* check upper layer protocol */ if (src0->sadb_address_proto != dst0->sadb_address_proto) { ipseclog((LOG_DEBUG, "%s: upper layer protocol " "mismatched.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } /* check family */ if (PFKEY_ADDR_SADDR(src0)->sa_family != PFKEY_ADDR_SADDR(dst0)->sa_family) { ipseclog((LOG_DEBUG, "%s: address family mismatched.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } if (PFKEY_ADDR_SADDR(src0)->sa_len != PFKEY_ADDR_SADDR(dst0)->sa_len) { ipseclog((LOG_DEBUG, "%s: address struct size " "mismatched.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in)) { V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } break; case AF_INET6: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in6)) { V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } break; default: ipseclog((LOG_DEBUG, "%s: unsupported address family\n", __func__)); V_pfkeystat.out_invaddr++; error = EAFNOSUPPORT; goto senderror; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: plen = 0; /*fool gcc*/ break; } /* check max prefix length */ if (src0->sadb_address_prefixlen > plen || dst0->sadb_address_prefixlen > plen) { ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } /* * prefixlen == 0 is valid because there can be a case when * all addresses are matched. */ } if (msg->sadb_msg_type >= sizeof(key_typesw)/sizeof(key_typesw[0]) || key_typesw[msg->sadb_msg_type] == NULL) { V_pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; } return (*key_typesw[msg->sadb_msg_type])(so, m, &mh); senderror: msg->sadb_msg_errno = error; return key_sendup_mbuf(so, m, target); } static int key_senderror(so, m, code) struct socket *so; struct mbuf *m; int code; { struct sadb_msg *msg; IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg), ("mbuf too small, len %u", m->m_len)); msg = mtod(m, struct sadb_msg *); msg->sadb_msg_errno = code; return key_sendup_mbuf(so, m, KEY_SENDUP_ONE); } /* * set the pointer to each header into message buffer. * m will be freed on error. * XXX larger-than-MCLBYTES extension? 
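 *
 * key_align() below walks the message as a type-length-value sequence:
 * start right after the base header, read each extension's generic
 * header, convert its 8-byte-unit length to bytes, record the pointer,
 * and insist the walk lands exactly on the end of the message. The loop
 * shape over a flat buffer, as a hedged sketch (the two-field generic
 * header mirrors RFC 2367's sadb_ext):
 */

#include <stdint.h>
#include <string.h>

struct ext_hdr {		/* generic extension header, wire layout */
	uint16_t len;		/* total extension length, 8-byte units */
	uint16_t type;
};

static int
walk_extensions(const uint8_t *buf, size_t hdrlen, size_t total)
{
	struct ext_hdr eh;
	size_t off, extlen;

	for (off = hdrlen; off < total; off += extlen) {
		if (total - off < sizeof(eh))
			return (-1);	/* truncated generic header */
		memcpy(&eh, buf + off, sizeof(eh));
		extlen = (size_t)eh.len << 3;	/* PFKEY_UNUNIT64 */
		if (extlen < sizeof(eh) || extlen > total - off)
			return (-1);	/* bogus self-declared length */
		/* ... a real parser records buf + off by eh.type here ... */
	}
	return (off == total ? 0 : -1);	/* must end exactly at the end */
}

/*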
*/ static int key_align(m, mhp) struct mbuf *m; struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct mbuf *n; struct sadb_ext *ext; size_t off, end; int extlen; int toff; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg), ("mbuf too small, len %u", m->m_len)); /* initialize */ bzero(mhp, sizeof(*mhp)); mhp->msg = mtod(m, struct sadb_msg *); mhp->ext[0] = (struct sadb_ext *)mhp->msg; /*XXX backward compat */ end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len); extlen = end; /*just in case extlen is not updated*/ for (off = sizeof(struct sadb_msg); off < end; off += extlen) { n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff); if (!n) { /* m is already freed */ return ENOBUFS; } ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); /* set pointer */ switch (ext->sadb_ext_type) { case SADB_EXT_SA: case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_EXT_LIFETIME_CURRENT: case SADB_EXT_LIFETIME_HARD: case SADB_EXT_LIFETIME_SOFT: case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: case SADB_EXT_SENSITIVITY: case SADB_EXT_PROPOSAL: case SADB_EXT_SUPPORTED_AUTH: case SADB_EXT_SUPPORTED_ENCRYPT: case SADB_EXT_SPIRANGE: case SADB_X_EXT_POLICY: case SADB_X_EXT_SA2: /* duplicate check */ /* * XXX Are there duplication payloads of either * KEY_AUTH or KEY_ENCRYPT ? */ if (mhp->ext[ext->sadb_ext_type] != NULL) { ipseclog((LOG_DEBUG, "%s: duplicate ext_type " "%u\n", __func__, ext->sadb_ext_type)); m_freem(m); V_pfkeystat.out_dupext++; return EINVAL; } break; default: ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n", __func__, ext->sadb_ext_type)); m_freem(m); V_pfkeystat.out_invexttype++; return EINVAL; } extlen = PFKEY_UNUNIT64(ext->sadb_ext_len); if (key_validate_ext(ext, extlen)) { m_freem(m); V_pfkeystat.out_invlen++; return EINVAL; } n = m_pulldown(m, off, extlen, &toff); if (!n) { /* m is already freed */ return ENOBUFS; } ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); mhp->ext[ext->sadb_ext_type] = ext; mhp->extoff[ext->sadb_ext_type] = off; mhp->extlen[ext->sadb_ext_type] = extlen; } if (off != end) { m_freem(m); V_pfkeystat.out_invlen++; return EINVAL; } return 0; } static int key_validate_ext(ext, len) const struct sadb_ext *ext; int len; { const struct sockaddr *sa; enum { NONE, ADDR } checktype = NONE; int baselen = 0; const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len); if (len != PFKEY_UNUNIT64(ext->sadb_ext_len)) return EINVAL; /* if it does not match minimum/maximum length, bail */ if (ext->sadb_ext_type >= sizeof(minsize) / sizeof(minsize[0]) || ext->sadb_ext_type >= sizeof(maxsize) / sizeof(maxsize[0])) return EINVAL; if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type]) return EINVAL; if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type]) return EINVAL; /* more checks based on sadb_ext_type XXX need more */ switch (ext->sadb_ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: baselen = PFKEY_ALIGN8(sizeof(struct sadb_address)); checktype = ADDR; break; case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: if (((const struct sadb_ident *)ext)->sadb_ident_type == SADB_X_IDENTTYPE_ADDR) { baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident)); checktype = ADDR; } else checktype = NONE; break; default: checktype = NONE; break; } switch (checktype) { case NONE: break; case ADDR: sa = (const struct sockaddr *)(((const 
u_int8_t*)ext)+baselen); if (len < baselen + sal) return EINVAL; if (baselen + PFKEY_ALIGN8(sa->sa_len) != len) return EINVAL; break; } return 0; } void key_init(void) { INIT_VNET_IPSEC(curvnet); int i; V_key_debug_level = 0; V_key_spi_trycnt = 1000; V_key_spi_minval = 0x100; V_key_spi_maxval = 0x0fffffff; /* XXX */ V_policy_id = 0; V_key_int_random = 60; /*interval to initialize randseed,1(m)*/ V_key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ V_key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ V_key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ V_key_preferred_oldsa = 1; /* preferred old sa rather than new sa*/ V_acq_seq = 0; V_ipsec_esp_keymin = 256; V_ipsec_esp_auth = 0; V_ipsec_ah_keymin = 128; SPTREE_LOCK_INIT(); REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); ACQ_LOCK_INIT(); SPACQ_LOCK_INIT(); for (i = 0; i < IPSEC_DIR_MAX; i++) LIST_INIT(&V_sptree[i]); LIST_INIT(&V_sahtree); for (i = 0; i <= SADB_SATYPE_MAX; i++) LIST_INIT(&V_regtree[i]); LIST_INIT(&V_acqtree); LIST_INIT(&V_spacqtree); /* system default */ V_ip4_def_policy.policy = IPSEC_POLICY_NONE; V_ip4_def_policy.refcnt++; /*never reclaim this*/ #ifndef IPSEC_DEBUG2 timeout((void *)key_timehandler, (void *)0, hz); #endif /*IPSEC_DEBUG2*/ /* initialize key statistics */ keystat.getspi_count = 1; printf("IPsec: Initialized Security Association Processing.\n"); return; } /* * XXX: maybe This function is called after INBOUND IPsec processing. * * Special check for tunnel-mode packets. * We must make some checks for consistency between inner and outer IP header. * * xxx more checks to be provided */ int key_checktunnelsanity(sav, family, src, dst) struct secasvar *sav; u_int family; caddr_t src; caddr_t dst; { IPSEC_ASSERT(sav->sah != NULL, ("null SA header")); /* XXX: check inner IP header */ return 1; } /* record data transfer on SA, and update timestamps */ void key_sa_recordxfer(sav, m) struct secasvar *sav; struct mbuf *m; { IPSEC_ASSERT(sav != NULL, ("Null secasvar")); IPSEC_ASSERT(m != NULL, ("Null mbuf")); if (!sav->lft_c) return; /* * XXX Currently, there is a difference of bytes size * between inbound and outbound processing. */ sav->lft_c->bytes += m->m_pkthdr.len; /* to check bytes lifetime is done in key_timehandler(). */ /* * We use the number of packets as the unit of * allocations. We increment the variable * whenever {esp,ah}_{in,out}put is called. */ sav->lft_c->allocations++; /* XXX check for expires? */ /* * NOTE: We record CURRENT usetime by using wall clock, * in seconds. HARD and SOFT lifetime are measured by the time * difference (again in seconds) from usetime. * * usetime * v expire expire * -----+-----+--------+---> t * <--------------> HARD * <-----> SOFT */ sav->lft_c->usetime = time_second; /* XXX check for expires? 
*/ return; } /* dumb version */ void key_sa_routechange(dst) struct sockaddr *dst; { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct route *ro; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { ro = &sah->sa_route; if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len && bcmp(dst, &ro->ro_dst, dst->sa_len) == 0) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } } SAHTREE_UNLOCK(); } static void key_sa_chgstate(sav, state) struct secasvar *sav; u_int8_t state; { IPSEC_ASSERT(sav != NULL, ("NULL sav")); SAHTREE_LOCK_ASSERT(); if (sav->state != state) { if (__LIST_CHAINED(sav)) LIST_REMOVE(sav, chain); sav->state = state; LIST_INSERT_HEAD(&sav->sah->savtree[state], sav, chain); } } void key_sa_stir_iv(sav) struct secasvar *sav; { IPSEC_ASSERT(sav->iv != NULL, ("null IV")); key_randomfill(sav->iv, sav->ivlen); } /* XXX too much? */ static struct mbuf * key_alloc_mbuf(l) int l; { struct mbuf *m = NULL, *n; int len, t; len = l; while (len > 0) { MGET(n, M_DONTWAIT, MT_DATA); if (n && len > MLEN) MCLGET(n, M_DONTWAIT); if (!n) { m_freem(m); return NULL; } n->m_next = NULL; n->m_len = 0; n->m_len = M_TRAILINGSPACE(n); /* use the bottom of mbuf, hoping we can prepend afterwards */ if (n->m_len > len) { t = (n->m_len - len) & ~(sizeof(long) - 1); n->m_data += t; n->m_len = len; } len -= n->m_len; if (m) m_cat(m, n); else m = n; } return m; } /* * Take one of the kernel's security keys and convert it into a PF_KEY * structure within an mbuf, suitable for sending up to a waiting * application in user land. * * IN: * src: A pointer to a kernel security key. * exttype: Which type of key this is. Refer to the PF_KEY data structures. * OUT: * a valid mbuf or NULL indicating an error * */ static struct mbuf * key_setkey(struct seckey *src, u_int16_t exttype) { struct mbuf *m; struct sadb_key *p; int len; if (src == NULL) return NULL; len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src)); m = key_alloc_mbuf(len); if (m == NULL) return NULL; p = mtod(m, struct sadb_key *); bzero(p, len); p->sadb_key_len = PFKEY_UNIT64(len); p->sadb_key_exttype = exttype; p->sadb_key_bits = src->bits; bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src)); return m; } /* * Take one of the kernel's lifetime data structures and convert it * into a PF_KEY structure within an mbuf, suitable for sending up to * a waiting application in user land. * * IN: * src: A pointer to a kernel lifetime structure. * exttype: Which type of lifetime this is. Refer to the PF_KEY * data structures for more information. * OUT: * a valid mbuf or NULL indicating an error * */ static struct mbuf * key_setlifetime(struct seclifetime *src, u_int16_t exttype) { struct mbuf *m = NULL; struct sadb_lifetime *p; int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime)); if (src == NULL) return NULL; m = key_alloc_mbuf(len); if (m == NULL) return m; p = mtod(m, struct sadb_lifetime *); bzero(p, len); p->sadb_lifetime_len = PFKEY_UNIT64(len); p->sadb_lifetime_exttype = exttype; p->sadb_lifetime_allocations = src->allocations; p->sadb_lifetime_bytes = src->bytes; p->sadb_lifetime_addtime = src->addtime; p->sadb_lifetime_usetime = src->usetime; return m; } Index: head/sys/netipsec/keysock.c =================================================================== --- head/sys/netipsec/keysock.c (revision 185347) +++ head/sys/netipsec/keysock.c (revision 185348) @@ -1,589 +1,589 @@ /* $FreeBSD$ */ /* $KAME: keysock.c,v 1.25 2001/08/13 20:07:41 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ipsec.h" /* This code has derived from sys/net/rtsock.c on FreeBSD2.2.5 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct key_cb { int key_count; int any_count; }; #ifdef VIMAGE_GLOBALS static struct key_cb key_cb; struct pfkeystat pfkeystat; #endif -static struct sockaddr key_src = { 2, PF_KEY, }; +static struct sockaddr key_src = { 2, PF_KEY }; static int key_sendup0 __P((struct rawcb *, struct mbuf *, int)); /* * key_output() */ int key_output(struct mbuf *m, struct socket *so) { INIT_VNET_IPSEC(curvnet); struct sadb_msg *msg; int len, error = 0; if (m == 0) panic("%s: NULL pointer was passed.\n", __func__); V_pfkeystat.out_total++; V_pfkeystat.out_bytes += m->m_pkthdr.len; len = m->m_pkthdr.len; if (len < sizeof(struct sadb_msg)) { V_pfkeystat.out_tooshort++; error = EINVAL; goto end; } if (m->m_len < sizeof(struct sadb_msg)) { if ((m = m_pullup(m, sizeof(struct sadb_msg))) == 0) { V_pfkeystat.out_nomem++; error = ENOBUFS; goto end; } } M_ASSERTPKTHDR(m); KEYDEBUG(KEYDEBUG_KEY_DUMP, kdebug_mbuf(m)); msg = mtod(m, struct sadb_msg *); V_pfkeystat.out_msgtype[msg->sadb_msg_type]++; if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) { V_pfkeystat.out_invlen++; error = EINVAL; goto end; } error = key_parse(m, so); m = NULL; end: if (m) m_freem(m); return error; } /* * send message to the socket. 
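 *
 * The peer of this code is an ordinary userland PF_KEY socket: whatever
 * key_sendup0() appends to the receive buffer arrives there as one
 * sadb_msg-prefixed datagram per read. A minimal consumer, assuming a
 * system with <net/pfkeyv2.h> and sufficient privilege (key_attach()
 * below enforces PRIV_NET_RAW on the kernel side):
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <net/pfkeyv2.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	uint64_t buf[512];		/* 4 KB, aligned for sadb_msg */
	struct sadb_msg *msg = (struct sadb_msg *)buf;
	ssize_t n;
	int s;

	s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
	if (s < 0) {
		perror("socket(PF_KEY)");
		return (1);
	}
	n = read(s, buf, sizeof(buf));	/* one PF_KEY message per read */
	if (n >= (ssize_t)sizeof(struct sadb_msg))
		printf("type %u, %u bytes\n", msg->sadb_msg_type,
		    (unsigned)msg->sadb_msg_len << 3);
	close(s);
	return (0);
}

/*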
*/ static int key_sendup0(rp, m, promisc) struct rawcb *rp; struct mbuf *m; int promisc; { INIT_VNET_IPSEC(curvnet); int error; if (promisc) { struct sadb_msg *pmsg; M_PREPEND(m, sizeof(struct sadb_msg), M_DONTWAIT); if (m && m->m_len < sizeof(struct sadb_msg)) m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) { V_pfkeystat.in_nomem++; m_freem(m); return ENOBUFS; } m->m_pkthdr.len += sizeof(*pmsg); pmsg = mtod(m, struct sadb_msg *); bzero(pmsg, sizeof(*pmsg)); pmsg->sadb_msg_version = PF_KEY_V2; pmsg->sadb_msg_type = SADB_X_PROMISC; pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); /* pid and seq? */ V_pfkeystat.in_msgtype[pmsg->sadb_msg_type]++; } - if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&V_key_src, + if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, m, NULL)) { V_pfkeystat.in_nomem++; m_freem(m); error = ENOBUFS; } else error = 0; sorwakeup(rp->rcb_socket); return error; } /* XXX this interface should be obsoleted. */ int key_sendup(so, msg, len, target) struct socket *so; struct sadb_msg *msg; u_int len; int target; /*target of the resulting message*/ { INIT_VNET_IPSEC(curvnet); struct mbuf *m, *n, *mprev; int tlen; /* sanity check */ if (so == 0 || msg == 0) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_KEY_DUMP, printf("%s: \n", __func__); kdebug_sadb(msg)); /* * we increment statistics here, just in case we have ENOBUFS * in this function. */ V_pfkeystat.in_total++; V_pfkeystat.in_bytes += len; V_pfkeystat.in_msgtype[msg->sadb_msg_type]++; /* * Get mbuf chain whenever possible (not clusters), * to save socket buffer. We'll be generating many SADB_ACQUIRE * messages to listening key sockets. If we simply allocate clusters, * sbappendaddr() will raise ENOBUFS due to too little sbspace(). * sbspace() computes # of actual data bytes AND mbuf region. * * TODO: SADB_ACQUIRE filters should be implemented. */ tlen = len; m = mprev = NULL; while (tlen > 0) { if (tlen == len) { MGETHDR(n, M_DONTWAIT, MT_DATA); if (n == NULL) { V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MHLEN; } else { MGET(n, M_DONTWAIT, MT_DATA); if (n == NULL) { V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MLEN; } if (tlen >= MCLBYTES) { /*XXX better threshold? 
*/ MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); m_freem(m); V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MCLBYTES; } if (tlen < n->m_len) n->m_len = tlen; n->m_next = NULL; if (m == NULL) m = mprev = n; else { mprev->m_next = n; mprev = n; } tlen -= n->m_len; n = NULL; } m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m_copyback(m, 0, len, (caddr_t)msg); /* avoid duplicated statistics */ V_pfkeystat.in_total--; V_pfkeystat.in_bytes -= len; V_pfkeystat.in_msgtype[msg->sadb_msg_type]--; return key_sendup_mbuf(so, m, target); } /* so can be NULL if target != KEY_SENDUP_ONE */ int key_sendup_mbuf(so, m, target) struct socket *so; struct mbuf *m; int target; { INIT_VNET_NET(curvnet); INIT_VNET_IPSEC(curvnet); struct mbuf *n; struct keycb *kp; int sendup; struct rawcb *rp; int error = 0; if (m == NULL) panic("key_sendup_mbuf: NULL pointer was passed.\n"); if (so == NULL && target == KEY_SENDUP_ONE) panic("%s: NULL pointer was passed.\n", __func__); V_pfkeystat.in_total++; V_pfkeystat.in_bytes += m->m_pkthdr.len; if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (m == NULL) { V_pfkeystat.in_nomem++; return ENOBUFS; } } if (m->m_len >= sizeof(struct sadb_msg)) { struct sadb_msg *msg; msg = mtod(m, struct sadb_msg *); V_pfkeystat.in_msgtype[msg->sadb_msg_type]++; } mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) continue; if (rp->rcb_proto.sp_protocol && rp->rcb_proto.sp_protocol != PF_KEY_V2) { continue; } kp = (struct keycb *)rp; /* * If you are in promiscuous mode, and when you get broadcasted * reply, you'll get two PF_KEY messages. * (based on pf_key@inner.net message on 14 Oct 1998) */ if (((struct keycb *)rp)->kp_promisc) { if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { (void)key_sendup0(rp, n, 1); n = NULL; } } /* the exact target will be processed later */ if (so && sotorawcb(so) == rp) continue; sendup = 0; switch (target) { case KEY_SENDUP_ONE: /* the statement has no effect */ if (so && sotorawcb(so) == rp) sendup++; break; case KEY_SENDUP_ALL: sendup++; break; case KEY_SENDUP_REGISTERED: if (kp->kp_registered) sendup++; break; } V_pfkeystat.in_msgtarget[target]++; if (!sendup) continue; if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { m_freem(m); V_pfkeystat.in_nomem++; mtx_unlock(&rawcb_mtx); return ENOBUFS; } if ((error = key_sendup0(rp, n, 0)) != 0) { m_freem(m); mtx_unlock(&rawcb_mtx); return error; } n = NULL; } if (so) { error = key_sendup0(sotorawcb(so), m, 0); m = NULL; } else { error = 0; m_freem(m); } mtx_unlock(&rawcb_mtx); return error; } /* * key_abort() * derived from net/rtsock.c:rts_abort() */ static void key_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } /* * key_attach() * derived from net/rtsock.c:rts_attach() */ static int key_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_IPSEC(curvnet); struct keycb *kp; int error; KASSERT(so->so_pcb == NULL, ("key_attach: so_pcb != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NET_RAW); if (error) return error; } /* XXX */ kp = malloc(sizeof *kp, M_PCB, M_WAITOK | M_ZERO); if (kp == 0) return ENOBUFS; so->so_pcb = (caddr_t)kp; error = raw_attach(so, proto); kp = (struct keycb *)sotorawcb(so); if (error) { free(kp, M_PCB); so->so_pcb = (caddr_t) 0; return error; } kp->kp_promisc = kp->kp_registered = 0; if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ V_key_cb.key_count++; V_key_cb.any_count++; soisconnected(so); so->so_options |= SO_USELOOPBACK; return 
0; } /* * key_bind() * derived from net/rtsock.c:rts_bind() */ static int key_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return EINVAL; } /* * key_close() * derived from net/rtsock.c:rts_close(). */ static void key_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* * key_connect() * derived from net/rtsock.c:rts_connect() */ static int key_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return EINVAL; } /* * key_detach() * derived from net/rtsock.c:rts_detach() */ static void key_detach(struct socket *so) { INIT_VNET_IPSEC(curvnet); struct keycb *kp = (struct keycb *)sotorawcb(so); KASSERT(kp != NULL, ("key_detach: kp == NULL")); if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ V_key_cb.key_count--; V_key_cb.any_count--; key_freereg(so); raw_usrreqs.pru_detach(so); } /* * key_disconnect() * derived from net/rtsock.c:key_disconnect() */ static int key_disconnect(struct socket *so) { return(raw_usrreqs.pru_disconnect(so)); } /* * key_peeraddr() * derived from net/rtsock.c:rts_peeraddr() */ static int key_peeraddr(struct socket *so, struct sockaddr **nam) { return(raw_usrreqs.pru_peeraddr(so, nam)); } /* * key_send() * derived from net/rtsock.c:rts_send() */ static int key_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return(raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* * key_shutdown() * derived from net/rtsock.c:rts_shutdown() */ static int key_shutdown(struct socket *so) { return(raw_usrreqs.pru_shutdown(so)); } /* * key_sockaddr() * derived from net/rtsock.c:rts_sockaddr() */ static int key_sockaddr(struct socket *so, struct sockaddr **nam) { return(raw_usrreqs.pru_sockaddr(so, nam)); } struct pr_usrreqs key_usrreqs = { .pru_abort = key_abort, .pru_attach = key_attach, .pru_bind = key_bind, .pru_connect = key_connect, .pru_detach = key_detach, .pru_disconnect = key_disconnect, .pru_peeraddr = key_peeraddr, .pru_send = key_send, .pru_shutdown = key_shutdown, .pru_sockaddr = key_sockaddr, .pru_close = key_close, }; /* sysctl */ SYSCTL_NODE(_net, PF_KEY, key, CTLFLAG_RW, 0, "Key Family"); /* * Definitions of protocols supported in the KEY domain. */ extern struct domain keydomain; struct protosw keysw[] = { { .pr_type = SOCK_RAW, .pr_domain = &keydomain, .pr_protocol = PF_KEY_V2, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = key_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &key_usrreqs } }; static void key_init0(void) { INIT_VNET_IPSEC(curvnet); bzero((caddr_t)&V_key_cb, sizeof(V_key_cb)); ipsec_init(); key_init(); } struct domain keydomain = { .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init0, .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[sizeof(keysw)/sizeof(keysw[0])] }; DOMAIN_SET(key); Index: head/sys/netipsec/vipsec.h =================================================================== --- head/sys/netipsec/vipsec.h (revision 185347) +++ head/sys/netipsec/vipsec.h (revision 185348) @@ -1,188 +1,178 @@ /* * Copyright (c) 2007-2008 University of Zagreb * Copyright (c) 2007-2008 FreeBSD Foundation * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETIPSEC_VIPSEC_H_ #define _NETIPSEC_VIPSEC_H_ #ifdef VIMAGE #include #include #include #include #include #include #include #include #include #include #include #include #include struct vnet_ipsec { int _ipsec_debug; struct ipsecstat _ipsec4stat; struct secpolicy _ip4_def_policy; int _ip4_esp_trans_deflev; int _ip4_esp_net_deflev; int _ip4_ah_trans_deflev; int _ip4_ah_net_deflev; int _ip4_ah_offsetmask; int _ip4_ipsec_dfbit; int _ip4_ipsec_ecn; int _ip4_esp_randpad; int _ipsec_replay; int _ipsec_integrity; int _crypto_support; u_int32_t _key_debug_level; u_int _key_spi_trycnt; u_int32_t _key_spi_minval; u_int32_t _key_spi_maxval; u_int32_t _policy_id; u_int _key_int_random; u_int _key_larval_lifetime; int _key_blockacq_count; int _key_blockacq_lifetime; int _key_preferred_oldsa; u_int32_t _acq_seq; - u_int _saorder_state_alive[3]; - u_int _saorder_state_any[4]; int _esp_enable; struct espstat _espstat; int _esp_max_ivlen; int _ipsec_esp_keymin; int _ipsec_esp_auth; int _ipsec_ah_keymin; int _ipip_allow; struct ipipstat _ipipstat; struct ipsecstat _ipsec6stat; int _ip6_esp_trans_deflev; int _ip6_esp_net_deflev; int _ip6_ah_trans_deflev; int _ip6_ah_net_deflev; int _ip6_ipsec_ecn; - int _ip6_esp_randpad; int _ah_enable; int _ah_cleartos; struct ahstat _ahstat; int _ipcomp_enable; struct ipcompstat _ipcompstat; struct pfkeystat _pfkeystat; struct key_cb _key_cb; - struct sockaddr _key_dst; - struct sockaddr _key_src; - LIST_HEAD(, secpolicy) _sptree[IPSEC_DIR_MAX]; LIST_HEAD(, secashead) _sahtree; LIST_HEAD(, secreg) _regtree[SADB_SATYPE_MAX + 1]; LIST_HEAD(, secacq) _acqtree; LIST_HEAD(, secspacq) _spacqtree; }; #endif /* * Symbol translation macros */ #define INIT_VNET_IPSEC(vnet) \ INIT_FROM_VNET(vnet, VNET_MOD_IPSEC, struct vnet_ipsec, vnet_ipsec) #define VNET_IPSEC(sym) VSYM(vnet_ipsec, sym) #define V_acq_seq VNET_IPSEC(acq_seq) #define V_acqtree VNET_IPSEC(acqtree) #define V_ah_cleartos VNET_IPSEC(ah_cleartos) #define V_ah_enable VNET_IPSEC(ah_enable) #define V_ahstat VNET_IPSEC(ahstat) #define V_crypto_support VNET_IPSEC(crypto_support) #define V_esp_enable VNET_IPSEC(esp_enable) #define V_esp_max_ivlen VNET_IPSEC(esp_max_ivlen) #define V_espstat VNET_IPSEC(espstat) #define V_ip4_ah_net_deflev VNET_IPSEC(ip4_ah_net_deflev) #define V_ip4_ah_offsetmask VNET_IPSEC(ip4_ah_offsetmask) #define V_ip4_ah_trans_deflev VNET_IPSEC(ip4_ah_trans_deflev) #define V_ip4_def_policy 
VNET_IPSEC(ip4_def_policy) #define V_ip4_esp_net_deflev VNET_IPSEC(ip4_esp_net_deflev) #define V_ip4_esp_randpad VNET_IPSEC(ip4_esp_randpad) #define V_ip4_esp_trans_deflev VNET_IPSEC(ip4_esp_trans_deflev) #define V_ip4_ipsec_dfbit VNET_IPSEC(ip4_ipsec_dfbit) #define V_ip4_ipsec_ecn VNET_IPSEC(ip4_ipsec_ecn) #define V_ip6_ah_net_deflev VNET_IPSEC(ip6_ah_net_deflev) #define V_ip6_ah_trans_deflev VNET_IPSEC(ip6_ah_trans_deflev) #define V_ip6_esp_net_deflev VNET_IPSEC(ip6_esp_net_deflev) #define V_ip6_esp_randpad VNET_IPSEC(ip6_esp_randpad) #define V_ip6_esp_trans_deflev VNET_IPSEC(ip6_esp_trans_deflev) #define V_ip6_ipsec_ecn VNET_IPSEC(ip6_ipsec_ecn) #define V_ipcomp_enable VNET_IPSEC(ipcomp_enable) #define V_ipcompstat VNET_IPSEC(ipcompstat) #define V_ipip_allow VNET_IPSEC(ipip_allow) #define V_ipipstat VNET_IPSEC(ipipstat) #define V_ipsec4stat VNET_IPSEC(ipsec4stat) #define V_ipsec6stat VNET_IPSEC(ipsec6stat) #define V_ipsec_ah_keymin VNET_IPSEC(ipsec_ah_keymin) #define V_ipsec_debug VNET_IPSEC(ipsec_debug) #define V_ipsec_esp_auth VNET_IPSEC(ipsec_esp_auth) #define V_ipsec_esp_keymin VNET_IPSEC(ipsec_esp_keymin) #define V_ipsec_integrity VNET_IPSEC(ipsec_integrity) #define V_ipsec_replay VNET_IPSEC(ipsec_replay) #define V_key_blockacq_count VNET_IPSEC(key_blockacq_count) #define V_key_blockacq_lifetime VNET_IPSEC(key_blockacq_lifetime) #define V_key_cb VNET_IPSEC(key_cb) #define V_key_debug_level VNET_IPSEC(key_debug_level) -#define V_key_dst VNET_IPSEC(key_dst) #define V_key_int_random VNET_IPSEC(key_int_random) #define V_key_larval_lifetime VNET_IPSEC(key_larval_lifetime) #define V_key_preferred_oldsa VNET_IPSEC(key_preferred_oldsa) #define V_key_spi_maxval VNET_IPSEC(key_spi_maxval) #define V_key_spi_minval VNET_IPSEC(key_spi_minval) #define V_key_spi_trycnt VNET_IPSEC(key_spi_trycnt) -#define V_key_src VNET_IPSEC(key_src) #define V_pfkeystat VNET_IPSEC(pfkeystat) #define V_policy_id VNET_IPSEC(policy_id) #define V_regtree VNET_IPSEC(regtree) #define V_sahtree VNET_IPSEC(sahtree) -#define V_saorder_state_alive VNET_IPSEC(saorder_state_alive) -#define V_saorder_state_any VNET_IPSEC(saorder_state_any) #define V_spacqtree VNET_IPSEC(spacqtree) #define V_sptree VNET_IPSEC(sptree) #endif /* !_NETIPSEC_VIPSEC_H_ */
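The V_ macros removed and kept by this header all follow one pattern: a source line that reads like a global variable access expands, under VIMAGE, into a field access on the current vnet's private state (which is exactly why saorder_state_alive and saorder_state_any, now plain globals, lose their V_ aliases above). The indirection in miniature, with made-up names, as a compilable sketch:

#include <stdio.h>

struct vnet_sketch {
	int _counter;			/* per-instance copy of "counter" */
};

static struct vnet_sketch vnet0;	/* only one instance in this sketch */

#define CURVNET    (&vnet0)
#define VSYM(sym)  (CURVNET->_##sym)
#define V_counter  VSYM(counter)

int
main(void)
{
	V_counter = 42;			/* reads like a plain global ... */
	printf("%d\n", vnet0._counter);	/* ... but lands in the instance */
	return (0);
}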