Index: head/sys/contrib/ipfilter/netinet/ip_auth.c =================================================================== --- head/sys/contrib/ipfilter/netinet/ip_auth.c (revision 105193) +++ head/sys/contrib/ipfilter/netinet/ip_auth.c (revision 105194) @@ -1,630 +1,631 @@ /* * Copyright (C) 1998-2001 by Darren Reed & Guido van Rooij. * * See the IPFILTER.LICENCE file for details on licencing. */ #ifdef __sgi # include #endif #include #include #include #include #include #if !defined(_KERNEL) && !defined(KERNEL) # include # include # include #endif #if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) # include # include #else # include #endif #ifndef linux # include #endif #include #if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) # include #endif #if !defined(__SVR4) && !defined(__svr4__) # ifndef linux # include # endif #else # include # include # ifdef _KERNEL # include # endif # include # include #endif #if (_BSDI_VERSION >= 199802) || (__FreeBSD_version >= 400000) # include #endif #if defined(__NetBSD__) || defined(__OpenBSD__) || defined(bsdi) # include #endif #include #ifdef sun # include #endif #include #include #include #include #ifndef KERNEL # define KERNEL # define NOT_KERNEL #endif #ifndef linux # include #endif #ifdef NOT_KERNEL # undef KERNEL #endif #ifdef __sgi # ifdef IFF_DRVRLOCK /* IRIX6 */ # include # endif #endif #include #if defined(__sgi) && !defined(IFF_DRVRLOCK) /* IRIX < 6 */ extern struct ifqueue ipintrq; /* ip packet input queue */ #else # ifndef linux # if __FreeBSD_version >= 300000 # include # endif # include # include # endif #endif #include #include #include "netinet/ip_compat.h" #include #include "netinet/ip_fil.h" #include "netinet/ip_auth.h" #if !SOLARIS && !defined(linux) # include # ifdef __FreeBSD__ # include # endif #endif #if (__FreeBSD_version >= 300000) # include # if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) # include # include # endif #endif #if !defined(lint) /* static const char rcsid[] = "@(#)$Id: ip_auth.c,v 2.11.2.12 2001/07/18 14:57:08 darrenr Exp $"; */ static const char rcsid[] = "@(#)$FreeBSD$"; #endif #if (SOLARIS || defined(__sgi)) && defined(_KERNEL) extern KRWLOCK_T ipf_auth, ipf_mutex; extern kmutex_t ipf_authmx; # if SOLARIS extern kcondvar_t ipfauthwait; # endif #endif #ifdef linux static struct wait_queue *ipfauthwait = NULL; #endif int fr_authsize = FR_NUMAUTH; int fr_authused = 0; int fr_defaultauthage = 600; int fr_auth_lock = 0; fr_authstat_t fr_authstats; static frauth_t fr_auth[FR_NUMAUTH]; mb_t *fr_authpkts[FR_NUMAUTH]; static int fr_authstart = 0, fr_authend = 0, fr_authnext = 0; static frauthent_t *fae_list = NULL; frentry_t *ipauth = NULL, *fr_authlist = NULL; /* * Check if a packet has authorization. If the packet is found to match an * authorization result and that would result in a feedback loop (i.e. it * will end up returning FR_AUTH) then return FR_BLOCK instead. */ u_32_t fr_checkauth(ip, fin) ip_t *ip; fr_info_t *fin; { u_short id = ip->ip_id; frentry_t *fr; frauth_t *fra; u_32_t pass; int i; if (fr_auth_lock || !fr_authused) return 0; READ_ENTER(&ipf_auth); for (i = fr_authstart; i != fr_authend; ) { /* * index becomes -2 only after an SIOCAUTHW. Check this in * case the same packet gets sent again and it hasn't yet been * auth'd. */ fra = fr_auth + i; if ((fra->fra_index == -2) && (id == fra->fra_info.fin_id) && !bcmp((char *)fin, (char *)&fra->fra_info, FI_CSIZE)) { /* * Avoid feedback loop. 
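The check that immediately follows is the crux of this guard: a matched entry whose saved verdict is empty, or whose verdict would send the packet back through authentication, is downgraded to an outright block. A minimal userland sketch of that rule, using made-up FR_AUTH/FR_BLOCK bit values rather than ipfilter's real definitions:

    #include <stdio.h>

    #define FR_BLOCK 0x1     /* hypothetical bit values, for illustration */
    #define FR_AUTH  0x2

    static unsigned
    resolve_verdict(unsigned saved_pass)
    {
            /*
             * An empty verdict, or one that would re-queue the packet
             * for authentication, is forced to "block".
             */
            if (saved_pass == 0 || (saved_pass & FR_AUTH))
                    return (FR_BLOCK);
            return (saved_pass);
    }

    int
    main(void)
    {
            printf("0x0  -> 0x%x\n", resolve_verdict(0x0));     /* blocked */
            printf("AUTH -> 0x%x\n", resolve_verdict(FR_AUTH)); /* blocked */
            printf("0x10 -> 0x%x\n", resolve_verdict(0x10));    /* kept   */
            return (0);
    }
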
*/ if (!(pass = fra->fra_pass) || (pass & FR_AUTH)) pass = FR_BLOCK; /* * Create a dummy rule for the stateful checking to * use and return. Zero out any values we don't * trust from userland! */ if ((pass & FR_KEEPSTATE) || ((pass & FR_KEEPFRAG) && (fin->fin_fi.fi_fl & FI_FRAG))) { KMALLOC(fr, frentry_t *); if (fr) { bcopy((char *)fra->fra_info.fin_fr, fr, sizeof(*fr)); fr->fr_grp = NULL; fr->fr_ifa = fin->fin_ifp; fr->fr_func = NULL; fr->fr_ref = 1; fr->fr_flags = pass; #if BSD >= 199306 fr->fr_oifa = NULL; #endif } } else fr = fra->fra_info.fin_fr; fin->fin_fr = fr; RWLOCK_EXIT(&ipf_auth); WRITE_ENTER(&ipf_auth); if (fr && fr != fra->fra_info.fin_fr) { fr->fr_next = fr_authlist; fr_authlist = fr; } fr_authstats.fas_hits++; fra->fra_index = -1; fr_authused--; if (i == fr_authstart) { while (fra->fra_index == -1) { i++; fra++; if (i == FR_NUMAUTH) { i = 0; fra = fr_auth; } fr_authstart = i; if (i == fr_authend) break; } if (fr_authstart == fr_authend) { fr_authnext = 0; fr_authstart = fr_authend = 0; } } RWLOCK_EXIT(&ipf_auth); return pass; } i++; if (i == FR_NUMAUTH) i = 0; } fr_authstats.fas_miss++; RWLOCK_EXIT(&ipf_auth); return 0; } /* * Check if we have room in the auth array to hold details for another packet. * If we do, store it and wake up any user programs which are waiting to * hear about these events. */ int fr_newauth(m, fin, ip) mb_t *m; fr_info_t *fin; ip_t *ip; { #if defined(_KERNEL) && SOLARIS qif_t *qif = fin->fin_qif; #endif frauth_t *fra; int i; if (fr_auth_lock) return 0; WRITE_ENTER(&ipf_auth); if (fr_authstart > fr_authend) { fr_authstats.fas_nospace++; RWLOCK_EXIT(&ipf_auth); return 0; } else { if (fr_authused == FR_NUMAUTH) { fr_authstats.fas_nospace++; RWLOCK_EXIT(&ipf_auth); return 0; } } fr_authstats.fas_added++; fr_authused++; i = fr_authend++; if (fr_authend == FR_NUMAUTH) fr_authend = 0; RWLOCK_EXIT(&ipf_auth); fra = fr_auth + i; fra->fra_index = i; fra->fra_pass = 0; fra->fra_age = fr_defaultauthage; bcopy((char *)fin, (char *)&fra->fra_info, sizeof(*fin)); #if SOLARIS && defined(_KERNEL) # if !defined(sparc) /* * No need to copyback here as we want to undo the changes, not keep * them. 
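fr_newauth() above reserves a slot in a fixed circular array, bumping fr_authend with wraparound and failing with fas_nospace when the ring is full. A compilable miniature of just that reservation step; NUMAUTH and the ring variables are stand-ins, and the real code additionally takes the ipf_auth lock and rejects when the start index has passed the end:

    #include <stdio.h>

    #define NUMAUTH 8        /* stand-in for FR_NUMAUTH */

    static int q_end, q_used;        /* the real ring also keeps a start */

    /* Return a reserved slot index, or -1 when the ring is full. */
    static int
    ring_reserve(void)
    {
            int i;

            if (q_used == NUMAUTH)
                    return (-1);     /* the "fas_nospace" case */
            q_used++;
            i = q_end++;
            if (q_end == NUMAUTH)
                    q_end = 0;       /* wrap around */
            return (i);
    }

    int
    main(void)
    {
            int n;

            for (n = 0; n < NUMAUTH + 1; n++)
                    printf("slot %d\n", ring_reserve());  /* last is -1 */
            return (0);
    }
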
*/ if ((ip == (ip_t *)m->b_rptr) && (ip->ip_v == 4)) { register u_short bo; bo = ip->ip_len; ip->ip_len = htons(bo); # if !SOLARIS && !defined(__NetBSD__) && !defined(__FreeBSD__) /* 4.4BSD converts this ip_input.c, but I don't in solaris.c */ bo = ip->ip_id; ip->ip_id = htons(bo); # endif bo = ip->ip_off; ip->ip_off = htons(bo); } # endif m->b_rptr -= qif->qf_off; fr_authpkts[i] = *(mblk_t **)fin->fin_mp; fra->fra_q = qif->qf_q; cv_signal(&ipfauthwait); #else # if defined(BSD) && !defined(sparc) && (BSD >= 199306) if (!fin->fin_out) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); } # endif fr_authpkts[i] = m; WAKEUP(&fr_authnext); #endif return 1; } int fr_auth_ioctl(data, mode, cmd, fr, frptr) caddr_t data; int mode; #if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) u_long cmd; #else int cmd; #endif frentry_t *fr, **frptr; { mb_t *m; #if defined(_KERNEL) && !SOLARIS int s; #endif frauth_t auth, *au = &auth, *fra; frauthent_t *fae, **faep; int i, error = 0; switch (cmd) { case SIOCSTLCK : error = fr_lock(data, &fr_auth_lock); break; case SIOCINIFR : case SIOCRMIFR : case SIOCADIFR : error = EINVAL; break; case SIOCINAFR : error = EINVAL; break; case SIOCRMAFR : case SIOCADAFR : for (faep = &fae_list; (fae = *faep); ) if (&fae->fae_fr == fr) break; else faep = &fae->fae_next; if (cmd == SIOCRMAFR) { if (!fr || !frptr) error = EINVAL; else if (!fae) error = ESRCH; else { WRITE_ENTER(&ipf_auth); SPL_NET(s); *faep = fae->fae_next; *frptr = fr->fr_next; SPL_X(s); RWLOCK_EXIT(&ipf_auth); KFREE(fae); } } else if (fr && frptr) { KMALLOC(fae, frauthent_t *); if (fae != NULL) { bcopy((char *)fr, (char *)&fae->fae_fr, sizeof(*fr)); WRITE_ENTER(&ipf_auth); SPL_NET(s); fae->fae_age = fr_defaultauthage; fae->fae_fr.fr_hits = 0; fae->fae_fr.fr_next = *frptr; *frptr = &fae->fae_fr; fae->fae_next = *faep; *faep = fae; ipauth = &fae_list->fae_fr; SPL_X(s); RWLOCK_EXIT(&ipf_auth); } else error = ENOMEM; } else error = EINVAL; break; case SIOCATHST: fr_authstats.fas_faelist = fae_list; error = IWCOPYPTR((char *)&fr_authstats, data, sizeof(fr_authstats)); break; case SIOCAUTHW: if (!(mode & FWRITE)) { error = EPERM; break; } fr_authioctlloop: READ_ENTER(&ipf_auth); if ((fr_authnext != fr_authend) && fr_authpkts[fr_authnext]) { error = IWCOPYPTR((char *)&fr_auth[fr_authnext], data, sizeof(frauth_t)); RWLOCK_EXIT(&ipf_auth); if (error) break; WRITE_ENTER(&ipf_auth); SPL_NET(s); fr_authnext++; if (fr_authnext == FR_NUMAUTH) fr_authnext = 0; SPL_X(s); RWLOCK_EXIT(&ipf_auth); return 0; } RWLOCK_EXIT(&ipf_auth); #ifdef _KERNEL # if SOLARIS mutex_enter(&ipf_authmx); if (!cv_wait_sig(&ipfauthwait, &ipf_authmx)) { mutex_exit(&ipf_authmx); return EINTR; } mutex_exit(&ipf_authmx); # else error = SLEEP(&fr_authnext, "fr_authnext"); # endif #endif if (!error) goto fr_authioctlloop; break; case SIOCAUTHR: if (!(mode & FWRITE)) { error = EPERM; break; } error = IRCOPYPTR(data, (caddr_t)&auth, sizeof(auth)); if (error) return error; WRITE_ENTER(&ipf_auth); SPL_NET(s); i = au->fra_index; fra = fr_auth + i; if ((i < 0) || (i > FR_NUMAUTH) || (fra->fra_info.fin_id != au->fra_info.fin_id)) { SPL_X(s); RWLOCK_EXIT(&ipf_auth); return EINVAL; } m = fr_authpkts[i]; fra->fra_index = -2; fra->fra_pass = au->fra_pass; fr_authpkts[i] = NULL; RWLOCK_EXIT(&ipf_auth); #ifdef _KERNEL if (m && au->fra_info.fin_out) { # if SOLARIS error = (fr_qout(fra->fra_q, m) == 0) ? 
EINVAL : 0; # else /* SOLARIS */ struct route ro; bzero((char *)&ro, sizeof(ro)); # if ((_BSDI_VERSION >= 199802) && (_BSDI_VERSION < 200005)) || \ - defined(__OpenBSD__) || (defined(IRIX) && (IRIX >= 605)) + defined(__OpenBSD__) || (defined(IRIX) && (IRIX >= 605)) || \ + (__FreeBSD_version >= 500042) error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); # else error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL); # endif if (ro.ro_rt) { RTFREE(ro.ro_rt); } # endif /* SOLARIS */ if (error) fr_authstats.fas_sendfail++; else fr_authstats.fas_sendok++; } else if (m) { # if SOLARIS error = (fr_qin(fra->fra_q, m) == 0) ? EINVAL : 0; # else /* SOLARIS */ if (! IF_HANDOFF(&ipintrq, m, NULL)) error = ENOBUFS; else schednetisr(NETISR_IP); # endif /* SOLARIS */ if (error) fr_authstats.fas_quefail++; else fr_authstats.fas_queok++; } else error = EINVAL; # if SOLARIS if (error) error = EINVAL; # else /* * If we experience an error which will result in the packet * not being processed, make sure we advance to the next one. */ if (error == ENOBUFS) { fr_authused--; fra->fra_index = -1; fra->fra_pass = 0; if (i == fr_authstart) { while (fra->fra_index == -1) { i++; if (i == FR_NUMAUTH) i = 0; fr_authstart = i; if (i == fr_authend) break; } if (fr_authstart == fr_authend) { fr_authnext = 0; fr_authstart = fr_authend = 0; } } } # endif #endif /* _KERNEL */ SPL_X(s); break; default : error = EINVAL; break; } return error; } /* * Free all network buffer memory used to keep saved packets. */ void fr_authunload() { register int i; register frauthent_t *fae, **faep; frentry_t *fr, **frp; mb_t *m; WRITE_ENTER(&ipf_auth); for (i = 0; i < FR_NUMAUTH; i++) { if ((m = fr_authpkts[i])) { FREE_MB_T(m); fr_authpkts[i] = NULL; fr_auth[i].fra_index = -1; } } for (faep = &fae_list; (fae = *faep); ) { *faep = fae->fae_next; KFREE(fae); } ipauth = NULL; RWLOCK_EXIT(&ipf_auth); if (fr_authlist) { /* * We *MuST* reget ipf_auth because otherwise we won't get the * locks in the right order and risk deadlock. * We need ipf_mutex here to prevent a rule from using it * inside fr_check(). */ WRITE_ENTER(&ipf_mutex); WRITE_ENTER(&ipf_auth); for (frp = &fr_authlist; (fr = *frp); ) { if (fr->fr_ref == 1) { *frp = fr->fr_next; KFREE(fr); } else frp = &fr->fr_next; } RWLOCK_EXIT(&ipf_auth); RWLOCK_EXIT(&ipf_mutex); } } /* * Slowly expire held auth records. Timeouts are set * in expectation of this being called twice per second. 
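The routine below decrements each held entry's fra_age on every pass; at the stated two calls per second, the default age of 600 holds an unanswered packet for roughly 300 seconds. A small sketch of that countdown, with illustrative names:

    #include <stdio.h>

    #define DEFAULT_AGE 600          /* stand-in for fr_defaultauthage */

    struct held {
            int age;                 /* stand-in for fra_age */
            int live;
    };

    int
    main(void)
    {
            struct held h = { DEFAULT_AGE, 1 };
            int passes = 0;

            while (h.live) {
                    /* One expiry pass: decrement, free when it hits 0. */
                    if (--h.age == 0)
                            h.live = 0;      /* FREE_MB_T() would go here */
                    passes++;
            }
            /* Two passes per second: 600 passes is about 300 seconds. */
            printf("expired after %d passes (~%d s)\n", passes, passes / 2);
            return (0);
    }
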
*/ void fr_authexpire() { register int i; register frauth_t *fra; register frauthent_t *fae, **faep; register frentry_t *fr, **frp; mb_t *m; #if !SOLARIS && defined(_KERNEL) int s; #endif if (fr_auth_lock) return; SPL_NET(s); WRITE_ENTER(&ipf_auth); for (i = 0, fra = fr_auth; i < FR_NUMAUTH; i++, fra++) { if ((!--fra->fra_age) && (m = fr_authpkts[i])) { FREE_MB_T(m); fr_authpkts[i] = NULL; fr_auth[i].fra_index = -1; fr_authstats.fas_expire++; fr_authused--; } } for (faep = &fae_list; (fae = *faep); ) { if (!--fae->fae_age) { *faep = fae->fae_next; KFREE(fae); fr_authstats.fas_expire++; } else faep = &fae->fae_next; } if (fae_list != NULL) ipauth = &fae_list->fae_fr; else ipauth = NULL; for (frp = &fr_authlist; (fr = *frp); ) { if (fr->fr_ref == 1) { *frp = fr->fr_next; KFREE(fr); } else frp = &fr->fr_next; } RWLOCK_EXIT(&ipf_auth); SPL_X(s); } Index: head/sys/dev/hfa/fore_receive.c =================================================================== --- head/sys/dev/hfa/fore_receive.c (revision 105193) +++ head/sys/dev/hfa/fore_receive.c (revision 105194) @@ -1,596 +1,596 @@ /* * * =================================== * HARP | Host ATM Research Platform * =================================== * * * This Host ATM Research Platform ("HARP") file (the "Software") is * made available by Network Computing Services, Inc. ("NetworkCS") * "AS IS". NetworkCS does not provide maintenance, improvements or * support of any kind. * * NETWORKCS MAKES NO WARRANTIES OR REPRESENTATIONS, EXPRESS OR IMPLIED, * INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE, AS TO ANY ELEMENT OF THE * SOFTWARE OR ANY SUPPORT PROVIDED IN CONNECTION WITH THIS SOFTWARE. * In no event shall NetworkCS be responsible for any damages, including * but not limited to consequential damages, arising from or relating to * any use of the Software or related support. * * Copyright 1994-1998 Network Computing Services, Inc. * * Copies of this Software may be made, however, the above copyright * notice must be reproduced on all copies. 
* * @(#) $FreeBSD$ * */ /* * FORE Systems 200-Series Adapter Support * --------------------------------------- * * Receive queue management * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef lint __RCSID("@(#) $FreeBSD$"); #endif /* * Local functions */ static void fore_recv_stack(void *, KBuffer *); /* * Allocate Receive Queue Data Structures * * Arguments: * fup pointer to device unit structure * * Returns: * 0 allocations successful * else allocation failed */ int fore_recv_allocate(fup) Fore_unit *fup; { caddr_t memp; /* * Allocate non-cacheable memory for receive status words */ memp = atm_dev_alloc(sizeof(Q_status) * RECV_QUELEN, QSTAT_ALIGN, ATM_DEV_NONCACHE); if (memp == NULL) { return (1); } fup->fu_recv_stat = (Q_status *) memp; memp = (caddr_t)vtophys(fup->fu_recv_stat); if (memp == NULL) { return (1); } fup->fu_recv_statd = (Q_status *) memp; /* * Allocate memory for receive descriptors */ memp = atm_dev_alloc(sizeof(Recv_descr) * RECV_QUELEN, RECV_DESCR_ALIGN, 0); if (memp == NULL) { return (1); } fup->fu_recv_desc = (Recv_descr *) memp; memp = (caddr_t)vtophys(fup->fu_recv_desc); if (memp == NULL) { return (1); } fup->fu_recv_descd = (Recv_descr *) memp; return (0); } /* * Receive Queue Initialization * * Allocate and initialize the host-resident receive queue structures * and then initialize the CP-resident queue structures. * * Called at interrupt level. * * Arguments: * fup pointer to device unit structure * * Returns: * none */ void fore_recv_initialize(fup) Fore_unit *fup; { Aali *aap = fup->fu_aali; Recv_queue *cqp; H_recv_queue *hrp; Recv_descr *rdp; Recv_descr *rdp_dma; Q_status *qsp; Q_status *qsp_dma; int i; /* * Point to CP-resident receive queue */ cqp = (Recv_queue *)(fup->fu_ram + CP_READ(aap->aali_recv_q)); /* * Point to host-resident receive queue structures */ hrp = fup->fu_recv_q; qsp = fup->fu_recv_stat; qsp_dma = fup->fu_recv_statd; rdp = fup->fu_recv_desc; rdp_dma = fup->fu_recv_descd; /* * Loop thru all queue entries and do whatever needs doing */ for (i = 0; i < RECV_QUELEN; i++) { /* * Set queue status word to free */ *qsp = QSTAT_FREE; /* * Set up host queue entry and link into ring */ hrp->hrq_cpelem = cqp; hrp->hrq_status = qsp; hrp->hrq_descr = rdp; hrp->hrq_descr_dma = rdp_dma; if (i == (RECV_QUELEN - 1)) hrp->hrq_next = fup->fu_recv_q; else hrp->hrq_next = hrp + 1; /* * Now let the CP into the game */ cqp->cq_descr = (CP_dma) CP_WRITE(rdp_dma); cqp->cq_status = (CP_dma) CP_WRITE(qsp_dma); /* * Bump all queue pointers */ hrp++; qsp++; qsp_dma++; rdp++; rdp_dma++; cqp++; } /* * Initialize queue pointers */ fup->fu_recv_head = fup->fu_recv_q; return; } /* * Drain Receive Queue * * This function will process all completed entries at the head of the * receive queue. The received segments will be linked into a received * PDU buffer chain and it will then be passed up the PDU's VCC stack for * processing by the next higher protocol layer. * * May be called in interrupt state. * Must be called with interrupts locked out. 
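fore_recv_initialize() above threads the RECV_QUELEN host entries into a ring by pointing the last entry's hrq_next back at the first. The same pattern in a self-contained userland form; QUELEN and struct entry are stand-ins for the driver's types:

    #include <stdio.h>

    #define QUELEN 4                 /* stand-in for RECV_QUELEN */

    struct entry {
            struct entry *next;
            int id;
    };

    int
    main(void)
    {
            struct entry q[QUELEN], *e;
            int i;

            for (i = 0; i < QUELEN; i++) {
                    q[i].id = i;
                    /* Last entry closes the ring, as hrq_next does. */
                    q[i].next = (i == QUELEN - 1) ? &q[0] : &q[i + 1];
            }

            /* Walk the ring once past the wrap point: 0 1 2 3 0 1 */
            for (i = 0, e = &q[0]; i < QUELEN + 2; i++, e = e->next)
                    printf("%d ", e->id);
            printf("\n");
            return (0);
    }
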
* * Arguments: * fup pointer to device unit structure * * Returns: * none */ void fore_recv_drain(fup) Fore_unit *fup; { H_recv_queue *hrp = NULL; Recv_descr *rdp; Recv_seg_descr *rsp; Buf_handle *bhp; Fore_vcc *fvp; struct vccb *vcp; KBuffer *m, *mhead, *mtail; caddr_t cp; u_long hdr, nsegs; u_int seglen, type0; int i, pdulen, retries = 0, error; /* Silence the compiler */ mtail = NULL; type0 = 0; /* * Process each completed entry */ retry: while (*fup->fu_recv_head->hrq_status & QSTAT_COMPLETED) { /* * Get completed entry's receive descriptor */ hrp = fup->fu_recv_head; rdp = hrp->hrq_descr; #ifdef VAC /* * Cache flush receive descriptor */ if (vac) { vac_flush((addr_t)rdp, sizeof(Recv_descr)); } #endif hdr = rdp->rd_cell_hdr; nsegs = rdp->rd_nsegs; pdulen = 0; error = 0; mhead = NULL; /* * Locate incoming VCC for this PDU */ fvp = (Fore_vcc *) atm_dev_vcc_find((Cmn_unit *)fup, ATM_HDR_GET_VPI(hdr), ATM_HDR_GET_VCI(hdr), VCC_IN); /* * Check for a receive error * * Apparently the receive descriptor itself contains valid * information, but the received pdu data is probably bogus. * We'll arrange for the receive buffer segments to be tossed. */ if (*hrp->hrq_status & QSTAT_ERROR) { fup->fu_pif.pif_ierrors++; if (fvp) { vcp = fvp->fv_connvc->cvc_vcc; vcp->vc_ierrors++; if (vcp->vc_nif) vcp->vc_nif->nif_if.if_ierrors++; } ATM_DEBUG1("fore receive error: hdr=0x%lx\n", hdr); error = 1; } /* * Build PDU buffer chain from receive segments */ for (i = 0, rsp = rdp->rd_seg; i < nsegs; i++, rsp++) { bhp = rsp->rsd_handle; seglen = rsp->rsd_len; /* * Remove buffer from our supplied queue and get * to the underlying buffer */ switch (bhp->bh_type) { case BHT_S1_SMALL: DEQUEUE(bhp, Buf_handle, bh_qelem, fup->fu_buf1s_bq); fup->fu_buf1s_cnt--; m = (KBuffer *) ((caddr_t)bhp - BUF1_SM_HOFF); KB_DATASTART(m, cp, caddr_t); break; case BHT_S1_LARGE: DEQUEUE(bhp, Buf_handle, bh_qelem, fup->fu_buf1l_bq); fup->fu_buf1l_cnt--; m = (KBuffer *) ((caddr_t)bhp - BUF1_LG_HOFF); KB_DATASTART(m, cp, caddr_t); break; default: log(LOG_ERR, "fore_recv_drain: bhp=%p type=0x%x\n", bhp, bhp->bh_type); panic("fore_recv_drain: bad buffer type"); } /* * Toss any zero-length or receive error buffers */ if ((seglen == 0) || error) { KB_FREEALL(m); continue; } /* * Link buffer into chain */ if (mhead == NULL) { type0 = bhp->bh_type; KB_LINKHEAD(m, mhead); mhead = m; } else { KB_LINK(m, mtail); } KB_LEN(m) = seglen; pdulen += seglen; mtail = m; /* * Flush received buffer data */ #ifdef VAC if (vac) { addr_t dp; KB_DATASTART(m, dp, addr_t); vac_pageflush(dp); } #endif } /* * Make sure we've got a non-null PDU */ if (mhead == NULL) { goto free_ent; } /* * We only support user data PDUs (for now) */ if (hdr & ATM_HDR_SET_PT(ATM_PT_NONUSER)) { KB_FREEALL(mhead); goto free_ent; } /* * Toss the data if there's no VCC */ if (fvp == NULL) { fup->fu_stats->st_drv.drv_rv_novcc++; KB_FREEALL(mhead); goto free_ent; } #ifdef DIAGNOSTIC if (atm_dev_print) atm_dev_pdu_print((Cmn_unit *)fup, (Cmn_vcc *)fvp, mhead, "fore_recv"); #endif /* * Make sure we have our queueing headroom at the front * of the buffer chain */ if (type0 != BHT_S1_SMALL) { /* * Small buffers already have headroom built-in, but * if CP had to use a large buffer for the first * buffer, then we have to allocate a buffer here to * contain the headroom. 
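fore_recv_drain() assembles the PDU by appending each segment at the chain's tail and, when the first buffer lacks headroom, prepending a fresh empty buffer at the head. A plain-C model of that append/prepend bookkeeping, with ordinary pointers standing in for the KB_* macros:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf {
            struct buf *next;
            int len;
    };

    static struct buf *
    mkbuf(int len)
    {
            struct buf *b = malloc(sizeof(*b));

            if (b == NULL)
                    abort();
            b->next = NULL;
            b->len = len;
            return (b);
    }

    int
    main(void)
    {
            struct buf *head = NULL, *tail = NULL, *b;
            int seglens[3] = { 512, 512, 88 }, pdulen = 0, i;

            for (i = 0; i < 3; i++) {        /* KB_LINK-style tail append */
                    b = mkbuf(seglens[i]);
                    if (head == NULL)
                            head = b;
                    else
                            tail->next = b;
                    tail = b;
                    pdulen += b->len;
            }

            b = mkbuf(0);                    /* KB_LINKHEAD-style prepend   */
            b->next = head;                  /* empty buffer gives headroom */
            head = b;

            printf("pdulen = %d\n", pdulen); /* 1112; header adds no length */

            while (head != NULL) {           /* KB_FREEALL analogue */
                    b = head->next;
                    free(head);
                    head = b;
            }
            return (0);
    }
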
*/ fup->fu_stats->st_drv.drv_rv_nosbf++; KB_ALLOCPKT(m, BUF1_SM_SIZE, KB_F_NOWAIT, KB_T_DATA); if (m == NULL) { fup->fu_stats->st_drv.drv_rv_nomb++; KB_FREEALL(mhead); goto free_ent; } /* * Put new buffer at head of PDU chain */ KB_LINKHEAD(m, mhead); KB_LEN(m) = 0; KB_HEADSET(m, BUF1_SM_DOFF); mhead = m; } /* * It looks like we've got a valid PDU - count it quick!! */ mhead->m_pkthdr.rcvif = NULL; mhead->m_pkthdr.csum_flags = 0; - mhead->m_pkthdr.aux = NULL; + SLIST_INIT(&mhead->m_pkthdr.tags); KB_PLENSET(mhead, pdulen); fup->fu_pif.pif_ipdus++; fup->fu_pif.pif_ibytes += pdulen; vcp = fvp->fv_connvc->cvc_vcc; vcp->vc_ipdus++; vcp->vc_ibytes += pdulen; if (vcp->vc_nif) { vcp->vc_nif->nif_ibytes += pdulen; vcp->vc_nif->nif_if.if_ipackets++; #if (defined(BSD) && (BSD >= 199103)) vcp->vc_nif->nif_if.if_ibytes += pdulen; #endif } /* * The STACK_CALL needs to happen at splnet() in order * for the stack sequence processing to work. Schedule an * interrupt queue callback at splnet() since we are * currently at device level. */ /* * Prepend callback function pointer and token value to buffer. * We have already guaranteed that the space is available * in the first buffer. */ KB_HEADADJ(mhead, sizeof(atm_intr_func_t) + sizeof(int)); KB_DATASTART(mhead, cp, caddr_t); *((atm_intr_func_t *)cp) = fore_recv_stack; cp += sizeof(atm_intr_func_t); *((void **)cp) = (void *)fvp; /* * Schedule callback */ if (IF_HANDOFF(&atm_intrq, mhead, NULL)) { schednetisr(NETISR_ATM); } else { fup->fu_stats->st_drv.drv_rv_ifull++; goto free_ent; } free_ent: /* * Mark this entry free for use and bump head pointer * to the next entry in the queue */ *hrp->hrq_status = QSTAT_FREE; hrp->hrq_cpelem->cq_descr = (CP_dma) CP_WRITE((u_long)hrp->hrq_descr_dma); fup->fu_recv_head = hrp->hrq_next; } /* * Nearly all of the interrupts generated by the CP will be due * to PDU reception. However, we may receive an interrupt before * the CP has completed the status word DMA to host memory. Thus, * if we haven't processed any PDUs during this interrupt, we will * wait a bit for completed work on the receive queue, rather than * having to field an extra interrupt very soon. */ if (hrp == NULL) { if (++retries <= FORE_RECV_RETRY) { DELAY(FORE_RECV_DELAY); goto retry; } } return; } /* * Pass Incoming PDU up Stack * * This function is called via the core ATM interrupt queue callback * set in fore_recv_drain(). It will pass the supplied incoming * PDU up the incoming VCC's stack. * * Called at splnet. * * Arguments: * tok token to identify stack instantiation * m pointer to incoming PDU buffer chain * * Returns: * none */ static void fore_recv_stack(tok, m) void *tok; KBuffer *m; { Fore_vcc *fvp = (Fore_vcc *)tok; int err; /* * Send the data up the stack */ STACK_CALL(CPCS_UNITDATA_SIG, fvp->fv_upper, fvp->fv_toku, fvp->fv_connvc, (int)m, 0, err); if (err) KB_FREEALL(m); return; } /* * Free Receive Queue Data Structures * * Arguments: * fup pointer to device unit structure * * Returns: * none */ void fore_recv_free(fup) Fore_unit *fup; { /* * We'll just let fore_buf_free() take care of freeing any * buffers sitting on the receive queue (which are also still * on the fu_*_bq queue). 
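The one-line change above, replacing the m_pkthdr.aux assignment with SLIST_INIT(&mhead->m_pkthdr.tags), is the visible edge of this commit's move to packet tags. A minimal userland demonstration of the same sys/queue.h pattern; the struct names are invented, not the real mbuf definitions:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct tag {                     /* invented, not the kernel m_tag */
            SLIST_ENTRY(tag) link;
            int type;
    };

    struct pkthdr {                  /* invented, not the real pkthdr */
            SLIST_HEAD(, tag) tags;
    };

    int
    main(void)
    {
            struct pkthdr h;
            struct tag *t;

            SLIST_INIT(&h.tags);             /* what the new code does */

            t = malloc(sizeof(*t));
            if (t == NULL)
                    abort();
            t->type = 42;
            SLIST_INSERT_HEAD(&h.tags, t, link);

            SLIST_FOREACH(t, &h.tags, link)  /* consumers walk the list */
                    printf("tag type %d\n", t->type);

            while (!SLIST_EMPTY(&h.tags)) {  /* delete-chain analogue */
                    t = SLIST_FIRST(&h.tags);
                    SLIST_REMOVE_HEAD(&h.tags, link);
                    free(t);
            }
            return (0);
    }

An empty list costs only the initialized head pointer per packet, which is why the driver can unconditionally initialize it here.
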
*/ if (fup->fu_flags & CUF_INITED) { } /* * Free the status words */ if (fup->fu_recv_stat) { atm_dev_free((volatile void *)fup->fu_recv_stat); fup->fu_recv_stat = NULL; fup->fu_recv_statd = NULL; } /* * Free the receive descriptors */ if (fup->fu_recv_desc) { atm_dev_free(fup->fu_recv_desc); fup->fu_recv_desc = NULL; fup->fu_recv_descd = NULL; } return; } Index: head/sys/kern/subr_mbuf.c =================================================================== --- head/sys/kern/subr_mbuf.c (revision 105193) +++ head/sys/kern/subr_mbuf.c (revision 105194) @@ -1,1550 +1,1543 @@ /*- * Copyright (c) 2001, 2002 * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /****************************************************************************** * mb_alloc mbuf and cluster allocator. * * Maximum number of PCPU containers. If you know what you're doing you could * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your * system during compilation, and thus prevent kernel structure bloat. * * SMP and non-SMP kernels clearly have a different number of possible CPUs, * but because we cannot assume a dense array of CPUs, we always allocate * and traverse PCPU containers up to NCPU amount and merely check for * CPU availability. */ #ifdef MBALLOC_NCPU #define NCPU MBALLOC_NCPU #else #define NCPU MAXCPU #endif /*- * The mbuf allocator is heavily based on Alfred Perlstein's * (alfred@FreeBSD.org) "memcache" allocator which is itself based * on concepts from several per-CPU memory allocators. The difference * between this allocator and memcache is that, among other things: * * (i) We don't free back to the map from the free() routine - we leave the * option of implementing lazy freeing (from a kproc) in the future. * * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the * maximum number of allocatable objects of a given type. 
Further, * we handle blocking on a cv in the case that the map is starved and * we have to rely solely on cached (circulating) objects. * * The mbuf allocator keeps all objects that it allocates in mb_buckets. * The buckets keep a page worth of objects (an object can be an mbuf or an * mbuf cluster) and facilitate moving larger sets of contiguous objects * from the per-CPU lists to the main list for the given object. The buckets * also have an added advantage in that after several moves from a per-CPU * list to the main list and back to the per-CPU list, contiguous objects * are kept together, thus trying to put the TLB cache to good use. * * The buckets are kept on singly-linked lists called "containers." A container * is protected by a mutex lock in order to ensure consistency. The mutex lock * itself is allocated separately and attached to the container at boot time, * thus allowing for certain containers to share the same mutex lock. Per-CPU * containers for mbufs and mbuf clusters all share the same per-CPU * lock whereas the "general system" containers (i.e., the "main lists") for * these objects share one global lock. */ struct mb_bucket { SLIST_ENTRY(mb_bucket) mb_blist; int mb_owner; int mb_numfree; void *mb_free[0]; }; struct mb_container { SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; struct mtx *mc_lock; int mc_numowner; u_int mc_starved; long *mc_types; u_long *mc_objcount; u_long *mc_numpgs; }; struct mb_gen_list { struct mb_container mb_cont; struct cv mgl_mstarved; }; struct mb_pcpu_list { struct mb_container mb_cont; }; /* * Boot-time configurable object counts that will determine the maximum * number of permitted objects in the mbuf and mcluster cases. In the * ext counter (nmbcnt) case, it's just an indicator serving to scale * kmem_map size properly - in other words, we may be allowed to allocate * more than nmbcnt counters, whereas we will never be allowed to allocate * more than nmbufs mbufs or nmbclusters mclusters. * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be * allocatable by the sfbuf allocator (found in uipc_syscalls.c) */ #ifndef NMBCLUSTERS #define NMBCLUSTERS (1024 + maxusers * 64) #endif #ifndef NMBUFS #define NMBUFS (nmbclusters * 2) #endif #ifndef NSFBUFS #define NSFBUFS (512 + maxusers * 16) #endif #ifndef NMBCNTS #define NMBCNTS (nmbclusters + nsfbufs) #endif int nmbufs; int nmbclusters; int nmbcnt; int nsfbufs; /* * Perform sanity checks of tunables declared above. */ static void tunable_mbinit(void *dummy) { /* * This has to be done before VM init. */ nmbclusters = NMBCLUSTERS; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); nmbufs = NMBUFS; TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); nsfbufs = NSFBUFS; TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); nmbcnt = NMBCNTS; TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); /* Sanity checks */ if (nmbufs < nmbclusters * 2) nmbufs = nmbclusters * 2; if (nmbcnt < nmbclusters + nsfbufs) nmbcnt = nmbclusters + nsfbufs; } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); /* * The freelist structures and mutex locks. The number statically declared * here depends on the number of CPUs. * * We set up in such a way that all the objects (mbufs, clusters) * share the same mutex lock. It has been established that we do not benefit * from different locks for different objects, so we use the same lock, * regardless of object type. This also allows us to do optimised * multi-object allocations without dropping the lock in between. 
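As described above, a bucket is a page's worth of object pointers plus a free count, so the common allocation is an array index and a decrement. A condensed, runnable model; locking and the container SLIST are omitted, and OBJS_PER_BUCKET stands in for PAGE_SIZE / objsize:

    #include <stdio.h>

    #define OBJS_PER_BUCKET 16       /* PAGE_SIZE / objsize in the real code */

    struct bucket {
            int numfree;
            void *obj[OBJS_PER_BUCKET];      /* models mb_free[] */
    };

    /* Pop one object: an index and a decrement, as in MB_GET_OBJECT. */
    static void *
    bucket_get(struct bucket *b)
    {
            if (b->numfree == 0)
                    return (NULL);   /* empty: real code unlinks the bucket */
            return (b->obj[--b->numfree]);
    }

    int
    main(void)
    {
            static char page[OBJS_PER_BUCKET][64];   /* a fake "page" */
            struct bucket b;
            int i, n = 0;

            b.numfree = OBJS_PER_BUCKET;
            for (i = 0; i < OBJS_PER_BUCKET; i++)
                    b.obj[i] = page[i];

            while (bucket_get(&b) != NULL)
                    n++;
            printf("popped %d objects\n", n);        /* 16 */
            return (0);
    }
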
*/ struct mb_lstmngr { struct mb_gen_list *ml_genlist; struct mb_pcpu_list *ml_cntlst[NCPU]; struct mb_bucket **ml_btable; vm_map_t ml_map; vm_offset_t ml_mapbase; vm_offset_t ml_maptop; int ml_mapfull; u_int ml_objsize; u_int *ml_wmhigh; }; static struct mb_lstmngr mb_list_mbuf, mb_list_clust; static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; u_int *cl_refcntmap; /* * Local macros for internal allocator structure manipulations. */ #ifdef SMP #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] #else #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] #endif #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) #define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ (mb_lst)->ml_cntlst[(num)] #define MB_BUCKET_INDX(mb_obj, mb_lst) \ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE) #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ { \ struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ \ (mb_bckt)->mb_numfree--; \ (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ (*((mb_lst)->mb_cont.mc_objcount))--; \ if ((mb_bckt)->mb_numfree == 0) { \ SLIST_REMOVE_HEAD(_mchd, mb_blist); \ SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ } \ } #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ (mb_bckt)->mb_numfree++; \ (*((mb_lst)->mb_cont.mc_objcount))++; #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ if ((mb_type) != MT_NOTMBUF) \ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ if ((mb_type) != MT_NOTMBUF) \ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) /* * Ownership of buckets/containers is represented by integers. The PCPU * lists range from 0 to NCPU-1. We need a free numerical id for the general * list (we use NCPU). We also need a non-conflicting free bit to indicate * that the bucket is free and removed from a container, while not losing * the bucket's originating container id. We use the highest bit * for the free marker. */ #define MB_GENLIST_OWNER (NCPU) #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) /* Statistics structures for allocator (per-CPU and general). */ static struct mbpstat mb_statpcpu[NCPU + 1]; struct mbstat mbstat; /* Sleep time for wait code (in ticks). */ static int mbuf_wait = 64; static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */ static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */ /* * Objects exported by sysctl(8). 
*/ SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "Maximum number of mbuf clusters available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, "Maximum number of mbufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0, "Number used to scale kmem_map to ensure sufficient space for counters"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, "Sleep time of mbuf subsystem wait allocations during exhaustion"); SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0, "Upper limit of number of mbufs allowed on each PCPU list"); SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0, "Upper limit of number of mbuf clusters allowed on each PCPU list"); SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, "Mbuf general information and statistics"); SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); /* * Prototypes of local allocator routines. */ static void *mb_alloc_wait(struct mb_lstmngr *, short); static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, struct mb_pcpu_list *); static void mb_reclaim(void); static void mbuf_init(void *); /* * Initial allocation numbers. Each parameter represents the number of buckets * of each object that will be placed initially in each PCPU container for * said object. */ #define NMB_MBUF_INIT 4 #define NMB_CLUST_INIT 16 /* * Internal flags that allow for cache locks to remain "persistent" across * allocation and free calls. They may be used in combination. */ #define MBP_PERSIST 0x1 /* Return with lock still held. */ #define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */ /* * Initialize the mbuf subsystem. * * We sub-divide the kmem_map into several submaps; this way, we don't have * to worry about artificially limiting the number of mbuf or mbuf cluster * allocations, due to fear of one type of allocation "stealing" address * space initially reserved for another. * * Set up both the general containers and all the PCPU containers. Populate * the PCPU containers with initial numbers. */ MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) static void mbuf_init(void *dummy) { struct mb_pcpu_list *pcpu_cnt; vm_size_t mb_map_size; int i, j; /* * Set up all the submaps, for each type of object that we deal * with in this allocator. We also allocate space for the cluster * ref. counts in the mbuf map (and not the cluster map) in order to * give clusters a nice contiguous address space without any holes. 
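The submap sizing computed just below adds room for nmbufs mbufs plus one u_int reference count per cluster, then truncates to a page boundary. A worked example with made-up tunable values:

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define MSIZE     256            /* illustrative mbuf size */

    int
    main(void)
    {
            unsigned long nmbufs = 1024, nmbclusters = 512, sz;

            /* nmbufs mbufs plus one u_int refcount per cluster ... */
            sz = nmbufs * MSIZE + nmbclusters * sizeof(unsigned int);
            /* ... truncated to a page boundary, as rounddown() does. */
            sz -= sz % PAGE_SIZE;

            printf("map size: %lu bytes (%lu pages)\n", sz, sz / PAGE_SIZE);
            return (0);
    }
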
*/ mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * sizeof(u_int)); mb_map_size = rounddown(mb_map_size, PAGE_SIZE); mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); if (mb_list_mbuf.ml_btable == NULL) goto bad; mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), &(mb_list_mbuf.ml_maptop), mb_map_size); mb_list_mbuf.ml_map->system_map = 1; mb_list_mbuf.ml_mapfull = 0; mb_list_mbuf.ml_objsize = MSIZE; mb_list_mbuf.ml_wmhigh = &mbuf_limit; mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); mb_map_size = rounddown(mb_map_size, PAGE_SIZE); mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); if (mb_list_clust.ml_btable == NULL) goto bad; mb_list_clust.ml_map = kmem_suballoc(kmem_map, &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), mb_map_size); mb_list_clust.ml_map->system_map = 1; mb_list_clust.ml_mapfull = 0; mb_list_clust.ml_objsize = MCLBYTES; mb_list_clust.ml_wmhigh = &clust_limit; /* * Allocate required general (global) containers for each object type. */ mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, M_NOWAIT); mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, M_NOWAIT); if ((mb_list_mbuf.ml_genlist == NULL) || (mb_list_clust.ml_genlist == NULL)) goto bad; /* * Initialize condition variables and general container mutex locks. */ mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0); cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), "mcluster pool starved"); mb_list_mbuf.ml_genlist->mb_cont.mc_lock = mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; /* * Set up the general containers for each object. */ mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; mb_list_mbuf.ml_genlist->mb_cont.mc_starved = mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); mb_list_clust.ml_genlist->mb_cont.mc_objcount = &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs); mb_list_clust.ml_genlist->mb_cont.mc_numpgs = &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs); mb_list_mbuf.ml_genlist->mb_cont.mc_types = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); /* * Allocate all the required counters for clusters. This makes * cluster allocations/deallocations much faster. */ cl_refcntmap = (u_int *)kmem_malloc(mb_list_clust.ml_map, roundup(nmbclusters * sizeof(u_int), MSIZE), M_NOWAIT); if (cl_refcntmap == NULL) goto bad; /* * Initialize general mbuf statistics. */ mbstat.m_msize = MSIZE; mbstat.m_mclbytes = MCLBYTES; mbstat.m_minclsize = MINCLSIZE; mbstat.m_mlen = MLEN; mbstat.m_mhlen = MHLEN; mbstat.m_numtypes = MT_NTYPES; /* * Allocate and initialize PCPU containers. 
*/ for (i = 0; i < NCPU; i++) { if (CPU_ABSENT(i)) continue; mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), M_MBUF, M_NOWAIT); mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), M_MBUF, M_NOWAIT); if ((mb_list_mbuf.ml_cntlst[i] == NULL) || (mb_list_clust.ml_cntlst[i] == NULL)) goto bad; mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; mb_statpcpu[i].mb_active = 1; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = &(mb_statpcpu[i].mb_mbfree); mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = &(mb_statpcpu[i].mb_clfree); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs = &(mb_statpcpu[i].mb_mbpgs); mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs = &(mb_statpcpu[i].mb_clpgs); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = &(mb_statpcpu[i].mb_mbtypes[0]); mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); /* * Perform initial allocations. */ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); MB_LOCK_CONT(pcpu_cnt); for (j = 0; j < NMB_MBUF_INIT; j++) { if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) == NULL) goto bad; } MB_UNLOCK_CONT(pcpu_cnt); pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); MB_LOCK_CONT(pcpu_cnt); for (j = 0; j < NMB_CLUST_INIT; j++) { if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) == NULL) goto bad; } MB_UNLOCK_CONT(pcpu_cnt); } return; bad: panic("mbuf_init(): failed to initialize mbuf subsystem!"); } /* * Populate a given mbuf PCPU container with a bucket full of fresh new * buffers. Return a pointer to the new bucket (already in the container if * successful), or return NULL on failure. * * LOCKING NOTES: * PCPU container lock must be held when this is called. * The lock is dropped here so that we can cleanly call the underlying VM * code. If we fail, we return with no locks held. If we succeed (i.e., return * non-NULL), we return with the PCPU lock held, ready for allocation from * the returned bucket. */ static struct mb_bucket * mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) { struct mb_bucket *bucket; caddr_t p; int i; MB_UNLOCK_CONT(cnt_lst); /* * If our object's (finite) map is starved now (i.e., no more address * space), bail out now. */ if (mb_list->ml_mapfull) return (NULL); bucket = malloc(sizeof(struct mb_bucket) + PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF, how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); if (bucket == NULL) return (NULL); p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE, how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); if (p == NULL) { free(bucket, M_MBUF); if (how == M_TRYWAIT) mb_list->ml_mapfull = 1; return (NULL); } bucket->mb_numfree = 0; mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) { bucket->mb_free[i] = p; bucket->mb_numfree++; p += mb_list->ml_objsize; } MB_LOCK_CONT(cnt_lst); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); (*(cnt_lst->mb_cont.mc_numpgs))++; *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; return (bucket); } /* * Allocate an mbuf-subsystem type object. 
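mb_pop_cont() above fills a new bucket by carving one freshly mapped page into equal objects. The carving loop in isolation, with illustrative sizes and a static array standing in for kmem_malloc():

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define OBJSIZE   256            /* e.g. MSIZE */

    int
    main(void)
    {
            static char page[PAGE_SIZE];     /* stands in for kmem_malloc() */
            char *freelist[PAGE_SIZE / OBJSIZE], *p = page;
            int i, numfree = 0;

            for (i = 0; i < PAGE_SIZE / OBJSIZE; i++) {
                    freelist[i] = p;         /* bucket->mb_free[i] = p */
                    numfree++;
                    p += OBJSIZE;
            }
            printf("bucket holds %d objects, last at offset %ld\n",
                numfree, (long)(freelist[numfree - 1] - page));
            return (0);
    }
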
* The general case is very easy. Complications only arise if our PCPU * container is empty. Things get worse if the PCPU container is empty, * the general container is empty, and we've run out of address space * in our map; then we try to block if we're willing to (M_TRYWAIT). */ static __inline void * mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist, int *pers_list) { static int last_report; struct mb_pcpu_list *cnt_lst; struct mb_bucket *bucket; void *m; m = NULL; if ((persist & MBP_PERSISTENT) != 0) { /* * If we're a "persistent" call, then the per-CPU #(pers_list) * cache lock is already held, and we just need to refer to * the correct cache descriptor. */ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); } else { cnt_lst = MB_GET_PCPU_LIST(mb_list); MB_LOCK_CONT(cnt_lst); } if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { /* * This is the easy allocation case. We just grab an object * from a bucket in the PCPU container. At worst, we * have just emptied the bucket and so we remove it * from the container. */ MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = cnt_lst->mb_cont.mc_numowner; } else { struct mb_gen_list *gen_list; /* * This is the less-common more difficult case. We must * first verify if the general list has anything for us * and if that also fails, we must allocate a page from * the map and create a new bucket to place in our PCPU * container (already locked). If the map is starved then * we're really in for trouble, as we have to wait on * the general container's condition variable. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { /* * Give ownership of the bucket to our CPU's * container, but only actually put the bucket * in the container if it doesn't become free * upon removing an mbuf from it. */ SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), mb_blist); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; (*(gen_list->mb_cont.mc_numpgs))--; (*(cnt_lst->mb_cont.mc_numpgs))++; *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; bucket->mb_numfree--; m = bucket->mb_free[(bucket->mb_numfree)]; if (bucket->mb_numfree == 0) { SLIST_NEXT(bucket, mb_blist) = NULL; bucket->mb_owner |= MB_BUCKET_FREE; } else { SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; } MB_UNLOCK_CONT(gen_list); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = cnt_lst->mb_cont.mc_numowner; } else { /* * We'll have to allocate a new page. */ MB_UNLOCK_CONT(gen_list); bucket = mb_pop_cont(mb_list, how, cnt_lst); if (bucket != NULL) { MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list=cnt_lst->mb_cont.mc_numowner; } else { if (how == M_TRYWAIT) { /* * Absolute worst-case scenario. * We block if we're willing to, but * only after trying to steal from * other lists. */ m = mb_alloc_wait(mb_list, type); } else { /* XXX: No consistency. 
*/ mbstat.m_drops++; if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf( "All mbufs or mbuf clusters exhausted, please see tuning(7).\n"); } } if (m != NULL && (persist & MBP_PERSIST) != 0) { cnt_lst = MB_GET_PCPU_LIST(mb_list); MB_LOCK_CONT(cnt_lst); *pers_list=cnt_lst->mb_cont.mc_numowner; } } } } return (m); } /* * This is the worst-case scenario called only if we're allocating with * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf * by looking in every PCPU container. If we're still unsuccesful, we * try the general container one last time and possibly block on our * starved cv. */ static void * mb_alloc_wait(struct mb_lstmngr *mb_list, short type) { struct mb_pcpu_list *cnt_lst; struct mb_gen_list *gen_list; struct mb_bucket *bucket; void *m; int i, cv_ret; /* * Try to reclaim mbuf-related objects (mbufs, clusters). */ mb_reclaim(); /* * Cycle all the PCPU containers. Increment starved counts if found * empty. */ for (i = 0; i < NCPU; i++) { if (CPU_ABSENT(i)) continue; cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); MB_LOCK_CONT(cnt_lst); /* * If container is non-empty, get a single object from it. * If empty, increment starved count. */ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); MB_UNLOCK_CONT(cnt_lst); mbstat.m_wait++; /* XXX: No consistency. */ return (m); } else cnt_lst->mb_cont.mc_starved++; MB_UNLOCK_CONT(cnt_lst); } /* * We're still here, so that means it's time to get the general * container lock, check it one more time (now that mb_reclaim() * has been called) and if we still get nothing, block on the cv. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { MB_GET_OBJECT(m, bucket, gen_list); MB_MBTYPES_INC(gen_list, type, 1); MB_UNLOCK_CONT(gen_list); mbstat.m_wait++; /* XXX: No consistency. */ return (m); } gen_list->mb_cont.mc_starved++; cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), gen_list->mb_cont.mc_lock, mbuf_wait); gen_list->mb_cont.mc_starved--; if ((cv_ret == 0) && ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { MB_GET_OBJECT(m, bucket, gen_list); MB_MBTYPES_INC(gen_list, type, 1); mbstat.m_wait++; /* XXX: No consistency. */ } else { mbstat.m_drops++; /* XXX: No consistency. */ m = NULL; } MB_UNLOCK_CONT(gen_list); return (m); } /*- * Free an object to its rightful container. * In the very general case, this operation is really very easy. * Complications arise primarily if: * (a) We've hit the high limit on number of free objects allowed in * our PCPU container. * (b) We're in a critical situation where our container has been * marked 'starved' and we need to issue wakeups on the starved * condition variable. * (c) Minor (odd) cases: our bucket has migrated while we were * waiting for the lock; our bucket is in the general container; * our bucket is empty. */ static __inline void mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist, int *pers_list) { struct mb_pcpu_list *cnt_lst; struct mb_gen_list *gen_list; struct mb_bucket *bucket; u_int owner; bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; /* * Make sure that if after we lock the bucket's present container the * bucket has migrated, that we drop the lock and get the new one. 
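The retry_lock sequence just below must cope with a bucket that migrates between containers while the lock is being acquired: read the owner, lock that owner's list, re-check, and retry on mismatch. The same pattern in miniature, with pthread mutexes standing in for the kernel's mtx locks (build with -lpthread; the demo is single-threaded, so the re-check always succeeds here):

    #include <pthread.h>
    #include <stdio.h>

    struct bucket {
            int owner;               /* index into locks[], may change */
    };

    static pthread_mutex_t locks[2] = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    static pthread_mutex_t *
    lock_bucket(struct bucket *b)
    {
            int owner;

            for (;;) {
                    owner = b->owner;                /* unlocked read */
                    pthread_mutex_lock(&locks[owner]);
                    if (owner == b->owner)           /* still the owner? */
                            return (&locks[owner]);
                    /* Bucket migrated while we waited: retry. */
                    pthread_mutex_unlock(&locks[owner]);
            }
    }

    int
    main(void)
    {
            struct bucket b = { 1 };
            pthread_mutex_t *l = lock_bucket(&b);

            printf("locked list %d\n", b.owner);
            pthread_mutex_unlock(l);
            return (0);
    }
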
*/ retry_lock: owner = bucket->mb_owner & ~MB_BUCKET_FREE; switch (owner) { case MB_GENLIST_OWNER: gen_list = MB_GET_GEN_LIST(mb_list); if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { if (*pers_list != MB_GENLIST_OWNER) { cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); MB_UNLOCK_CONT(cnt_lst); MB_LOCK_CONT(gen_list); } } else { MB_LOCK_CONT(gen_list); } if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { MB_UNLOCK_CONT(gen_list); *pers_list = -1; goto retry_lock; } /* * If we're intended for the general container, this is * real easy: no migrating required. The only `bogon' * is that we're now contending with all the threads * dealing with the general list, but this is expected. */ MB_PUT_OBJECT(m, bucket, gen_list); MB_MBTYPES_DEC(gen_list, type, 1); if (gen_list->mb_cont.mc_starved > 0) cv_signal(&(gen_list->mgl_mstarved)); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(gen_list); else *pers_list = MB_GENLIST_OWNER; break; default: cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { if (*pers_list == MB_GENLIST_OWNER) { gen_list = MB_GET_GEN_LIST(mb_list); MB_UNLOCK_CONT(gen_list); MB_LOCK_CONT(cnt_lst); } else { cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); owner = *pers_list; } } else { MB_LOCK_CONT(cnt_lst); } if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { MB_UNLOCK_CONT(cnt_lst); *pers_list = -1; goto retry_lock; } MB_PUT_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_DEC(cnt_lst, type, 1); if (cnt_lst->mb_cont.mc_starved > 0) { /* * This is a tough case. It means that we've * been flagged at least once to indicate that * we're empty, and that the system is in a critical * situation, so we ought to migrate at least one * bucket over to the general container. * There may or may not be a thread blocking on * the starved condition variable, but chances * are that one will eventually come up soon so * it's better to migrate now than never. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0, ("mb_free: corrupt bucket %p\n", bucket)); SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = MB_GENLIST_OWNER; (*(cnt_lst->mb_cont.mc_objcount))--; (*(gen_list->mb_cont.mc_objcount))++; (*(cnt_lst->mb_cont.mc_numpgs))--; (*(gen_list->mb_cont.mc_numpgs))++; /* * Determine whether or not to keep transferring * buckets to the general list or whether we've * transferred enough already. * We realize that although we may flag another * bucket to be migrated to the general container * that in the meantime, the thread that was * blocked on the cv is already woken up and * long gone. But in that case, the worst * consequence is that we will end up migrating * one bucket too many, which is really not a big * deal, especially if we're close to a critical * situation. */ if (gen_list->mb_cont.mc_starved > 0) { cnt_lst->mb_cont.mc_starved--; cv_signal(&(gen_list->mgl_mstarved)); } else cnt_lst->mb_cont.mc_starved = 0; MB_UNLOCK_CONT(gen_list); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) { /* * We've hit the high limit of allowed numbers of mbufs * on this PCPU list. We must now migrate a bucket * over to the general container. 
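When the watermark migration described above fires, a whole bucket changes hands and the per-CPU and general object/page counters move with it. The bookkeeping reduced to its arithmetic, with illustrative numbers in place of the mc_objcount/mc_numpgs pointers:

    #include <stdio.h>

    int
    main(void)
    {
            long pcpu_objs = 140, gen_objs = 10;     /* mc_objcount pair */
            long pcpu_pgs = 9, gen_pgs = 1;          /* mc_numpgs pair   */
            long wmhigh = 128, bucket_free = 16;     /* illustrative     */

            if (pcpu_objs > wmhigh) {        /* over the per-CPU watermark */
                    pcpu_objs -= bucket_free;        /* whole bucket     */
                    gen_objs += bucket_free;         /* changes hands    */
                    pcpu_pgs--;
                    gen_pgs++;
            }
            printf("pcpu %ld objs/%ld pgs, gen %ld objs/%ld pgs\n",
                pcpu_objs, pcpu_pgs, gen_objs, gen_pgs);
            return (0);
    }
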
*/ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), mb_blist); } SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = MB_GENLIST_OWNER; *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; (*(cnt_lst->mb_cont.mc_numpgs))--; (*(gen_list->mb_cont.mc_numpgs))++; /* * While we're at it, transfer some of the mbtypes * "count load" onto the general list's mbtypes * array, seeing as how we're moving the bucket * there now, meaning that the freeing of objects * there will now decrement the _general list's_ * mbtypes counters, and no longer our PCPU list's * mbtypes counters. We do this for the type presently * being freed in an effort to keep the mbtypes * counters approximately balanced across all lists. */ MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE / mb_list->ml_objsize) - bucket->mb_numfree); MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE / mb_list->ml_objsize) - bucket->mb_numfree); MB_UNLOCK_CONT(gen_list); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } if (bucket->mb_owner & MB_BUCKET_FREE) { SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; } if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } } /* * Drain protocols in hopes to free up some resources. * * LOCKING NOTES: * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * violation if we're holding any mutex if that mutex is acquired in reverse * order relative to one of the locks in the drain routines. */ static void mb_reclaim(void) { struct domain *dp; struct protosw *pr; /* * XXX: Argh, we almost always trip here with witness turned on now-a-days * XXX: because we often come in with Giant held. For now, there's no way * XXX: to avoid this. */ #ifdef WITNESS KASSERT(witness_list(curthread) == 0, ("mb_reclaim() called with locks held")); #endif mbstat.m_drain++; /* XXX: No consistency. */ for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /****************************************************************************** * Internal setup macros. */ #define _mb_setup(m, type) do { \ (m)->m_type = (type); \ (m)->m_next = NULL; \ (m)->m_nextpkt = NULL; \ (m)->m_data = (m)->m_dat; \ (m)->m_flags = 0; \ } while (0) #define _mbhdr_setup(m, type) do { \ (m)->m_type = (type); \ (m)->m_next = NULL; \ (m)->m_nextpkt = NULL; \ (m)->m_data = (m)->m_pktdat; \ (m)->m_flags = M_PKTHDR; \ (m)->m_pkthdr.rcvif = NULL; \ (m)->m_pkthdr.csum_flags = 0; \ - (m)->m_pkthdr.aux = NULL; \ + SLIST_INIT(&(m)->m_pkthdr.tags); \ } while (0) #define _mcl_setup(m) do { \ (m)->m_data = (m)->m_ext.ext_buf; \ (m)->m_flags |= M_EXT; \ (m)->m_ext.ext_free = NULL; \ (m)->m_ext.ext_args = NULL; \ (m)->m_ext.ext_size = MCLBYTES; \ (m)->m_ext.ext_type = EXT_CLUSTER; \ } while (0) #define _mext_init_ref(m, ref) do { \ (m)->m_ext.ref_cnt = ((ref) == NULL) ? 
\ malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \ if ((m)->m_ext.ref_cnt != NULL) { \ *((m)->m_ext.ref_cnt) = 0; \ MEXT_ADD_REF((m)); \ } \ } while (0) #define cl2ref(cl) \ (((uintptr_t)(cl) - (uintptr_t)cl_refcntmap) >> MCLSHIFT) #define _mext_dealloc_ref(m) \ free((m)->m_ext.ref_cnt, M_MBUF) /****************************************************************************** * Internal routines. * * Because mb_alloc() and mb_free() are inlines (to keep the common * cases down to a maximum of one function call), below are a few * routines used only internally for the sole purpose of making certain * functions smaller. * * - _mext_free(): frees associated storage when the ref. count is * exactly one and we're freeing. * * - _mgetm_internal(): common "persistent-lock" routine that allocates * an mbuf and a cluster in one shot, but where the lock is already * held coming in (which is what makes it different from the exported * m_getcl()). The lock is dropped when done. This is used by m_getm() * and, therefore, is very m_getm()-specific. */ static struct mbuf *_mgetm_internal(int, short, short, int); void _mext_free(struct mbuf *mb) { if (mb->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, 0, NULL); } else { (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); _mext_dealloc_ref(mb); } } static struct mbuf * _mgetm_internal(int how, short type, short persist, int cchnum) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum); if (mb == NULL) return NULL; _mb_setup(mb, type); if ((persist & MBP_PERSIST) != 0) { mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); mb = NULL; } _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } return (mb); } /****************************************************************************** * Exported buffer allocation and de-allocation routines. */ /* * Allocate and return a single (normal) mbuf. NULL is returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_get(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) _mb_setup(mb, type); return (mb); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return the top of the provided (existing) * chain. NULL is returned on failure, in which case the [optional] * provided chain is left untouched, and any memory already allocated * is freed. * * Arguments: * - m: existing chain to which to append new chain (optional). * - len: total length of data to append, either in mbufs or clusters * (we allocate whatever combination yields the best fit). * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. 
*/ struct mbuf * m_getm(struct mbuf *m, int len, int how, short type) { struct mbuf *mb, *top, *cur, *mtail; int num, rem, cchnum; short persist; int i; KASSERT(len >= 0, ("m_getm(): len is < 0")); /* If m != NULL, we will append to the end of that chain. */ if (m != NULL) for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); else mtail = NULL; /* * In the best-case scenario (which should be the common case * unless we're in a starvation situation), we will be able to * go through the allocation of all the desired mbufs and clusters * here without dropping our per-CPU cache lock in between. */ num = len / MCLBYTES; rem = len % MCLBYTES; persist = 0; cchnum = -1; top = cur = NULL; for (i = 0; i < num; i++) { mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST | persist, &cchnum); if (mb == NULL) goto failed; _mb_setup(mb, type); persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0; mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); goto failed; } _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); persist = MBP_PERSISTENT; if (cur == NULL) top = cur = mb; else cur->m_next = mb; } if (rem > 0) { if (cchnum >= 0) { persist = MBP_PERSISTENT; persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0; mb = _mgetm_internal(how, type, persist, cchnum); if (mb == NULL) goto failed; } else if (rem > MINCLSIZE) { mb = m_getcl(how, type, 0); } else { mb = m_get(how, type); } if (mb != NULL) { if (cur == NULL) top = mb; else cur->m_next = mb; } else goto failed; } if (mtail != NULL) mtail->m_next = top; else mtail = top; return mtail; failed: if (top != NULL) m_freem(top); return NULL; } /* * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_gethdr(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mbhdr_setup(mb, type); #ifdef MAC if (mac_init_mbuf(mb, how) != 0) { m_free(mb); return NULL; } #endif } return (mb); } /* * Allocate and return a single (normal) pre-zero'd mbuf. NULL is * returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_get_clrd(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mb_setup(mb, type); bzero(mtod(mb, caddr_t), MLEN); } return (mb); } /* * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is * returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_gethdr_clrd(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mbhdr_setup(mb, type); #ifdef MAC if (mac_init_mbuf(mb, how) != 0) { m_free(mb); return NULL; } #endif bzero(mtod(mb, caddr_t), MHLEN); } return (mb); } /* * Free a single mbuf and any associated storage that it may have attached * to it. 
The associated storage may not be immediately freed if its * reference count is above 1. Returns the next mbuf in the chain following * the mbuf being freed. * * Arguments: * - mb: the mbuf to free. */ struct mbuf * m_free(struct mbuf *mb) { struct mbuf *nb; int cchnum; short persist = 0; - /* XXX: This check is bogus... please fix (see KAME). */ - if ((mb->m_flags & M_PKTHDR) != 0 && mb->m_pkthdr.aux) { - m_freem(mb->m_pkthdr.aux); - mb->m_pkthdr.aux = NULL; - } + if ((mb->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(mb, NULL); #ifdef MAC if ((mb->m_flags & M_PKTHDR) && (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED)) mac_destroy_mbuf(mb); #endif nb = mb->m_next; if ((mb->m_flags & M_EXT) != 0) { MEXT_REM_REF(mb); if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) { if (mb->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, MBP_PERSIST, &cchnum); persist = MBP_PERSISTENT; } else { (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); _mext_dealloc_ref(mb); persist = 0; } } } mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum); return (nb); } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. Right now, we only optimize a little so that the cache * lock may be held across a single mbuf+cluster free. Hopefully, * we'll eventually be holding the lock across more than merely two * consecutive frees but right now this is hard to implement because of * things like _mext_dealloc_ref (may do a free()) and atomic ops in the - * loop, as well as the fact that we may recurse on m_freem() in - * m_pkthdr.aux != NULL cases. + * loop. * * - mb: the mbuf chain to free. */ void m_freem(struct mbuf *mb) { struct mbuf *m; int cchnum; short persist; while (mb != NULL) { - /* XXX: This check is bogus... please fix (see KAME). */ - if ((mb->m_flags & M_PKTHDR) != 0 && mb->m_pkthdr.aux) { - m_freem(mb->m_pkthdr.aux); - mb->m_pkthdr.aux = NULL; - } + if ((mb->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(mb, NULL); #ifdef MAC if ((mb->m_flags & M_PKTHDR) && (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED)) mac_destroy_mbuf(mb); #endif persist = 0; m = mb; mb = mb->m_next; if ((m->m_flags & M_EXT) != 0) { MEXT_REM_REF(m); if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) { if (m->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)m->m_ext.ext_buf, MT_NOTMBUF, MBP_PERSIST, &cchnum); persist = MBP_PERSISTENT; } else { (*(m->m_ext.ext_free))(m->m_ext.ext_buf, m->m_ext.ext_args); _mext_dealloc_ref(m); persist = 0; } } } mb_free(&mb_list_mbuf, m, m->m_type, persist, &cchnum); } } /* * Fetch an mbuf with a cluster attached to it. If one of the * allocations fails, the entire allocation fails. This routine is * the preferred way of fetching both the mbuf and cluster together, * as it avoids having to unlock/relock between allocations. Returns * NULL on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. * - flags: any flags to pass to the mbuf being allocated; if this includes * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf. 
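 *
 * A sketch of the common case (hypothetical caller; the lengths are
 * left for the caller to initialize):
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = 0;
 *	m->m_pkthdr.len = 0;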
*/ struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *mb; int cchnum; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST, &cchnum); if (mb == NULL) return NULL; mb->m_type = type; mb->m_next = NULL; mb->m_flags = flags; if ((flags & M_PKTHDR) != 0) { mb->m_nextpkt = NULL; mb->m_pkthdr.rcvif = NULL; mb->m_pkthdr.csum_flags = 0; - mb->m_pkthdr.aux = NULL; + SLIST_INIT(&mb->m_pkthdr.tags); } mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); mb = NULL; } else { _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } #ifdef MAC /* Guard against a NULL mb here: the cluster allocation may have failed. */ if (mb != NULL && (flags & M_PKTHDR) && (mac_init_mbuf(mb, how) != 0)) { m_free(mb); return NULL; } #endif return (mb); } /* * Fetch a single mbuf cluster and attach it to an existing mbuf. If * successful, configures the provided mbuf to have mbuf->m_ext.ext_buf * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags. * The M_EXT bit is not set on failure. * * Arguments: * - mb: the existing mbuf to which to attach the allocated cluster. * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. */ void m_clget(struct mbuf *mb, int how) { mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, 0, NULL); if (mb->m_ext.ext_buf != NULL) { _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } } /* * Configure a provided mbuf to refer to the provided external storage * buffer and set up a reference count for said buffer. If the setting * up of the reference count fails, the M_EXT bit will not be set. If * successful, the M_EXT bit is set in the mbuf's flags. * * Arguments: * - mb: the existing mbuf to which to attach the provided buffer. * - buf: the address of the provided external storage buffer. * - size: the size of the provided buffer. * - freef: a pointer to a routine that is responsible for freeing the * provided external storage buffer. * - args: a pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * - flags: any other flags to be passed to the provided mbuf. * - type: the type that the external storage buffer should be labeled with. */ void m_extadd(struct mbuf *mb, caddr_t buf, u_int size, void (*freef)(void *, void *), void *args, int flags, int type) { _mext_init_ref(mb, ((type != EXT_CLUSTER) ? NULL : &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)])); if (mb->m_ext.ref_cnt != NULL) { mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_args = args; mb->m_ext.ext_type = type; } } /* * Change type of provided mbuf. This is a relatively expensive operation * (due to the cost of statistics manipulations) and should be avoided, where * possible. * * Arguments: * - mb: the provided mbuf for which the type needs to be changed. * - new_type: the new type to change the mbuf to.
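 *
 * Example (sketch, assuming the standard MT_HEADER type constant from
 * sys/mbuf.h), reclassifying a data mbuf that now fronts a packet:
 *
 *	if (mb->m_type != MT_HEADER)
 *		m_chtype(mb, MT_HEADER);
 *
 * Guarding the call as above avoids the statistics round-trip when the
 * type is already correct.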
*/ void m_chtype(struct mbuf *mb, short new_type) { struct mb_gen_list *gen_list; gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); MB_LOCK_CONT(gen_list); MB_MBTYPES_DEC(gen_list, mb->m_type, 1); MB_MBTYPES_INC(gen_list, new_type, 1); MB_UNLOCK_CONT(gen_list); mb->m_type = new_type; } Index: head/sys/kern/uipc_mbuf.c =================================================================== --- head/sys/kern/uipc_mbuf.c (revision 105193) +++ head/sys/kern/uipc_mbuf.c (revision 105194) @@ -1,739 +1,739 @@ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; /* * sysctl(8) exported objects */ SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, &max_linkhdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, &max_protohdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, &max_datalen, 0, ""); /* * Copy mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. * The packet tag chain moves to "to"; "from" is left with an empty * tag list.
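 *
 * Since this revision replaces the aux chain with packet tags, here is
 * a sketch of how per-packet state is attached and recovered elsewhere
 * (MY_COOKIE and struct my_state are hypothetical; the tag's data area
 * starts right after the m_tag header, hence the "mtag + 1" pointer
 * arithmetic):
 *
 *	struct m_tag *mtag;
 *
 *	mtag = m_tag_alloc(MY_COOKIE, 0, sizeof(struct my_state),
 *	    M_NOWAIT);
 *	if (mtag == NULL)
 *		return (ENOMEM);
 *	bcopy(&state, mtag + 1, sizeof(struct my_state));
 *	m_tag_prepend(m, mtag);
 *
 * and later, on the consuming side:
 *
 *	mtag = m_tag_locate(m, MY_COOKIE, 0, NULL);
 *	if (mtag != NULL)
 *		bcopy(mtag + 1, &state, sizeof(struct my_state));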
*/ void m_copy_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 KASSERT(to->m_flags & M_PKTHDR, ("m_copy_pkthdr() called on non-header")); #endif #ifdef MAC if (to->m_flags & M_PKTHDR) mac_destroy_mbuf(to); #endif to->m_data = to->m_pktdat; to->m_flags = from->m_flags & M_COPYFLAGS; to->m_pkthdr = from->m_pkthdr; #ifdef MAC mac_init_mbuf(to, 1); /* XXXMAC no way to fail */ mac_create_mbuf_from_mbuf(from, to); #endif - from->m_pkthdr.aux = NULL; + SLIST_INIT(&from->m_pkthdr.tags); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; MGET(mn, how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) { M_COPY_PKTHDR(mn, m); #ifdef MAC mac_destroy_mbuf(m); #endif m->m_flags &= ~M_PKTHDR; } mn->m_next = m; m = mn; if (len < MHLEN) MH_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = 0; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } MGET(n, wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { M_COPY_PKTHDR(n, m); if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } if (top == NULL) mbstat.m_mcfail++; /* XXX: No consistency. */ return (top); nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. 
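 *
 * Usage sketch (hypothetical caller): because cluster data is shared
 * by reference, check M_WRITABLE() before modifying the copy:
 *
 *	struct mbuf *n;
 *
 *	n = m_copypacket(m, M_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);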
*/ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MGET(n, how, m->m_type); top = n; if (n == NULL) goto nospace; M_COPY_PKTHDR(n, m); n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { MGET(o, how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; /* Sanity check */ if (m == NULL) return (NULL); KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__)); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ MGET(n, how, m->m_type); if (n == NULL) goto nospace; if (top == NULL) { /* first one, must be PKTHDR */ M_COPY_PKTHDR(n, m); nsize = MHLEN; } else /* not the first one */ nsize = MLEN; if (remain >= MINCLSIZE) { MCLGET(n, how); if ((n->m_flags & M_EXT) == 0) { (void)m_free(n); goto nospace; } nsize = MCLBYTES; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. 
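 *
 * For example (sketch): a caller holding a packet header chain can
 * re-derive the header length afterwards with m_fixhdr(), defined
 * later in this file:
 *
 *	m_cat(m, n);
 *	if (m->m_flags & M_PKTHDR)
 *		(void)m_fixhdr(m);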
*/ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (m->m_flags & M_EXT || m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } m = mp; if (mp->m_flags & M_PKTHDR) m->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; break; } count -= m->m_len; } while (m->m_next) (m = m->m_next) ->m_len = 0; } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod and dtom * will work for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; MGET(m, M_DONTWAIT, n->m_type); if (m == NULL) goto bad; m->m_len = 0; if (n->m_flags & M_PKTHDR) { M_COPY_PKTHDR(m, n); n->m_flags &= ~M_PKTHDR; } } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); mbstat.m_mpfail++; /* XXX: No consistency. */ return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. 
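 *
 * Usage sketch (hypothetical 128-byte split point): on success, m0
 * retains the first 128 bytes and the returned chain holds the rest:
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m0, 128, M_DONTWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);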
*/ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR) { MGETHDR(n, wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ MH_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else MH_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { MGET(n, wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_flags |= M_EXT; n->m_ext = m->m_ext; MEXT_ADD_REF(m); n->m_data = m->m_data + len; } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = 0, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; len = MHLEN; while (totlen > 0) { if (top) { MGET(m, M_DONTWAIT, MT_DATA); if (m == NULL) { m_freem(top); return (NULL); } len = MLEN; } if (totlen + off >= MINCLSIZE) { MCLGET(m, M_DONTWAIT); if (m->m_flags & M_EXT) len = MCLBYTES; } else { /* * Place initial small packet/header at end of mbuf. */ if (top == NULL && totlen + off + max_linkhdr <= len) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. 
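 *
 * For instance, patching 8 bytes at offset 20 of a chain (sketch;
 * "patch" is a hypothetical local buffer that has been filled in):
 *
 *	u_char patch[8];
 *
 *	m_copyback(m, 20, sizeof(patch), (caddr_t)patch);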
*/ void m_copyback(struct mbuf *m0, int off, int len, caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get_clrd(M_DONTWAIT, m->m_type); if (n == NULL) goto out; n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_DONTWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } void m_print(const struct mbuf *m) { int len; const struct mbuf *m2; len = m->m_pkthdr.len; m2 = m; while (len) { printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-"); len -= m2->m_len; m2 = m2->m_next; } return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } Index: head/sys/kern/uipc_mbuf2.c =================================================================== --- head/sys/kern/uipc_mbuf2.c (revision 105193) +++ head/sys/kern/uipc_mbuf2.c (revision 105194) @@ -1,404 +1,456 @@ /* $FreeBSD$ */ /* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */ /* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */ /* * Copyright (C) 1999 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 */ /*#define PULLDOWN_DEBUG*/ #include #include +#include #include #include #include #include +MALLOC_DEFINE(M_PACKET_TAGS, "tag", "packet-attached information"); + /* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */ static struct mbuf *m_dup1(struct mbuf *, int, int, int); /* * ensure that [off, off + len) is contiguous on the mbuf chain "m". * packet chain before "off" is kept untouched. * if offp == NULL, the target will start at on resulting chain. * if offp != NULL, the target will start at on resulting chain. * * on error return (NULL return value), original "m" will be freed. * * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf. */ struct mbuf * m_pulldown(struct mbuf *m, int off, int len, int *offp) { struct mbuf *n, *o; int hlen, tlen, olen; int writable; /* check invalid arguments. */ if (m == NULL) panic("m == NULL in m_pulldown()"); if (len > MCLBYTES) { m_freem(m); return NULL; /* impossible */ } #ifdef PULLDOWN_DEBUG { struct mbuf *t; printf("before:"); for (t = m; t; t = t->m_next) printf(" %d", t->m_len); printf("\n"); } #endif n = m; while (n != NULL && off > 0) { if (n->m_len > off) break; off -= n->m_len; n = n->m_next; } /* be sure to point non-empty mbuf */ while (n != NULL && n->m_len == 0) n = n->m_next; if (!n) { m_freem(m); return NULL; /* mbuf chain too short */ } /* * XXX: This code is flawed because it considers a "writable" mbuf * data region to require all of the following: * (i) mbuf _has_ to have M_EXT set; if it is just a regular * mbuf, it is still not considered "writable." * (ii) since mbuf has M_EXT, the ext_type _has_ to be * EXT_CLUSTER. Anything else makes it non-writable. * (iii) M_WRITABLE() must evaluate true. * Ideally, the requirement should only be (iii). * * If we're writable, we're sure we're writable, because the ref. count * cannot increase from 1, as that would require posession of mbuf * n by someone else (which is impossible). 
However, if we're _not_ * writable, we may eventually become writable )if the ref. count drops * to 1), but we'll fail to notice it unless we re-evaluate * M_WRITABLE(). For now, we only evaluate once at the beginning and * live with this. */ /* * XXX: This is dumb. If we're just a regular mbuf with no M_EXT, * then we're not "writable," according to this code. */ writable = 0; if ((n->m_flags & M_EXT) == 0 || (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n))) writable = 1; /* * the target data is on . * if we got enough data on the mbuf "n", we're done. */ if ((off == 0 || offp) && len <= n->m_len - off && writable) goto ok; /* * when len <= n->m_len - off and off != 0, it is a special case. * len bytes from sits in single mbuf, but the caller does * not like the starting position (off). * chop the current mbuf into two pieces, set off to 0. */ if (len <= n->m_len - off) { o = m_dup1(n, off, n->m_len - off, M_DONTWAIT); if (o == NULL) { m_freem(m); return NULL; /* ENOBUFS */ } n->m_len = off; o->m_next = n->m_next; n->m_next = o; n = n->m_next; off = 0; goto ok; } /* * we need to take hlen from and tlen from m_next, 0>, * and construct contiguous mbuf with m_len == len. * note that hlen + tlen == len, and tlen > 0. */ hlen = n->m_len - off; tlen = len - hlen; /* * ensure that we have enough trailing data on mbuf chain. * if not, we can do nothing about the chain. */ olen = 0; for (o = n->m_next; o != NULL; o = o->m_next) olen += o->m_len; if (hlen + olen < len) { m_freem(m); return NULL; /* mbuf chain too short */ } /* * easy cases first. * we need to use m_copydata() to get data from m_next, 0>. */ if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen && writable) { m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len); n->m_len += tlen; m_adj(n->m_next, tlen); goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen && writable) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); n->m_len -= hlen; n = n->m_next; off = 0; goto ok; } /* * now, we need to do the hard way. don't m_copy as there's no room * on both end. */ MGET(o, M_DONTWAIT, m->m_type); if (o && len > MLEN) { MCLGET(o, M_DONTWAIT); if ((o->m_flags & M_EXT) == 0) { m_free(o); o = NULL; } } if (!o) { m_freem(m); return NULL; /* ENOBUFS */ } /* get hlen from into */ o->m_len = hlen; bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen); n->m_len -= hlen; /* get tlen from m_next, 0> into */ m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len); o->m_len += tlen; m_adj(n->m_next, tlen); o->m_next = n->m_next; n->m_next = o; n = o; off = 0; ok: #ifdef PULLDOWN_DEBUG { struct mbuf *t; printf("after:"); for (t = m; t; t = t->m_next) printf("%c%d", t == n ? '*' : ' ', t->m_len); printf(" (off=%d)\n", off); } #endif if (offp) *offp = off; return n; } static struct mbuf * m_dup1(struct mbuf *m, int off, int len, int wait) { struct mbuf *n; int l; int copyhdr; if (len > MCLBYTES) return NULL; if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { copyhdr = 1; MGETHDR(n, wait, m->m_type); l = MHLEN; } else { copyhdr = 0; MGET(n, wait, m->m_type); l = MLEN; } if (n && len > l) { MCLGET(n, wait); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) return NULL; if (copyhdr) M_COPY_PKTHDR(n, m); m_copydata(m, off, len, mtod(n, caddr_t)); return n; } -/* - * pkthdr.aux chain manipulation. - * we don't allow clusters at this moment. 
- */ -struct mbuf * -m_aux_add2(struct mbuf *m, int af, int type, void *p) +/* Get a packet tag structure along with specified data following. */ +struct m_tag * +m_tag_alloc(u_int32_t cookie, int type, int len, int wait) { - struct mbuf *n; - struct mauxtag *t; + struct m_tag *t; - if ((m->m_flags & M_PKTHDR) == 0) + if (len < 0) return NULL; + t = malloc(len + sizeof(struct m_tag), M_PACKET_TAGS, wait); + if (t == NULL) + return NULL; + t->m_tag_id = type; + t->m_tag_len = len; + t->m_tag_cookie = cookie; + return t; +} - n = m_aux_find(m, af, type); - if (n) - return n; +/* Free a packet tag. */ +void +m_tag_free(struct m_tag *t) +{ + free(t, M_PACKET_TAGS); +} - MGET(n, M_DONTWAIT, m->m_type); - if (n == NULL) - return NULL; +/* Prepend a packet tag. */ +void +m_tag_prepend(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_prepend: null argument, m %p t %p", m, t)); + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); +} - t = mtod(n, struct mauxtag *); - bzero(t, sizeof(*t)); - t->af = af; - t->type = type; - t->p = p; - n->m_data += sizeof(struct mauxtag); - n->m_len = 0; - n->m_next = m->m_pkthdr.aux; - m->m_pkthdr.aux = n; - return n; +/* Unlink a packet tag. */ +void +m_tag_unlink(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_unlink: null argument, m %p t %p", m, t)); + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } -struct mbuf * -m_aux_find2(struct mbuf *m, int af, int type, void *p) +/* Unlink and free a packet tag. */ +void +m_tag_delete(struct mbuf *m, struct m_tag *t) { - struct mbuf *n; - struct mauxtag *t; + KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t)); + m_tag_unlink(m, t); + m_tag_free(t); +} - if ((m->m_flags & M_PKTHDR) == 0) - return NULL; +/* Unlink and free a packet tag chain, starting from given tag. */ +void +m_tag_delete_chain(struct mbuf *m, struct m_tag *t) +{ + struct m_tag *p, *q; - for (n = m->m_pkthdr.aux; n; n = n->m_next) { - t = (struct mauxtag *)n->m_dat; - if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { - printf("m_aux_find: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); - continue; - } - if (t->af == af && t->type == type && t->p == p) - return n; + KASSERT(m, ("m_tag_delete_chain: null mbuf")); + if (t != NULL) + p = t; + else + p = SLIST_FIRST(&m->m_pkthdr.tags); + if (p == NULL) + return; + while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) + m_tag_delete(m, q); + m_tag_delete(m, p); +} + +/* Find a tag, starting from a given position. */ +struct m_tag * +m_tag_locate(struct mbuf *m, u_int32_t cookie, int type, struct m_tag *t) +{ + struct m_tag *p; + + KASSERT(m, ("m_tag_find: null mbuf")); + if (t == NULL) + p = SLIST_FIRST(&m->m_pkthdr.tags); + else + p = SLIST_NEXT(t, m_tag_link); + while (p != NULL) { + if (p->m_tag_cookie == cookie && p->m_tag_id == type) + return p; + p = SLIST_NEXT(p, m_tag_link); } return NULL; } -struct mbuf * -m_aux_find(struct mbuf *m, int af, int type) +/* Copy a single tag. */ +struct m_tag * +m_tag_copy(struct m_tag *t) { + struct m_tag *p; - return m_aux_find2(m, af, type, NULL); + KASSERT(t, ("m_tag_copy: null tag")); + p = m_tag_alloc(t->m_tag_cookie, t->m_tag_id, t->m_tag_len, M_NOWAIT); + if (p == NULL) + return (NULL); + bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */ + return p; } -struct mbuf * -m_aux_add(struct mbuf *m, int af, int type) +/* + * Copy two tag chains. The destination mbuf (to) loses any attached + * tags even if the operation fails. 
This should not be a problem, as + * m_tag_copy_chain() is typically called with a newly-allocated + * destination mbuf. + */ +int +m_tag_copy_chain(struct mbuf *to, struct mbuf *from) { + struct m_tag *p, *t, *tprev = NULL; - return m_aux_add2(m, af, type, NULL); + KASSERT(to && from, + ("m_tag_copy: null argument, to %p from %p", to, from)); + m_tag_delete_chain(to, NULL); + SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) { + t = m_tag_copy(p); + if (t == NULL) { + m_tag_delete_chain(to, NULL); + return 0; + } + if (tprev == NULL) + SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link); + else { + SLIST_INSERT_AFTER(tprev, t, m_tag_link); + tprev = t; + } + } + return 1; } +/* Initialize tags on an mbuf. */ void -m_aux_delete(struct mbuf *m, struct mbuf *victim) +m_tag_init(struct mbuf *m) { - struct mbuf *n, *prev, *next; - struct mauxtag *t; + SLIST_INIT(&m->m_pkthdr.tags); +} - if ((m->m_flags & M_PKTHDR) == 0) - return; +/* Get first tag in chain. */ +struct m_tag * +m_tag_first(struct mbuf *m) +{ + return SLIST_FIRST(&m->m_pkthdr.tags); +} - prev = NULL; - n = m->m_pkthdr.aux; - while (n) { - t = (struct mauxtag *)n->m_dat; - next = n->m_next; - if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { - printf("m_aux_delete: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); - prev = n; - n = next; - continue; - } - if (n == victim) { - if (prev) - prev->m_next = n->m_next; - else - m->m_pkthdr.aux = n->m_next; - n->m_next = NULL; - m_free(n); - return; - } else - prev = n; - n = next; - } +/* Get next tag in chain. */ +struct m_tag * +m_tag_next(struct mbuf *m, struct m_tag *t) +{ + return SLIST_NEXT(t, m_tag_link); } Index: head/sys/net/bridge.c =================================================================== --- head/sys/net/bridge.c (revision 105193) +++ head/sys/net/bridge.c (revision 105194) @@ -1,1143 +1,1143 @@ /* * Copyright (c) 1998-2002 Luigi Rizzo * * Work partly supported by: Cisco Systems, Inc. - NSITE lab, RTP, NC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This code implements bridging in FreeBSD. It only acts on ethernet * interfaces, including VLANs (others are still usable for routing). * A FreeBSD host can implement multiple logical bridges, called * "clusters". 
Each cluster is made of a set of interfaces, and * identified by a "cluster-id" which is a number in the range 1..2^16-1. * * Bridging is enabled by the sysctl variable * net.link.ether.bridge * the grouping of interfaces into clusters is done with * net.link.ether.bridge_cfg * containing a list of interfaces each optionally followed by * a colon and the cluster it belongs to (1 is the default). * Separators can be * spaces, commas or tabs, e.g. * net.link.ether.bridge_cfg="fxp0:2 fxp1:2 dc0 dc1:1" * Optionally bridged packets can be passed through the firewall, * this is controlled by the variable * net.link.ether.bridge_ipfw * * For each cluster there is a descriptor (cluster_softc) storing * the following data structures: * - a hash table with the MAC address and destination interface for each * known node. The table is indexed using a hash of the source address. * - an array with the MAC addresses of the interfaces used in the cluster. * * Input packets are tapped near the beginning of ether_input(), and * analysed by bridge_in(). Depending on the result, the packet * can be forwarded to one or more output interfaces using bdg_forward(), * and/or sent to the upper layer (e.g. in case of multicast). * * Output packets are intercepted near the end of ether_output(). * The correct destination is selected by bridge_dst_lookup(), * and then forwarding is done by bdg_forward(). * * The arp code is also modified to let a machine answer to requests * irrespective of the port the request came from. * * In case of loops in the bridging topology, the bridge detects this * event and temporarily mutes output bridging on one of the ports. * Periodically, interfaces are unmuted by bdg_timeout(). * Muting is only implemented as a safety measure, and also as * a mechanism to support a user-space implementation of the spanning * tree algorithm. * * To build a bridging kernel, use the following option * option BRIDGE * and then at runtime set the sysctl variable to enable bridging. * * Only one interface per cluster is supposed to have addresses set (but * there are no substantial problems if you set addresses for none or * for more than one interface). * Bridging will act before routing, but nothing prevents a machine * from doing both (modulo bugs in the implementation...). * * THINGS TO REMEMBER * - bridging is incompatible with multicast routing on the same * machine. There is not an easy fix to this. * - be very careful when bridging VLANs * - loop detection is still not very robust. */ #include #include #include #include #include #include /* for net/if.h */ #include /* string functions */ #include #include #include /* for ipfilter */ #include #include #include #include /* for struct arpcom */ #include #include #include #include /* for struct arpcom */ #include #include #include #include /*--------------------*/ /* * For each cluster, source MAC addresses are stored into a hash * table which locates the port they reside on. */ #define HASH_SIZE 8192 /* Table size, must be a power of 2 */ typedef struct hash_table { /* each entry. */ struct ifnet * name; u_char etheraddr[6]; u_int16_t used; /* also, padding */ } bdg_hash_table ; /* * The hash function applied to MAC addresses. Out of the 6 bytes, * the last ones tend to vary more. Since we are on a little endian machine, * we have to do some gimmick... */ #define HASH_FN(addr) ( \ ntohs( ((u_int16_t *)addr)[1] ^ ((u_int16_t *)addr)[2] ) & (HASH_SIZE -1)) /* * This is the data structure where local addresses are stored. 
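 *
 * Membership test sketch (this mirrors the loop in bridge_dst_lookup()
 * below): an address is local to cluster "c" if it matches any entry
 * in the my_macs array:
 *
 *	for (index = c->ports, p = c->my_macs; index; index--, p++)
 *		if (BDG_MATCH(p->etheraddr, eh->ether_dhost))
 *			return BDG_LOCAL;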
*/ struct bdg_addr { u_char etheraddr[6] ; u_int16_t _padding ; }; /* * The configuration of each cluster includes the cluster id, a pointer to * the hash table, and an array of local MAC addresses (of size "ports"). */ struct cluster_softc { u_int16_t cluster_id; u_int16_t ports; bdg_hash_table *ht; struct bdg_addr *my_macs; /* local MAC addresses */ }; extern struct protosw inetsw[]; /* from netinet/ip_input.c */ extern u_char ip_protox[]; /* from netinet/ip_input.c */ static int n_clusters; /* number of clusters */ static struct cluster_softc *clusters; #define BDG_MUTED(ifp) (ifp2sc[ifp->if_index].flags & IFF_MUTE) #define BDG_MUTE(ifp) ifp2sc[ifp->if_index].flags |= IFF_MUTE #define BDG_CLUSTER(ifp) (ifp2sc[ifp->if_index].cluster) #define BDG_SAMECLUSTER(ifp,src) \ (src == NULL || BDG_CLUSTER(ifp) == BDG_CLUSTER(src) ) #ifdef __i386__ #define BDG_MATCH(a,b) ( \ ((u_int16_t *)(a))[2] == ((u_int16_t *)(b))[2] && \ *((u_int32_t *)(a)) == *((u_int32_t *)(b)) ) #define IS_ETHER_BROADCAST(a) ( \ *((u_int32_t *)(a)) == 0xffffffff && \ ((u_int16_t *)(a))[2] == 0xffff ) #else /* for machines that do not support unaligned access */ #define BDG_MATCH(a,b) (!bcmp(a, b, ETHER_ADDR_LEN) ) #define IS_ETHER_BROADCAST(a) (!bcmp(a, "\377\377\377\377\377\377", 6)) #endif /* * For timing-related debugging, you can use the following macros. * remember, rdtsc() only works on Pentium-class machines quad_t ticks; DDB(ticks = rdtsc();) ... interesting code ... DDB(bdg_fw_ticks += (u_long)(rdtsc() - ticks) ; bdg_fw_count++ ;) * */ #define DDB(x) x #define DEB(x) static int bdginit(void); static void parse_bdg_cfg(void); static int bdg_ipf; /* IPFilter enabled in bridge */ static int bdg_ipfw; #if 0 /* debugging only */ static char *bdg_dst_names[] = { "BDG_NULL ", "BDG_BCAST ", "BDG_MCAST ", "BDG_LOCAL ", "BDG_DROP ", "BDG_UNKNOWN ", "BDG_IN ", "BDG_OUT ", "BDG_FORWARD " }; #endif /* * System initialization */ static struct bdg_stats bdg_stats ; static struct callout_handle bdg_timeout_h ; /* * Add an interface to a cluster, possibly creating a new entry in * the cluster table. This requires reallocation of the table and * updating pointers in ifp2sc. 
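 *
 * Typical call, as issued from parse_bdg_cfg() below; note that the
 * cluster id is passed in network byte order:
 *
 *	b->cluster = add_cluster(htons(cluster), (struct arpcom *)ifp);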
*/ static struct cluster_softc * add_cluster(u_int16_t cluster_id, struct arpcom *ac) { struct cluster_softc *c = NULL; int i; for (i = 0; i < n_clusters ; i++) if (clusters[i].cluster_id == cluster_id) goto found; /* Not found, need to reallocate */ c = malloc((1+n_clusters) * sizeof (*c), M_IFADDR, M_DONTWAIT | M_ZERO); if (c == NULL) {/* malloc failure */ printf("-- bridge: cannot add new cluster\n"); return NULL; } c[n_clusters].ht = (struct hash_table *) malloc(HASH_SIZE * sizeof(struct hash_table), M_IFADDR, M_WAITOK | M_ZERO); if (c[n_clusters].ht == NULL) { printf("-- bridge: cannot allocate hash table for new cluster\n"); free(c, M_IFADDR); return NULL; } c[n_clusters].my_macs = (struct bdg_addr *) malloc(BDG_MAX_PORTS * sizeof(struct bdg_addr), M_IFADDR, M_WAITOK | M_ZERO); if (c[n_clusters].my_macs == NULL) { printf("-- bridge: cannot allocate mac addr table for new cluster\n"); free(c[n_clusters].ht, M_IFADDR); free(c, M_IFADDR); return NULL; } c[n_clusters].cluster_id = cluster_id; c[n_clusters].ports = 0; /* * now copy old descriptors here */ if (n_clusters > 0) { for (i=0; i < n_clusters; i++) c[i] = clusters[i]; /* * and finally update pointers in ifp2sc */ for (i = 0 ; i < if_index && i < BDG_MAX_PORTS; i++) if (ifp2sc[i].cluster != NULL) ifp2sc[i].cluster = c + (ifp2sc[i].cluster - clusters); free(clusters, M_IFADDR); } clusters = c; i = n_clusters; /* index of cluster entry */ n_clusters++; found: c = clusters + i; /* the right cluster ... */ bcopy(ac->ac_enaddr, &(c->my_macs[c->ports]), 6); c->ports++; return c; } /* * Turn off bridging, by clearing promisc mode on the interface, * marking the interface as unused, and clearing the name in the * stats entry. * Also dispose the hash tables associated with the clusters. */ static void bridge_off(void) { struct ifnet *ifp ; int i, s; DEB(printf("bridge_off: n_clusters %d\n", n_clusters);) TAILQ_FOREACH(ifp, &ifnet, if_link) { struct bdg_softc *b; if (ifp->if_index >= BDG_MAX_PORTS) continue; /* make sure we do not go beyond the end */ b = &(ifp2sc[ifp->if_index]); if ( b->flags & IFF_BDG_PROMISC ) { s = splimp(); ifpromisc(ifp, 0); splx(s); b->flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ; DEB(printf(">> now %s%d promisc OFF if_flags 0x%x bdg_flags 0x%x\n", ifp->if_name, ifp->if_unit, ifp->if_flags, b->flags);) } b->flags &= ~(IFF_USED) ; b->cluster = NULL; bdg_stats.s[ifp->if_index].name[0] = '\0'; } /* flush_tables */ s = splimp(); for (i=0; i < n_clusters; i++) { free(clusters[i].ht, M_IFADDR); free(clusters[i].my_macs, M_IFADDR); } if (clusters != NULL) free(clusters, M_IFADDR); clusters = NULL; n_clusters =0; splx(s); } /* * set promisc mode on the interfaces we use. */ static void bridge_on(void) { struct ifnet *ifp ; int s ; TAILQ_FOREACH(ifp, &ifnet, if_link) { struct bdg_softc *b = &ifp2sc[ifp->if_index]; if ( !(b->flags & IFF_USED) ) continue ; if ( !( ifp->if_flags & IFF_UP) ) { s = splimp(); if_up(ifp); splx(s); } if ( !(b->flags & IFF_BDG_PROMISC) ) { int ret ; s = splimp(); ret = ifpromisc(ifp, 1); splx(s); b->flags |= IFF_BDG_PROMISC ; DEB(printf(">> now %s%d promisc ON if_flags 0x%x bdg_flags 0x%x\n", ifp->if_name, ifp->if_unit, ifp->if_flags, b->flags);) } if (b->flags & IFF_MUTE) { DEB(printf(">> unmuting %s%d\n", ifp->if_name, ifp->if_unit);) b->flags &= ~IFF_MUTE; } } } /** * reconfigure bridge. * This is also done every time we attach or detach an interface. * Main use is to make sure that we do not bridge on some old * (ejected) device. 
So, it would be really useful to have a * pointer to the modified device as an argument. Without it, we * have to scan all interfaces. */ static void reconfigure_bridge(void) { bridge_off(); if (do_bridge) { if (if_index >= BDG_MAX_PORTS) { printf("-- sorry too many interfaces (%d, max is %d)," " disabling bridging\n", if_index, BDG_MAX_PORTS); do_bridge=0; return; } parse_bdg_cfg(); bridge_on(); } } static char bridge_cfg[1024]; /* in BSS so initialized to all NULs */ /* * parse the config string, set IFF_USED, name and cluster_id * for all interfaces found. * The config string is a list of "if[:cluster]" with * a number of possible separators (see "sep"). In particular the * use of the space lets you set bridge_cfg with the output from * "ifconfig -l" */ static void parse_bdg_cfg() { char *p, *beg ; int l, cluster; static char *sep = ", \t"; for (p = bridge_cfg; *p ; p++) { struct ifnet *ifp; int found = 0; char c; if (index(sep, *p)) /* skip separators */ continue ; /* names are lowercase and digits */ for ( beg = p ; islower(*p) || isdigit(*p) ; p++ ) ; l = p - beg ; /* length of name string */ if (l == 0) /* invalid name */ break ; if ( *p != ':' ) /* no ':', assume default cluster 1 */ cluster = 1 ; else /* fetch cluster */ cluster = strtoul( p+1, &p, 10); c = *p; *p = '\0'; /* * now search in interface list for a matching name */ TAILQ_FOREACH(ifp, &ifnet, if_link) { char buf[IFNAMSIZ]; snprintf(buf, sizeof(buf), "%s%d", ifp->if_name, ifp->if_unit); if (!strncmp(beg, buf, max(l, strlen(buf)))) { struct bdg_softc *b = &ifp2sc[ifp->if_index]; if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) { printf("%s is not an ethernet, continue\n", buf); continue; } if (b->flags & IFF_USED) { printf("%s already used, skipping\n", buf); break; } b->cluster = add_cluster(htons(cluster), (struct arpcom *)ifp); b->flags |= IFF_USED ; sprintf(bdg_stats.s[ifp->if_index].name, "%s%d:%d", ifp->if_name, ifp->if_unit, cluster); DEB(printf("--++ found %s next c %d\n", bdg_stats.s[ifp->if_index].name, c);) found = 1; break ; } } if (!found) printf("interface %s Not found in bridge\n", beg); *p = c; if (c == '\0') break; /* no more */ } } /* * handler for net.link.ether.bridge */ static int sysctl_bdg(SYSCTL_HANDLER_ARGS) { int error, oldval = do_bridge ; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); DEB( printf("called sysctl for bridge name %s arg2 %d val %d->%d\n", oidp->oid_name, oidp->oid_arg2, oldval, do_bridge); ) if (oldval != do_bridge) reconfigure_bridge(); return error ; } /* * handler for net.link.ether.bridge_cfg */ static int sysctl_bdg_cfg(SYSCTL_HANDLER_ARGS) { int error = 0 ; char old_cfg[1024] ; strcpy(old_cfg, bridge_cfg) ; error = sysctl_handle_string(oidp, bridge_cfg, oidp->oid_arg2, req); DEB( printf("called sysctl for bridge name %s arg2 %d err %d val %s->%s\n", oidp->oid_name, oidp->oid_arg2, error, old_cfg, bridge_cfg); ) if (strcmp(old_cfg, bridge_cfg)) reconfigure_bridge(); return error ; } static int sysctl_refresh(SYSCTL_HANDLER_ARGS) { if (req->newptr) reconfigure_bridge(); return 0; } SYSCTL_DECL(_net_link_ether); SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_cfg, CTLTYPE_STRING|CTLFLAG_RW, &bridge_cfg, sizeof(bridge_cfg), &sysctl_bdg_cfg, "A", "Bridge configuration"); SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge, CTLTYPE_INT|CTLFLAG_RW, &do_bridge, 0, &sysctl_bdg, "I", "Bridging"); SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw, CTLFLAG_RW, &bdg_ipfw,0,"Pass bridged pkts through firewall"); SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipf, 
CTLFLAG_RW, &bdg_ipf, 0,"Pass bridged pkts through IPFilter"); /* * The follow macro declares a variable, and maps it to * a SYSCTL_INT entry with the same name. */ #define SY(parent, var, comment) \ static int var ; \ SYSCTL_INT(parent, OID_AUTO, var, CTLFLAG_RW, &(var), 0, comment); int bdg_ipfw_drops; SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_drop, CTLFLAG_RW, &bdg_ipfw_drops,0,""); int bdg_ipfw_colls; SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_collisions, CTLFLAG_RW, &bdg_ipfw_colls,0,""); SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_refresh, CTLTYPE_INT|CTLFLAG_WR, NULL, 0, &sysctl_refresh, "I", "iface refresh"); #if 1 /* diagnostic vars */ SY(_net_link_ether, verbose, "Be verbose"); SY(_net_link_ether, bdg_split_pkts, "Packets split in bdg_forward"); SY(_net_link_ether, bdg_thru, "Packets through bridge"); SY(_net_link_ether, bdg_copied, "Packets copied in bdg_forward"); SY(_net_link_ether, bdg_copy, "Force copy in bdg_forward"); SY(_net_link_ether, bdg_predict, "Correctly predicted header location"); SY(_net_link_ether, bdg_fw_avg, "Cycle counter avg"); SY(_net_link_ether, bdg_fw_ticks, "Cycle counter item"); SY(_net_link_ether, bdg_fw_count, "Cycle counter count"); #endif SYSCTL_STRUCT(_net_link_ether, PF_BDG, bdgstats, CTLFLAG_RD, &bdg_stats , bdg_stats, "bridge statistics"); static int bdg_loops ; /* * called periodically to flush entries etc. */ static void bdg_timeout(void *dummy) { static int slowtimer; /* in BSS so initialized to 0 */ if (do_bridge) { static int age_index = 0 ; /* index of table position to age */ int l = age_index + HASH_SIZE/4 ; int i; /* * age entries in the forwarding table. */ if (l > HASH_SIZE) l = HASH_SIZE ; for (i=0; i= HASH_SIZE) age_index = 0 ; if (--slowtimer <= 0 ) { slowtimer = 5 ; bridge_on() ; /* we just need unmute, really */ bdg_loops = 0 ; } } bdg_timeout_h = timeout(bdg_timeout, NULL, 2*hz ); } /* * Find the right pkt destination: * BDG_BCAST is a broadcast * BDG_MCAST is a multicast * BDG_LOCAL is for a local address * BDG_DROP must be dropped * other ifp of the dest. interface (incl.self) * * We assume this is only called for interfaces for which bridging * is enabled, i.e. BDG_USED(ifp) is true. */ static __inline struct ifnet * bridge_dst_lookup(struct ether_header *eh, struct cluster_softc *c) { struct ifnet *dst ; int index ; struct bdg_addr *p ; bdg_hash_table *bt; /* pointer to entry in hash table */ if (IS_ETHER_BROADCAST(eh->ether_dhost)) return BDG_BCAST ; if (eh->ether_dhost[0] & 1) return BDG_MCAST ; /* * Lookup local addresses in case one matches. */ for (index = c->ports, p = c->my_macs; index ; index--, p++ ) if (BDG_MATCH(p->etheraddr, eh->ether_dhost) ) return BDG_LOCAL ; /* * Look for a possible destination in table */ index= HASH_FN( eh->ether_dhost ); bt = &(c->ht[index]); dst = bt->name; if ( dst && BDG_MATCH( bt->etheraddr, eh->ether_dhost) ) return dst ; else return BDG_UNKNOWN ; } /** * bridge_in() is invoked to perform bridging decision on input packets. * * On Input: * eh Ethernet header of the incoming packet. * ifp interface the packet is coming from. * * On Return: destination of packet, one of * BDG_BCAST broadcast * BDG_MCAST multicast * BDG_LOCAL is only for a local address (do not forward) * BDG_DROP drop the packet * ifp ifp of the destination interface. * * Forwarding is not done directly to give a chance to some drivers * to fetch more of the packet, or simply drop it completely. 
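 *
 * Caller-side sketch (hypothetical driver code in the ether_input()
 * path; "m" is the received packet):
 *
 *	struct ifnet *bdg_dst;
 *
 *	bdg_dst = bridge_in(ifp, eh);
 *	if (bdg_dst == BDG_DROP) {
 *		m_freem(m);
 *		return;
 *	}
 *
 * otherwise the packet is handed to bdg_forward() and/or passed up the
 * local stack, depending on the returned destination.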
*/ static struct ifnet * bridge_in(struct ifnet *ifp, struct ether_header *eh) { int index; struct ifnet *dst , *old ; bdg_hash_table *bt; /* location in hash table */ int dropit = BDG_MUTED(ifp) ; /* * hash the source address */ index= HASH_FN(eh->ether_shost); bt = &(ifp2sc[ifp->if_index].cluster->ht[index]); bt->used = 1 ; old = bt->name ; if ( old ) { /* the entry is valid. */ if (!BDG_MATCH( eh->ether_shost, bt->etheraddr) ) { bdg_ipfw_colls++ ; bt->name = NULL ; } else if (old != ifp) { /* * Found a loop. Either a machine has moved, or there * is a misconfiguration/reconfiguration of the network. * First, do not forward this packet! * Record the relocation anyways; then, if loops persist, * suspect a reconfiguration and disable forwarding * from the old interface. */ bt->name = ifp ; /* relocate address */ printf("-- loop (%d) %6D to %s%d from %s%d (%s)\n", bdg_loops, eh->ether_shost, ".", ifp->if_name, ifp->if_unit, old->if_name, old->if_unit, BDG_MUTED(old) ? "muted":"active"); dropit = 1 ; if ( !BDG_MUTED(old) ) { if (++bdg_loops > 10) BDG_MUTE(old) ; } } } /* * now write the source address into the table */ if (bt->name == NULL) { DEB(printf("new addr %6D at %d for %s%d\n", eh->ether_shost, ".", index, ifp->if_name, ifp->if_unit);) bcopy(eh->ether_shost, bt->etheraddr, 6); bt->name = ifp ; } dst = bridge_dst_lookup(eh, ifp2sc[ifp->if_index].cluster); /* * bridge_dst_lookup can return the following values: * BDG_BCAST, BDG_MCAST, BDG_LOCAL, BDG_UNKNOWN, BDG_DROP, ifp. * For muted interfaces, or when we detect a loop, the first 3 are * changed in BDG_LOCAL (we still listen to incoming traffic), * and others to BDG_DROP (no use for the local host). * Also, for incoming packets, ifp is changed to BDG_DROP if ifp == src. * These changes are not necessary for outgoing packets from ether_output(). */ BDG_STAT(ifp, BDG_IN); switch ((uintptr_t)dst) { case (uintptr_t)BDG_BCAST: case (uintptr_t)BDG_MCAST: case (uintptr_t)BDG_LOCAL: case (uintptr_t)BDG_UNKNOWN: case (uintptr_t)BDG_DROP: BDG_STAT(ifp, dst); break ; default : if (dst == ifp || dropit) BDG_STAT(ifp, BDG_DROP); else BDG_STAT(ifp, BDG_FORWARD); break ; } if ( dropit ) { if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_LOCAL) dst = BDG_LOCAL ; else dst = BDG_DROP ; } else { if (dst == ifp) dst = BDG_DROP; } DEB(printf("bridge_in %6D ->%6D ty 0x%04x dst %s%d\n", eh->ether_shost, ".", eh->ether_dhost, ".", ntohs(eh->ether_type), (dst <= BDG_FORWARD) ? bdg_dst_names[(int)dst] : dst->if_name, (dst <= BDG_FORWARD) ? 0 : dst->if_unit); ) return dst ; } /* * Forward a packet to dst -- which can be a single interface or * an entire cluster. The src port and muted interfaces are excluded. * * If src == NULL, the pkt comes from ether_output, and dst is the real * interface the packet is originally sent to. In this case, we must forward * it to the whole cluster. * We never call bdg_forward from ether_output on interfaces which are * not part of a cluster. * * If possible (i.e. we can determine that the caller does not need * a copy), the packet is consumed here, and bdg_forward returns NULL. * Otherwise, a pointer to a copy of the packet is returned. 
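 *
 * A hedged sketch of that contract from a hypothetical caller:
 *
 *	m = bdg_forward_ptr(m, eh, dst);
 *	if (m == NULL)
 *		return;
 *
 * and if m is non-NULL the caller still owns it, and must transmit or
 * m_freem() it itself.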
* * XXX be careful with eh, it can be a pointer into *m */ static struct mbuf * bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst) { struct ifnet *src; struct ifnet *ifp, *last; int shared = bdg_copy ; /* someone else is using the mbuf */ int once = 0; /* loop only once */ struct ifnet *real_dst = dst ; /* real dst from ether_output */ struct ip_fw_args args; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; int rv; #endif /* PFIL_HOOKS */ /* * XXX eh is usually a pointer within the mbuf (some ethernet drivers * do that), so we better copy it before doing anything with the mbuf, * or we might corrupt the header. */ struct ether_header save_eh = *eh ; DEB(quad_t ticks; ticks = rdtsc();) args.rule = NULL; /* did we match a firewall rule ? */ /* Fetch state from dummynet tag, ignore others */ for (;m0->m_type == MT_TAG; m0 = m0->m_next) - if (m0->m_tag_id == PACKET_TAG_DUMMYNET) { + if (m0->_m_tag_id == PACKET_TAG_DUMMYNET) { args.rule = ((struct dn_pkt *)m0)->rule; shared = 0; /* For sure this is our own mbuf. */ } if (args.rule == NULL) bdg_thru++; /* first time through bdg_forward, count packet */ src = m0->m_pkthdr.rcvif; if (src == NULL) /* packet from ether_output */ dst = bridge_dst_lookup(eh, ifp2sc[real_dst->if_index].cluster); if (dst == BDG_DROP) { /* this should not happen */ printf("xx bdg_forward for BDG_DROP\n"); m_freem(m0); return NULL; } if (dst == BDG_LOCAL) { /* this should not happen as well */ printf("xx ouch, bdg_forward for local pkt\n"); return m0; } if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_UNKNOWN) { ifp = TAILQ_FIRST(&ifnet) ; /* scan all ports */ once = 0 ; if (dst != BDG_UNKNOWN) /* need a copy for the local stack */ shared = 1 ; } else { ifp = dst ; once = 1 ; } if ((uintptr_t)(ifp) <= (u_int)BDG_FORWARD) panic("bdg_forward: bad dst"); /* * Do filtering in a very similar way to what is done in ip_output. * Only if firewall is loaded, enabled, and the packet is not * from ether_output() (src==NULL, or we would filter it twice). * Additional restrictions may apply e.g. non-IP, short packets, * and pkts already gone through a pipe. */ if (src != NULL && ( #ifdef PFIL_HOOKS ((pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh)) != NULL && bdg_ipf !=0) || #endif (IPFW_LOADED && bdg_ipfw != 0))) { int i; if (args.rule != NULL && fw_one_pass) goto forward; /* packet already partially processed */ /* * i need some amt of data to be contiguous, and in case others need * the packet (shared==1) also better be in the first mbuf. */ i = min(m0->m_pkthdr.len, max_protohdr) ; if ( shared || m0->m_len < i) { m0 = m_pullup(m0, i) ; if (m0 == NULL) { printf("-- bdg: pullup failed.\n") ; return NULL ; } } #ifdef PFIL_HOOKS /* * NetBSD-style generic packet filter, pfil(9), hooks. * Enables ipf(8) in bridging. */ if (m0->m_pkthdr.len >= sizeof(struct ip) && ntohs(save_eh.ether_type) == ETHERTYPE_IP) { /* * before calling the firewall, swap fields the same as IP does. * here we assume the pkt is an IP one and the header is contiguous */ struct ip *ip = mtod(m0, struct ip *); ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, ip->ip_hl << 2, src, 0, &m0); if (rv != 0 || m0 == NULL) return m0; ip = mtod(m0, struct ip *); } /* * If we get here, the firewall has passed the pkt, but the mbuf * pointer might have changed. Restore ip and the fields ntohs()'d. 
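 * (That is, ip_len and ip_off are in host order while the pfil hooks
 * run, just as ip_input() would hand them over; the htons() calls
 * right below put them back in network order before the mbuf
 * continues through the bridge.)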
*/ ip = mtod(m0, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); } #endif /* PFIL_HOOKS */ /* * Prepare arguments and call the firewall. */ if (!IPFW_LOADED || bdg_ipfw == 0) goto forward; /* not using ipfw, accept the packet */ /* * XXX The following code is very similar to the one in * if_ethersubr.c:ether_ipfw_chk() */ args.m = m0; /* the packet we are looking at */ args.oif = NULL; /* this is an input packet */ args.divert_rule = 0; /* we do not support divert yet */ args.next_hop = NULL; /* we do not support forward yet */ args.eh = &save_eh; /* MAC header for bridged/MAC packets */ i = ip_fw_chk_ptr(&args); m0 = args.m; /* in case the firewall used the mbuf */ if ( (i & IP_FW_PORT_DENY_FLAG) || m0 == NULL) /* drop */ return m0 ; if (i == 0) /* a PASS rule. */ goto forward ; if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG)) { /* * Pass the pkt to dummynet, which consumes it. * If shared, make a copy and keep the original. */ struct mbuf *m ; if (shared) { m = m_copypacket(m0, M_DONTWAIT); if (m == NULL) /* copy failed, give up */ return m0; } else { m = m0 ; /* pass the original to dummynet */ m0 = NULL ; /* and nothing back to the caller */ } /* * Prepend the header, optimize for the common case of * eh pointing into the mbuf. */ if ( (void *)(eh + 1) == (void *)m->m_data) { m->m_data -= ETHER_HDR_LEN ; m->m_len += ETHER_HDR_LEN ; m->m_pkthdr.len += ETHER_HDR_LEN ; bdg_predict++; } else { M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); if (m == NULL) /* nope... */ return m0 ; bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } args.oif = real_dst; ip_dn_io_ptr(m, (i & 0xffff),DN_TO_BDG_FWD, &args); return m0 ; } /* * XXX at some point, add support for divert/forward actions. * If none of the above matches, we have to drop the packet. */ bdg_ipfw_drops++ ; return m0 ; } forward: /* * Again, bring up the headers in case of shared bufs to avoid * corruptions in the future. */ if ( shared ) { int i = min(m0->m_pkthdr.len, max_protohdr) ; m0 = m_pullup(m0, i) ; if (m0 == NULL) return NULL ; } /* * now real_dst is used to determine the cluster where to forward. * For packets coming from ether_input, this is the one of the 'src' * interface, whereas for locally generated packets (src==NULL) it * is the cluster of the original destination interface, which * was already saved into real_dst. */ if (src != NULL) real_dst = src ; last = NULL; for (;;) { if (last) { /* need to forward packet leftover from previous loop */ struct mbuf *m ; if (shared == 0 && once ) { /* no need to copy */ m = m0 ; m0 = NULL ; /* original is gone */ } else { m = m_copypacket(m0, M_DONTWAIT); if (m == NULL) { printf("bdg_forward: sorry, m_copypacket failed!\n"); return m0 ; /* the original is still there... */ } } /* * Add header (optimized for the common case of eh pointing * already into the mbuf) and execute last part of ether_output: * queue pkt and start output if interface not yet active. */ if ( (void *)(eh + 1) == (void *)m->m_data) { m->m_data -= ETHER_HDR_LEN ; m->m_len += ETHER_HDR_LEN ; m->m_pkthdr.len += ETHER_HDR_LEN ; bdg_predict++; } else { M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); if (!m && verbose) printf("M_PREPEND failed\n"); if (m == NULL) return m0; bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } if (!IF_HANDOFF(&last->if_snd, m, last)) { #if 0 BDG_MUTE(last); /* should I also mute ? 
*/ #endif } BDG_STAT(last, BDG_OUT); last = NULL ; if (once) break ; } if (ifp == NULL) break ; /* * If the interface is used for bridging, not muted, not full, * up and running, is not the source interface, and belongs to * the same cluster as the 'real_dst', then send here. */ if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !_IF_QFULL(&ifp->if_snd) && (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) && ifp != src && BDG_SAMECLUSTER(ifp, real_dst) ) last = ifp ; ifp = TAILQ_NEXT(ifp, if_link) ; if (ifp == NULL) once = 1 ; } DEB(bdg_fw_ticks += (u_long)(rdtsc() - ticks) ; bdg_fw_count++ ; if (bdg_fw_count != 0) bdg_fw_avg = bdg_fw_ticks/bdg_fw_count; ) return m0 ; } /* * initialization of bridge code. */ static int bdginit(void) { printf("BRIDGE 020214 loaded\n"); ifp2sc = malloc(BDG_MAX_PORTS * sizeof(struct bdg_softc), M_IFADDR, M_WAITOK | M_ZERO ); if (ifp2sc == NULL) return ENOMEM ; bridge_in_ptr = bridge_in; bdg_forward_ptr = bdg_forward; bdgtakeifaces_ptr = reconfigure_bridge; n_clusters = 0; clusters = NULL; do_bridge=0; bzero(&bdg_stats, sizeof(bdg_stats) ); bdgtakeifaces_ptr(); bdg_timeout(0); return 0 ; } /* * initialization code, both for static and dynamic loading. */ static int bridge_modevent(module_t mod, int type, void *unused) { int s; int err = 0 ; switch (type) { case MOD_LOAD: if (BDG_LOADED) { err = EEXIST; break ; } s = splimp(); err = bdginit(); splx(s); break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("bridge statically compiled, cannot unload\n"); err = EINVAL ; #else s = splimp(); do_bridge = 0; bridge_in_ptr = NULL; bdg_forward_ptr = NULL; bdgtakeifaces_ptr = NULL; untimeout(bdg_timeout, NULL, bdg_timeout_h); bridge_off(); if (clusters) free(clusters, M_IFADDR); free(ifp2sc, M_IFADDR); ifp2sc = NULL ; splx(s); #endif break; default: err = EINVAL ; break; } return err; } static moduledata_t bridge_mod = { "bridge", bridge_modevent, 0 }; DECLARE_MODULE(bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(bridge, 1); Index: head/sys/net/if_gre.c =================================================================== --- head/sys/net/if_gre.c (revision 105193) +++ head/sys/net/if_gre.c (revision 105194) @@ -1,756 +1,756 @@ /* $NetBSD: if_gre.c,v 1.42 2002/08/14 00:23:27 itojun Exp $ */ /* $FreeBSD$ */ /* * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Encapsulate L3 protocols into IP * See RFC 1701 and 1702 for more details. * If_gre is compatible with Cisco GRE tunnels, so you can * have a NetBSD box as the other end of a tunnel interface of a Cisco * router. See gre(4) for more details. * Also supported: IP in IP encaps (proto 55) as of RFC 2004 */ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_ns.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #include #include #include #include #else #error "Huh? if_gre without inet?" #endif #include #include #include /* * It is not easy to calculate the right value for a GRE MTU. * We leave this task to the admin and use the same default that * other vendors use. */ #define GREMTU 1476 #define GRENAME "gre" static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); struct gre_softc_head gre_softc_list; static int gre_clone_create __P((struct if_clone *, int)); static void gre_clone_destroy __P((struct ifnet *)); static int gre_ioctl(struct ifnet *, u_long, caddr_t); static int gre_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *rt); static struct if_clone gre_cloner = IF_CLONE_INITIALIZER("gre", gre_clone_create, gre_clone_destroy, 0, IF_MAXUNIT); static int gre_compute_route(struct gre_softc *sc); static void greattach __P((void)); #ifdef INET extern struct domain inetdomain; static const struct protosw in_gre_protosw = { SOCK_RAW, &inetdomain, IPPROTO_GRE, PR_ATOMIC|PR_ADDR, (pr_input_t*)gre_input, (pr_output_t*)rip_output, rip_ctlinput, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; static const struct protosw in_mobile_protosw = { SOCK_RAW, &inetdomain, IPPROTO_MOBILE, PR_ATOMIC|PR_ADDR, (pr_input_t*)gre_mobile_input, (pr_output_t*)rip_output, rip_ctlinput, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_OTHER, gre, CTLFLAG_RW, 0, "Generic Routing Encapsulation"); #ifndef MAX_GRE_NEST /* * This macro controls the default upper limitation on nesting of gre tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gre tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. 
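 *
 * The limit is also exported read/write at run time through the
 * net.link.gre.max_nesting sysctl declared below, so as a sketch
 * (no kernel rebuild needed):
 *	sysctl net.link.gre.max_nesting=2
 * raises the recursion bound that gre_output() enforces.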
*/ #define MAX_GRE_NEST 1 #endif static int max_gre_nesting = MAX_GRE_NEST; SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW, &max_gre_nesting, 0, "Max nested tunnels"); /* ARGSUSED */ static void greattach(void) { LIST_INIT(&gre_softc_list); if_clone_attach(&gre_cloner); } static int gre_clone_create(ifc, unit) struct if_clone *ifc; int unit; { struct gre_softc *sc; sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK); memset(sc, 0, sizeof(struct gre_softc)); sc->sc_if.if_name = GRENAME; sc->sc_if.if_softc = sc; sc->sc_if.if_unit = unit; sc->sc_if.if_snd.ifq_maxlen = IFQ_MAXLEN; sc->sc_if.if_type = IFT_OTHER; sc->sc_if.if_addrlen = 0; sc->sc_if.if_hdrlen = 24; /* IP + GRE */ sc->sc_if.if_mtu = GREMTU; sc->sc_if.if_flags = IFF_POINTOPOINT|IFF_MULTICAST; sc->sc_if.if_output = gre_output; sc->sc_if.if_ioctl = gre_ioctl; sc->g_dst.s_addr = sc->g_src.s_addr = INADDR_ANY; sc->g_proto = IPPROTO_GRE; sc->sc_if.if_flags |= IFF_LINK0; sc->encap = NULL; sc->called = 0; if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_NULL, sizeof(u_int32_t)); LIST_INSERT_HEAD(&gre_softc_list, sc, sc_list); return (0); } static void gre_clone_destroy(ifp) struct ifnet *ifp; { struct gre_softc *sc = ifp->if_softc; #ifdef INET if (sc->encap != NULL) encap_detach(sc->encap); #endif LIST_REMOVE(sc, sc_list); bpfdetach(ifp); if_detach(ifp); free(sc, M_GRE); } /* * The output routine. Takes a packet and encapsulates it in the protocol * given by sc->g_proto. See also RFC 1701 and RFC 2004 */ static int gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { int error = 0; struct gre_softc *sc = ifp->if_softc; struct greip *gh; struct ip *ip; u_char osrc; u_short etype = 0; struct mobile_h mob_h; /* * gre may cause infinite recursion when misconfigured. * We prevent this by introducing an upper limit. */ if (++(sc->called) > max_gre_nesting) { printf("%s: gre_output: recursively called too many " "times(%d)\n", if_name(&sc->sc_if), sc->called); m_freem(m); error = EIO; /* is there a better errno? */ goto end; } if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) == 0 || sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) { m_freem(m); error = ENETDOWN; goto end; } gh = NULL; ip = NULL; osrc = 0; if (ifp->if_bpf) { /* see the comment in other if_foo.c files */ struct mbuf m0; u_int32_t af = dst->sa_family; m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; bpf_mtap(ifp, &m0); } m->m_flags &= ~(M_BCAST|M_MCAST); if (sc->g_proto == IPPROTO_MOBILE) { if (dst->sa_family == AF_INET) { struct mbuf *m0; int msiz; ip = mtod(m, struct ip *); /* * RFC2004 specifies that fragmented datagrams shouldn't * be encapsulated. */ if ((ip->ip_off & IP_MF) != 0) { _IF_DROP(&ifp->if_snd); m_freem(m); error = EINVAL; /* is there a better errno? */ goto end; } memset(&mob_h, 0, MOB_H_SIZ_L); mob_h.proto = (ip->ip_p) << 8; mob_h.odst = ip->ip_dst.s_addr; ip->ip_dst.s_addr = sc->g_dst.s_addr; /* * If the packet comes from our host, we only change * the destination address in the IP header. 
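 * (i.e. the short MOB_H_SIZ_S form of the mobility header, with no
 * saved source field and MOB_H_SBIT clear).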
* Else we also need to save and change the source */ if (in_hosteq(ip->ip_src, sc->g_src)) { msiz = MOB_H_SIZ_S; } else { mob_h.proto |= MOB_H_SBIT; mob_h.osrc = ip->ip_src.s_addr; ip->ip_src.s_addr = sc->g_src.s_addr; msiz = MOB_H_SIZ_L; } mob_h.proto = htons(mob_h.proto); mob_h.hcrc = gre_in_cksum((u_short *)&mob_h, msiz); if ((m->m_data - msiz) < m->m_pktdat) { /* need new mbuf */ MGETHDR(m0, M_DONTWAIT, MT_HEADER); if (m0 == NULL) { _IF_DROP(&ifp->if_snd); m_freem(m); error = ENOBUFS; goto end; } m0->m_next = m; m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m0->m_pkthdr.len = m->m_pkthdr.len + msiz; m0->m_len = msiz + sizeof(struct ip); m0->m_data += max_linkhdr; memcpy(mtod(m0, caddr_t), (caddr_t)ip, sizeof(struct ip)); m = m0; } else { /* we have some space left in the old one */ m->m_data -= msiz; m->m_len += msiz; m->m_pkthdr.len += msiz; bcopy(ip, mtod(m, caddr_t), sizeof(struct ip)); } ip = mtod(m, struct ip *); memcpy((caddr_t)(ip + 1), &mob_h, (unsigned)msiz); ip->ip_len = ntohs(ip->ip_len) + msiz; } else { /* AF_INET */ _IF_DROP(&ifp->if_snd); m_freem(m); error = EINVAL; goto end; } } else if (sc->g_proto == IPPROTO_GRE) { switch (dst->sa_family) { case AF_INET: ip = mtod(m, struct ip *); etype = ETHERTYPE_IP; break; #ifdef NETATALK case AF_APPLETALK: etype = ETHERTYPE_ATALK; break; #endif #ifdef NS case AF_NS: etype = ETHERTYPE_NS; break; #endif default: _IF_DROP(&ifp->if_snd); m_freem(m); error = EAFNOSUPPORT; goto end; } M_PREPEND(m, sizeof(struct greip), M_DONTWAIT); } else { _IF_DROP(&ifp->if_snd); m_freem(m); error = EINVAL; goto end; } if (m == NULL) { /* impossible */ _IF_DROP(&ifp->if_snd); error = ENOBUFS; goto end; } gh = mtod(m, struct greip *); if (sc->g_proto == IPPROTO_GRE) { /* we don't have any GRE flags for now */ memset((void *)&gh->gi_g, 0, sizeof(struct gre_h)); gh->gi_ptype = htons(etype); } gh->gi_pr = sc->g_proto; if (sc->g_proto != IPPROTO_MOBILE) { gh->gi_src = sc->g_src; gh->gi_dst = sc->g_dst; ((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2; ((struct ip*)gh)->ip_ttl = GRE_TTL; ((struct ip*)gh)->ip_tos = ip->ip_tos; ((struct ip*)gh)->ip_id = ip->ip_id; gh->gi_len = m->m_pkthdr.len; } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; /* send it off */ - error = ip_output(m, NULL, &sc->route, 0, NULL); + error = ip_output(m, NULL, &sc->route, 0, NULL, NULL); end: sc->called = 0; if (error) ifp->if_oerrors++; return (error); } static int gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct if_laddrreq *lifr = (struct if_laddrreq *)data; struct in_aliasreq *aifr = (struct in_aliasreq *)data; struct gre_softc *sc = ifp->if_softc; int s; struct sockaddr_in si; struct sockaddr *sa = NULL; int error; struct sockaddr_in sp, sm, dp, dm; error = 0; s = splnet(); switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; break; case SIOCSIFDSTADDR: break; case SIOCSIFFLAGS: if ((error = suser(curthread)) != 0) break; if ((ifr->ifr_flags & IFF_LINK0) != 0) sc->g_proto = IPPROTO_GRE; else sc->g_proto = IPPROTO_MOBILE; goto recompute; case SIOCSIFMTU: if ((error = suser(curthread)) != 0) break; if (ifr->ifr_mtu < 576) { error = EINVAL; break; } ifp->if_mtu = ifr->ifr_mtu; break; case SIOCGIFMTU: ifr->ifr_mtu = sc->sc_if.if_mtu; break; case SIOCADDMULTI: case SIOCDELMULTI: if ((error = suser(curthread)) != 0) break; if (ifr == 0) { error = EAFNOSUPPORT; break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif default: error = EAFNOSUPPORT; break; } break; case 
GRESPROTO: if ((error = suser(curthread)) != 0) break; sc->g_proto = ifr->ifr_flags; switch (sc->g_proto) { case IPPROTO_GRE: ifp->if_flags |= IFF_LINK0; break; case IPPROTO_MOBILE: ifp->if_flags &= ~IFF_LINK0; break; default: error = EPROTONOSUPPORT; break; } goto recompute; case GREGPROTO: ifr->ifr_flags = sc->g_proto; break; case GRESADDRS: case GRESADDRD: if ((error = suser(curthread)) != 0) break; /* * set tunnel endpoints, compute a less specific route * to the remote end and mark it as up */ sa = &ifr->ifr_addr; if (cmd == GRESADDRS) sc->g_src = (satosin(sa))->sin_addr; if (cmd == GRESADDRD) sc->g_dst = (satosin(sa))->sin_addr; recompute: #ifdef INET if (sc->encap != NULL) { encap_detach(sc->encap); sc->encap = NULL; } #endif if ((sc->g_src.s_addr != INADDR_ANY) && (sc->g_dst.s_addr != INADDR_ANY)) { bzero(&sp, sizeof(sp)); bzero(&sm, sizeof(sm)); bzero(&dp, sizeof(dp)); bzero(&dm, sizeof(dm)); sp.sin_len = sm.sin_len = dp.sin_len = dm.sin_len = sizeof(struct sockaddr_in); sp.sin_family = sm.sin_family = dp.sin_family = dm.sin_family = AF_INET; sp.sin_addr = sc->g_src; dp.sin_addr = sc->g_dst; sm.sin_addr.s_addr = dm.sin_addr.s_addr = INADDR_BROADCAST; #ifdef INET sc->encap = encap_attach(AF_INET, sc->g_proto, sintosa(&sp), sintosa(&sm), sintosa(&dp), sintosa(&dm), (sc->g_proto == IPPROTO_GRE) ? &in_gre_protosw : &in_mobile_protosw, sc); if (sc->encap == NULL) printf("%s: unable to attach encap\n", if_name(&sc->sc_if)); #endif if (sc->route.ro_rt != 0) /* free old route */ RTFREE(sc->route.ro_rt); if (gre_compute_route(sc) == 0) ifp->if_flags |= IFF_RUNNING; else ifp->if_flags &= ~IFF_RUNNING; } break; case GREGADDRS: memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; sa = sintosa(&si); ifr->ifr_addr = *sa; break; case GREGADDRD: memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_dst.s_addr; sa = sintosa(&si); ifr->ifr_addr = *sa; break; case SIOCSIFPHYADDR: if ((error = suser(curthread)) != 0) break; if (aifr->ifra_addr.sin_family != AF_INET || aifr->ifra_dstaddr.sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (aifr->ifra_addr.sin_len != sizeof(si) || aifr->ifra_dstaddr.sin_len != sizeof(si)) { error = EINVAL; break; } sc->g_src = aifr->ifra_addr.sin_addr; sc->g_dst = aifr->ifra_dstaddr.sin_addr; goto recompute; case SIOCSLIFPHYADDR: if ((error = suser(curthread)) != 0) break; if (lifr->addr.ss_family != AF_INET || lifr->dstaddr.ss_family != AF_INET) { error = EAFNOSUPPORT; break; } if (lifr->addr.ss_len != sizeof(si) || lifr->dstaddr.ss_len != sizeof(si)) { error = EINVAL; break; } sc->g_src = (satosin((struct sockaddr *)&lifr->addr))->sin_addr; sc->g_dst = (satosin((struct sockaddr *)&lifr->dstaddr))->sin_addr; goto recompute; case SIOCDIFPHYADDR: if ((error = suser(curthread)) != 0) break; sc->g_src.s_addr = INADDR_ANY; sc->g_dst.s_addr = INADDR_ANY; goto recompute; case SIOCGLIFPHYADDR: if (sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; memcpy(&lifr->addr, &si, sizeof(si)); si.sin_addr.s_addr = sc->g_dst.s_addr; memcpy(&lifr->dstaddr, &si, sizeof(si)); break; case SIOCGIFPSRCADDR: if (sc->g_src.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct 
sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); break; case SIOCGIFPDSTADDR: if (sc->g_dst.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_dst.s_addr; bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); break; default: error = EINVAL; break; } splx(s); return (error); } /* * computes a route to our destination that is not the one * which would be taken by ip_output(), as that one will loop back to * us. If the interface is p2p as a--->b, then a routing entry for * a-->b exists. If we now send a packet to b (e.g. ping b), it comes * down here, gets src=a, dst=b tacked on, and would be sent by * ip_output() straight back to if_gre. * The goal here is to compute a route to b that is less specific than * a-->b. We know that such a route exists, as in normal operation we * have at least a default route which matches. */ static int gre_compute_route(struct gre_softc *sc) { struct route *ro; u_int32_t a, b, c; ro = &sc->route; memset(ro, 0, sizeof(struct route)); ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(ro->ro_dst); /* * toggle the last bit, so our interface is not found, but a less * specific route is. I'd rather specify a shorter mask, * but this is not possible. Should work though. XXX * there is a simpler way ... */ if ((sc->sc_if.if_flags & IFF_LINK1) == 0) { a = ntohl(sc->g_dst.s_addr); b = a & 0x01; c = a & 0xfffffffe; b = b ^ 0x01; a = b | c; ((struct sockaddr_in *)&ro->ro_dst)->sin_addr.s_addr = htonl(a); } #ifdef DIAGNOSTIC printf("%s: searching a route to %s", if_name(&sc->sc_if), inet_ntoa(((struct sockaddr_in *)&ro->ro_dst)->sin_addr)); #endif rtalloc(ro); /* * check that this returned a route at all, and that the route * does not recurse back to ourselves */ if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp->if_softc == sc) { #ifdef DIAGNOSTIC if (ro->ro_rt == NULL) printf(" - no route found!\n"); else printf(" - route loops back to ourself!\n"); #endif return EADDRNOTAVAIL; } /* * now change it back - else ip_output will just drop * the route and search one to this interface ... */ if ((sc->sc_if.if_flags & IFF_LINK1) == 0) ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; #ifdef DIAGNOSTIC printf(", choosing %s with gateway %s", if_name(ro->ro_rt->rt_ifp), inet_ntoa(((struct sockaddr_in *)(ro->ro_rt->rt_gateway))->sin_addr)); printf("\n"); #endif return 0; } /* * do a checksum of a buffer - much like in_cksum, which operates on * mbufs. 
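 *
 * A worked example of the one's complement arithmetic used here:
 * summing the 16-bit words 0xffff and 0x0002 gives 0x10001; the
 * end-around carry folds this to 0x0001 + 0x0001 = 0x0002, and the
 * function returns the complement, ~0x0002 == 0xfffd.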
*/ u_short gre_in_cksum(u_short *p, u_int len) { u_int sum = 0; int nwords = len >> 1; while (nwords-- != 0) sum += *p++; if (len & 1) { union { u_short w; u_char c[2]; } u; u.c[0] = *(u_char *)p; u.c[1] = 0; sum += u.w; } /* end-around-carry */ sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); return (~sum); } static int gremodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: greattach(); break; case MOD_UNLOAD: if_clone_detach(&gre_cloner); while (!LIST_EMPTY(&gre_softc_list)) gre_clone_destroy(&LIST_FIRST(&gre_softc_list)->sc_if); break; } return 0; } static moduledata_t gre_mod = { "if_gre", gremodevent, 0 }; DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gre, 1); Index: head/sys/net/if_loop.c =================================================================== --- head/sys/net/if_loop.c (revision 105193) +++ head/sys/net/if_loop.c (revision 105194) @@ -1,449 +1,448 @@ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. 
*/ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef IPX #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #ifdef NS #include #include #endif #ifdef NETATALK #include #include #endif #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif #define LONAME "lo" struct lo_softc { struct ifnet sc_if; /* network-visible interface */ LIST_ENTRY(lo_softc) sc_next; }; int loioctl(struct ifnet *, u_long, caddr_t); static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); int lo_clone_create(struct if_clone *, int); void lo_clone_destroy(struct ifnet *); struct ifnet *loif = NULL; /* Used externally */ static MALLOC_DEFINE(M_LO, LONAME, "Loopback Interface"); static LIST_HEAD(lo_list, lo_softc) lo_list; struct if_clone lo_cloner = IF_CLONE_INITIALIZER(LONAME, lo_clone_create, lo_clone_destroy, 1, IF_MAXUNIT); void lo_clone_destroy(ifp) struct ifnet *ifp; { struct lo_softc *sc; sc = ifp->if_softc; /* XXX: destroying lo0 will lead to panics. */ KASSERT(loif != ifp, ("%s: destroying lo0", __func__)); bpfdetach(ifp); if_detach(ifp); LIST_REMOVE(sc, sc_next); free(sc, M_LO); } int lo_clone_create(ifc, unit) struct if_clone *ifc; int unit; { struct lo_softc *sc; MALLOC(sc, struct lo_softc *, sizeof(*sc), M_LO, M_WAITOK | M_ZERO); sc->sc_if.if_name = LONAME; sc->sc_if.if_unit = unit; sc->sc_if.if_mtu = LOMTU; sc->sc_if.if_flags = IFF_LOOPBACK | IFF_MULTICAST; sc->sc_if.if_ioctl = loioctl; sc->sc_if.if_output = looutput; sc->sc_if.if_type = IFT_LOOP; sc->sc_if.if_snd.ifq_maxlen = ifqmaxlen; sc->sc_if.if_softc = sc; if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_NULL, sizeof(u_int)); LIST_INSERT_HEAD(&lo_list, sc, sc_next); if (loif == NULL) loif = &sc->sc_if; return (0); } static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: LIST_INIT(&lo_list); if_clone_attach(&lo_cloner); break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return EINVAL; } return 0; } static moduledata_t loop_mod = { "loop", loop_modevent, 0 }; DECLARE_MODULE(loop, loop_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); int looutput(ifp, m, dst, rt) struct ifnet *ifp; register struct mbuf *m; struct sockaddr *dst; register struct rtentry *rt; { if ((m->m_flags & M_PKTHDR) == 0) panic("looutput no HDR"); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } /* * KAME requires that the packet be contiguous in the * mbuf. We need to make sure of that. * This kind of code should be avoided. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_HEADER); if (!n) goto contiguousfail; MCLGET(n, M_DONTWAIT); if (! 
(n->m_flags & M_EXT)) { m_freem(n); goto contiguousfail; } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_pkthdr = m->m_pkthdr; n->m_len = m->m_pkthdr.len; - n->m_pkthdr.aux = m->m_pkthdr.aux; - m->m_pkthdr.aux = (struct mbuf *)NULL; + SLIST_INIT(&m->m_pkthdr.tags); m_freem(m); m = n; } if (0) { contiguousfail: printf("looutput: mbuf allocation failed\n"); } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; #if 1 /* XXX */ switch (dst->sa_family) { case AF_INET: case AF_INET6: case AF_IPX: case AF_NS: case AF_APPLETALK: break; default: printf("looutput: af=%d unexpected\n", dst->sa_family); m_freem(m); return (EAFNOSUPPORT); } #endif return(if_simloop(ifp, m, dst->sa_family, 0)); } /* * if_simloop() * * This function supports software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. * * This function expects the packet to include the media header of length hlen. */ int if_simloop(ifp, m, af, hlen) struct ifnet *ifp; struct mbuf *m; int af; int hlen; { int isr; struct ifqueue *inq = 0; KASSERT((m->m_flags & M_PKTHDR) != 0, ("if_simloop: no HDR")); m->m_pkthdr.rcvif = ifp; /* BPF write needs to be handled specially */ if (af == AF_UNSPEC) { KASSERT(m->m_len >= sizeof(int), ("if_simloop: m_len")); af = *(mtod(m, int *)); m->m_len -= sizeof(int); m->m_pkthdr.len -= sizeof(int); m->m_data += sizeof(int); } /* Let BPF see incoming packet */ if (ifp->if_bpf) { struct mbuf m0, *n = m; if (ifp->if_bpf->bif_dlt == DLT_NULL) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; n = &m0; } bpf_mtap(ifp, n); } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #if defined(__alpha__) || defined(__ia64__) || defined(__sparc64__) /* The alpha doesn't like unaligned data. * We move data down in the first mbuf */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); mtod(m,vm_offset_t) -= (mtod(m, vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: inq = &ipintrq; isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; inq = &ip6intrq; isr = NETISR_IPV6; break; #endif #ifdef IPX case AF_IPX: inq = &ipxintrq; isr = NETISR_IPX; break; #endif #ifdef NS case AF_NS: inq = &nsintrq; isr = NETISR_NS; break; #endif #ifdef NETATALK case AF_APPLETALK: inq = &atintrq2; isr = NETISR_ATALK; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; (void) IF_HANDOFF(inq, m, NULL); schednetisr(isr); return (0); } /* ARGSUSED */ static void lortrequest(cmd, rt, info) int cmd; struct rtentry *rt; struct rt_addrinfo *info; { if (rt) { rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ /* * For optimal performance, the send and receive buffers * should be at least twice the MTU plus a little more for * overhead. */ rt->rt_rmx.rmx_recvpipe = rt->rt_rmx.rmx_sendpipe = 3 * LOMTU; } } /* * Process an ioctl request. 
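 *
 * For illustration only (userland side, not part of this driver), the
 * SIOCSIFMTU case below is typically reached by code along these lines:
 *
 *	struct ifreq ifr;
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 *	ifr.ifr_mtu = 16384;
 *	ioctl(s, SIOCSIFMTU, &ifr);
 *
 * with s an AF_INET socket.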
*/ /* ARGSUSED */ int loioctl(ifp, cmd, data) register struct ifnet *ifp; u_long cmd; caddr_t data; { register struct ifaddr *ifa; register struct ifreq *ifr = (struct ifreq *)data; register int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP | IFF_RUNNING; ifa = (struct ifaddr *)data; ifa->ifa_rtrequest = lortrequest; /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: break; default: error = EINVAL; } return (error); } Index: head/sys/net/if_stf.c =================================================================== --- head/sys/net/if_stf.c (revision 105193) +++ head/sys/net/if_stf.c (revision 105194) @@ -1,750 +1,750 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /* * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. 
You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define STFNAME "stf" #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) #define GET_V4(x) ((struct in_addr *)(&(x)->s6_addr16[1])) struct stf_softc { struct ifnet sc_if; /* common area */ union { struct route __sc_ro4; struct route_in6 __sc_ro6; /* just for safety */ } __sc_ro46; #define sc_ro __sc_ro46.__sc_ro4 const struct encaptab *encap_cookie; LIST_ENTRY(stf_softc) sc_list; /* all stf's are linked */ }; static LIST_HEAD(, stf_softc) stf_softc_list; static MALLOC_DEFINE(M_STF, STFNAME, "6to4 Tunnel Interface"); static int ip_stf_ttl = 40; extern struct domain inetdomain; struct protosw in_stf_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, in_stf_input, (pr_output_t*)rip_output, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *); static int stf_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static void stf_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); int stf_clone_create(struct if_clone *, int); void stf_clone_destroy(struct ifnet *); /* only one clone is currently allowed */ struct if_clone stf_cloner = IF_CLONE_INITIALIZER(STFNAME, stf_clone_create, stf_clone_destroy, 0, 0); int stf_clone_create(ifc, unit) struct if_clone *ifc; int unit; { struct stf_softc *sc; sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); sc->sc_if.if_name = STFNAME; sc->sc_if.if_unit = unit; sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_protosw, sc); if (sc->encap_cookie == NULL) { printf("%s: attach failed\n", if_name(&sc->sc_if)); free(sc, M_STF); return (ENOMEM); } sc->sc_if.if_mtu = IPV6_MMTU; sc->sc_if.if_ioctl = stf_ioctl; sc->sc_if.if_output = stf_output; sc->sc_if.if_type = IFT_STF; 
sc->sc_if.if_snd.ifq_maxlen = IFQ_MAXLEN; if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_NULL, sizeof(u_int)); LIST_INSERT_HEAD(&stf_softc_list, sc, sc_list); return (0); } void stf_clone_destroy(ifp) struct ifnet *ifp; { int err; struct stf_softc *sc = (void *) ifp; LIST_REMOVE(sc, sc_list); err = encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); free(sc, M_STF); } static int stfmodevent(mod, type, data) module_t mod; int type; void *data; { switch (type) { case MOD_LOAD: LIST_INIT(&stf_softc_list); if_clone_attach(&stf_cloner); break; case MOD_UNLOAD: if_clone_detach(&stf_cloner); while (!LIST_EMPTY(&stf_softc_list)) stf_clone_destroy(&LIST_FIRST(&stf_softc_list)->sc_if); break; } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip ip; struct in6_ifaddr *ia6; struct stf_softc *sc; struct in_addr a, b; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((sc->sc_if.if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((sc->sc_if.if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; /* LINTED const cast */ m_copydata((struct mbuf *)(uintptr_t)m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; ia6 = stf_getsrcifa6(&sc->sc_if); if (ia6 == NULL) return 0; /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); a.s_addr = GET_V4(&ia6->ia_addr.sin6_addr)->s_addr; a.s_addr &= GET_V4(&ia6->ia_prefixmask.sin6_addr)->s_addr; b = ip.ip_src; b.s_addr &= GET_V4(&ia6->ia_prefixmask.sin6_addr)->s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface matches on a single side only */ return 32; } static struct in6_ifaddr * stf_getsrcifa6(ifp) struct ifnet *ifp; { struct ifaddr *ia; struct in_ifaddr *ia4; struct sockaddr_in6 *sin6; struct in_addr in; for (ia = TAILQ_FIRST(&ifp->if_addrlist); ia; ia = TAILQ_NEXT(ia, ifa_list)) { if (ia->ifa_addr == NULL) continue; if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; if (ia4 == NULL) continue; return (struct in6_ifaddr *)ia; } return NULL; } static int stf_output(ifp, m, dst, rt) struct ifnet *ifp; struct mbuf *m; struct sockaddr *dst; struct rtentry *rt; { struct stf_softc *sc; struct sockaddr_in6 *dst6; struct in_addr *in4; struct sockaddr_in *dst4; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_ifaddr *ia6; sc = (struct stf_softc*)ifp; dst6 = (struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); ifp->if_oerrors++; return ENETDOWN; } /* * If we don't have an ip4 address that matches our inner ip6 address, * we shouldn't generate output. 
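 * (For example, with a 6to4 source address of 2002:0a01:0101::1 the
 * outer IPv4 source must be the embedded 10.1.1.1, per the GET_V4()
 * mapping above.)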
Without this check, we'll end up * using the wrong IPv4 source. */ ia6 = stf_getsrcifa6(ifp); if (ia6 == NULL) { m_freem(m); ifp->if_oerrors++; return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { ifp->if_oerrors++; return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pick up the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) in4 = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) in4 = GET_V4(&dst6->sin6_addr); else { m_freem(m); ifp->if_oerrors++; return ENETUNREACH; } #if NBPFILTER > 0 if (ifp->if_bpf) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ struct mbuf m0; u_int32_t af = AF_INET6; m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; #ifdef HAVE_OLD_BPF bpf_mtap(ifp, &m0); #else bpf_mtap(ifp->if_bpf, &m0); #endif } #endif /*NBPFILTER > 0*/ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { ifp->if_oerrors++; return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), &ip->ip_src, sizeof(ip->ip_src)); bcopy(in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = m->m_pkthdr.len; /*host order*/ if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { /* cached route doesn't match */ dst4->sin_family = AF_INET; dst4->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); if (sc->sc_ro.ro_rt) { RTFREE(sc->sc_ro.ro_rt); sc->sc_ro.ro_rt = NULL; } } if (sc->sc_ro.ro_rt == NULL) { rtalloc(&sc->sc_ro); if (sc->sc_ro.ro_rt == NULL) { m_freem(m); ifp->if_oerrors++; return ENETUNREACH; } } ifp->if_opackets++; - return ip_output(m, NULL, &sc->sc_ro, 0, NULL); + return ip_output(m, NULL, &sc->sc_ro, 0, NULL, NULL); } static int isrfc1918addr(in) struct in_addr *in; { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if ((ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168) return 1; return 0; } static int stf_checkaddr4(sc, in, inifp) struct stf_softc *sc; struct in_addr *in; struct ifnet *inifp; /* incoming interface */ { struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. 
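 * (i.e. 10.0.0.0/8, 172.16.0.0/12 or 192.168.0.0/16, as tested by
 * isrfc1918addr() above)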
* (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with a broadcast address */ for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) return -1; } /* * perform ingress filter */ if (sc && (sc->sc_if.if_flags & IFF_LINK2) == 0 && inifp) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = *in; rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); if (!rt || rt->rt_ifp != inifp) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " "due to ingress filter\n", if_name(&sc->sc_if), (u_int32_t)ntohl(sin.sin_addr.s_addr)); #endif if (rt) rtfree(rt); return -1; } rtfree(rt); } return 0; } static int stf_checkaddr6(sc, in6, inifp) struct stf_softc *sc; struct in6_addr *in6; struct ifnet *inifp; /* incoming interface */ { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) return stf_checkaddr4(sc, GET_V4(in6), inifp); /* * reject anything that looks suspicious. The test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) be safe against future ip6_input changes. */ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } void in_stf_input(m, off) struct mbuf *m; int off; { int proto; struct stf_softc *sc; struct ip *ip; struct ip6_hdr *ip6; u_int8_t otos, itos; int len, isr; struct ifqueue *ifq = NULL; struct ifnet *ifp; proto = mtod(m, struct ip *)->ip_p; if (proto != IPPROTO_IPV6) { m_freem(m); return; } ip = mtod(m, struct ip *); sc = (struct stf_softc *)encap_getarg(m); if (sc == NULL || (sc->sc_if.if_flags & IFF_UP) == 0) { m_freem(m); return; } ifp = &sc->sc_if; /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return; } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return; } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return; } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (ifp->if_bpf) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ struct mbuf m0; u_int32_t af = AF_INET6; m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; #ifdef HAVE_OLD_BPF bpf_mtap(ifp, &m0); #else bpf_mtap(ifp->if_bpf, &m0); #endif } /* * Put the packet on the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reordering due to extra queueing. */ ifq = &ip6intrq; isr = NETISR_IPV6; len = m->m_pkthdr.len; if (! 
IF_HANDOFF(ifq, m, NULL)) return; schednetisr(isr); ifp->if_ipackets++; ifp->if_ibytes += len; } /* ARGSUSED */ static void stf_rtrequest(cmd, rt, info) int cmd; struct rtentry *rt; struct rt_addrinfo *info; { if (rt) rt->rt_rmx.rmx_mtu = IPV6_MMTU; } static int stf_ioctl(ifp, cmd, data) struct ifnet *ifp; u_long cmd; caddr_t data; { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; int error; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_ADDR_6TO4(&sin6->sin6_addr) && !isrfc1918addr(GET_V4(&sin6->sin6_addr))) { ifa->ifa_rtrequest = stf_rtrequest; ifp->if_flags |= IFF_UP; } else error = EINVAL; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; default: error = EINVAL; break; } return error; } Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 105193) +++ head/sys/netinet/igmp.c (revision 105194) @@ -1,493 +1,493 @@ /* * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 * $FreeBSD$ */ /* * Internet Group Management Protocol (IGMP) routines. * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. 
* * MULTICAST Revision: 3.5.1.4 */ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); static struct router_info * find_rti(struct ifnet *ifp); static struct igmpstat igmpstat; SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, &igmpstat, igmpstat, ""); static int igmp_timers_are_running; static u_long igmp_all_hosts_group; static u_long igmp_all_rtrs_group; static struct mbuf *router_alert; static struct router_info *Head; static void igmp_sendpkt(struct in_multi *, int, unsigned long); void igmp_init() { struct ipoption *ra; /* * To avoid byte-swapping the same value over and over again. */ igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); igmp_timers_are_running = 0; /* * Construct a Router Alert option to use in outgoing packets */ MGET(router_alert, M_DONTWAIT, MT_DATA); ra = mtod(router_alert, struct ipoption *); ra->ipopt_dst.s_addr = 0; ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ ra->ipopt_list[1] = 0x04; /* 4 bytes long */ ra->ipopt_list[2] = 0x00; ra->ipopt_list[3] = 0x00; router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; Head = (struct router_info *) 0; } static struct router_info * find_rti(ifp) struct ifnet *ifp; { register struct router_info *rti = Head; #ifdef IGMP_DEBUG printf("[igmp.c, _find_rti] --> entering \n"); #endif while (rti) { if (rti->rti_ifp == ifp) { #ifdef IGMP_DEBUG printf("[igmp.c, _find_rti] --> found old entry \n"); #endif return rti; } rti = rti->rti_next; } MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); rti->rti_ifp = ifp; rti->rti_type = IGMP_V2_ROUTER; rti->rti_time = 0; rti->rti_next = Head; Head = rti; #ifdef IGMP_DEBUG printf("[igmp.c, _find_rti] --> created an entry \n"); #endif return rti; } void igmp_input(m, off) register struct mbuf *m; int off; { register int iphlen = off; register struct igmp *igmp; register struct ip *ip; register int igmplen; register struct ifnet *ifp = m->m_pkthdr.rcvif; register int minlen; register struct in_multi *inm; register struct in_ifaddr *ia; struct in_multistep step; struct router_info *rti; int timer; /** timer value in the igmp query header **/ ++igmpstat.igps_rcv_total; ip = mtod(m, struct ip *); igmplen = ip->ip_len; /* * Validate lengths */ if (igmplen < IGMP_MINLEN) { ++igmpstat.igps_rcv_tooshort; m_freem(m); return; } minlen = iphlen + IGMP_MINLEN; if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { ++igmpstat.igps_rcv_tooshort; return; } /* * Validate checksum */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { ++igmpstat.igps_rcv_badsum; m_freem(m); return; } m->m_data -= iphlen; m->m_len += iphlen; ip = mtod(m, struct ip *); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; rti = find_rti(ifp); /* * In the IGMPv2 specification, there are 3 states and a flag. * * In Non-Member state, we simply don't have a membership record. * In Delaying Member state, our timer is running (inm->inm_timer) * In Idle Member state, our timer is not running (inm->inm_timer==0) * * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if * we have heard a report from another member, or IGMP_IREPORTEDLAST * if I sent the last report. 
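 *
 * As an illustrative sketch (not compiled in; it uses only the fields
 * named above), the three states can be read off a membership record
 * like this:
 */
#if 0
	const char *state;

	if (inm == NULL)
		state = "Non-Member";		/* no membership record */
	else if (inm->inm_timer != 0)
		state = "Delaying Member";	/* report timer running */
	else
		state = "Idle Member";		/* member, no report owed */
#endif
/*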
*/ switch (igmp->igmp_type) { case IGMP_MEMBERSHIP_QUERY: ++igmpstat.igps_rcv_queries; if (ifp->if_flags & IFF_LOOPBACK) break; if (igmp->igmp_code == 0) { /* * Old router. Remember that the querier on this * interface is old, and set the timer to the * value in RFC 1112. */ rti->rti_type = IGMP_V1_ROUTER; rti->rti_time = 0; timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; if (ip->ip_dst.s_addr != igmp_all_hosts_group || igmp->igmp_group.s_addr != 0) { ++igmpstat.igps_rcv_badqueries; m_freem(m); return; } } else { /* * New router. Simply do the new validity check. */ if (igmp->igmp_group.s_addr != 0 && !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { ++igmpstat.igps_rcv_badqueries; m_freem(m); return; } } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to the "all-hosts" group (224.0.0.1). * - Restart any timer that is already running but has * a value longer than the requested timeout. * - Use the value specified in the query message as * the maximum timeout. */ IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_ifp == ifp && inm->inm_addr.s_addr != igmp_all_hosts_group && (igmp->igmp_group.s_addr == 0 || igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { if (inm->inm_timer == 0 || inm->inm_timer > timer) { inm->inm_timer = IGMP_RANDOM_DELAY(timer); igmp_timers_are_running = 1; } } IN_NEXT_MULTI(step, inm); } break; case IGMP_V1_MEMBERSHIP_REPORT: case IGMP_V2_MEMBERSHIP_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * can potentially get looped back if we are a multicast * router, so discard reports sourced by me. */ IFP_TO_IA(ifp, ia); if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) break; ++igmpstat.igps_rcv_reports; if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { ++igmpstat.igps_rcv_badreports; m_freem(m); return; } /* * KLUDGE: if the IP source address of the report has an * unspecified (i.e., zero) subnet number, as is allowed for * a booting host, replace it with the correct subnet number * so that a process-level multicast routing daemon can * determine which subnet it arrived from. This is necessary * to compensate for the lack of any way for a process to * determine the arrival interface of an incoming packet. */ if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); /* * If we belong to the group being reported, stop * our timer for that group. */ IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); if (inm != NULL) { inm->inm_timer = 0; ++igmpstat.igps_rcv_ourreports; inm->inm_state = IGMP_OTHERMEMBER; } break; } /* * Pass all valid IGMP packets up to any process(es) listening * on a raw IGMP socket. 
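 */

/*
 * Worked example (illustrative): the max-response field of a query is
 * expressed in tenths of a second and fasttimo runs PR_FASTHZ (5)
 * times per second, so the conversion near the top of this function
 * turns igmp_code = 100 (i.e. 10 seconds) into 100 * 5 / 10 = 50
 * fast-timeout ticks.
 */

/*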
*/ rip_input(m, off); } void igmp_joingroup(inm) struct in_multi *inm; { int s = splnet(); if (inm->inm_addr.s_addr == igmp_all_hosts_group || inm->inm_ifp->if_flags & IFF_LOOPBACK) { inm->inm_timer = 0; inm->inm_state = IGMP_OTHERMEMBER; } else { inm->inm_rti = find_rti(inm->inm_ifp); igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); inm->inm_state = IGMP_IREPORTEDLAST; igmp_timers_are_running = 1; } splx(s); } void igmp_leavegroup(inm) struct in_multi *inm; { if (inm->inm_state == IGMP_IREPORTEDLAST && inm->inm_addr.s_addr != igmp_all_hosts_group && !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && inm->inm_rti->rti_type != IGMP_V1_ROUTER) igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); } void igmp_fasttimo() { register struct in_multi *inm; struct in_multistep step; int s; /* * Quick check to see if any work needs to be done, in order * to minimize the overhead of fasttimo processing. */ if (!igmp_timers_are_running) return; s = splnet(); igmp_timers_are_running = 0; IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_timer == 0) { /* do nothing */ } else if (--inm->inm_timer == 0) { igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_state = IGMP_IREPORTEDLAST; } else { igmp_timers_are_running = 1; } IN_NEXT_MULTI(step, inm); } splx(s); } void igmp_slowtimo() { int s = splnet(); register struct router_info *rti = Head; #ifdef IGMP_DEBUG printf("[igmp.c,_slowtimo] -- > entering \n"); #endif while (rti) { if (rti->rti_type == IGMP_V1_ROUTER) { rti->rti_time++; if (rti->rti_time >= IGMP_AGE_THRESHOLD) { rti->rti_type = IGMP_V2_ROUTER; } } rti = rti->rti_next; } #ifdef IGMP_DEBUG printf("[igmp.c,_slowtimo] -- > exiting \n"); #endif splx(s); } static struct route igmprt; static void igmp_sendpkt(inm, type, addr) struct in_multi *inm; int type; unsigned long addr; { struct mbuf *m; struct igmp *igmp; struct ip *ip; struct ip_moptions imo; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return; m->m_pkthdr.rcvif = loif; #ifdef MAC mac_create_mbuf_linklayer(inm->inm_ifp, m); #endif m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); m->m_data += sizeof(struct ip); m->m_len = IGMP_MINLEN; igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; imo.imo_multicast_ifp = inm->inm_ifp; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ imo.imo_multicast_loop = (ip_mrouter != NULL); /* * XXX * Do we have to worry about reentrancy here? Don't think so. 
*/ - ip_output(m, router_alert, &igmprt, 0, &imo); + ip_output(m, router_alert, &igmprt, 0, &imo, NULL); ++igmpstat.igps_snd_reports; } Index: head/sys/netinet/in_gif.c =================================================================== --- head/sys/netinet/in_gif.c (revision 105193) +++ head/sys/netinet/in_gif.c (revision 105194) @@ -1,354 +1,354 @@ /* $FreeBSD$ */ /* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef MROUTING #include #endif /* MROUTING */ #include #include static int ip_gif_ttl = GIF_TTL; SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, &ip_gif_ttl, 0, ""); int in_gif_output(ifp, family, m, rt) struct ifnet *ifp; int family; struct mbuf *m; struct rtentry *rt; { struct gif_softc *sc = (struct gif_softc*)ifp; struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; struct ip iphdr; /* capsule IP header, host byte ordered */ int proto, error; u_int8_t tos; if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || sin_dst->sin_family != AF_INET) { m_freem(m); return EAFNOSUPPORT; } switch (family) { #ifdef INET case AF_INET: { struct ip *ip; proto = IPPROTO_IPV4; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return ENOBUFS; } ip = mtod(m, struct ip *); tos = ip->ip_tos; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; proto = IPPROTO_IPV6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; break; } #endif /* INET6 */ default: #ifdef DEBUG printf("in_gif_output: warning: unknown family %d passed\n", family); #endif m_freem(m); return EAFNOSUPPORT; } bzero(&iphdr, sizeof(iphdr)); iphdr.ip_src = sin_src->sin_addr; /* bidirectional configured tunnel mode */ if (sin_dst->sin_addr.s_addr != INADDR_ANY) iphdr.ip_dst = sin_dst->sin_addr; else { m_freem(m); return ENETUNREACH; } iphdr.ip_p = proto; /* version will be set in ip_output() */ iphdr.ip_ttl = ip_gif_ttl; iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &iphdr.ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &iphdr.ip_tos, &tos); /* prepend new IP header */ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { printf("ENOBUFS in in_gif_output %d\n", __LINE__); return ENOBUFS; } bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); if (dst->sin_family != sin_dst->sin_family || dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { /* cache route doesn't match */ dst->sin_family = sin_dst->sin_family; dst->sin_len = sizeof(struct sockaddr_in); dst->sin_addr = sin_dst->sin_addr; if (sc->gif_ro.ro_rt) { RTFREE(sc->gif_ro.ro_rt); sc->gif_ro.ro_rt = NULL; } #if 0 sc->gif_if.if_mtu = GIF_MTU; #endif } if (sc->gif_ro.ro_rt == NULL) { rtalloc(&sc->gif_ro); if (sc->gif_ro.ro_rt == NULL) { m_freem(m); return ENETUNREACH; } /* if it constitutes infinite encapsulation, punt. 
*/ if (sc->gif_ro.ro_rt->rt_ifp == ifp) { m_freem(m); return ENETUNREACH; /* XXX */ } #if 0 ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu - sizeof(struct ip); #endif } - error = ip_output(m, NULL, &sc->gif_ro, 0, NULL); + error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); return(error); } void in_gif_input(m, off) struct mbuf *m; int off; { struct ifnet *gifp = NULL; struct ip *ip; int af; u_int8_t otos; int proto; ip = mtod(m, struct ip *); proto = ip->ip_p; gifp = (struct ifnet *)encap_getarg(m); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); ipstat.ips_nogif++; return; } otos = ip->ip_tos; m_adj(m, off); switch (proto) { #ifdef INET case IPPROTO_IPV4: { struct ip *ip; af = AF_INET; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return; } ip = mtod(m, struct ip *); if (gifp->if_flags & IFF_LINK1) ip_ecn_egress(ECN_ALLOWED, &otos, &ip->ip_tos); else ip_ecn_egress(ECN_NOCARE, &otos, &ip->ip_tos); break; } #endif #ifdef INET6 case IPPROTO_IPV6: { struct ip6_hdr *ip6; u_int8_t itos; af = AF_INET6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return; } ip6 = mtod(m, struct ip6_hdr *); itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if (gifp->if_flags & IFF_LINK1) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); break; } #endif /* INET6 */ default: ipstat.ips_nogif++; m_freem(m); return; } gif_input(m, af, gifp); return; } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int gif_encapcheck4(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip ip; struct gif_softc *sc; struct sockaddr_in *src, *dst; int addrmatch; struct in_ifaddr *ia4; /* sanity check done in caller */ sc = (struct gif_softc *)arg; src = (struct sockaddr_in *)sc->gif_psrc; dst = (struct sockaddr_in *)sc->gif_pdst; /* LINTED const cast */ m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); /* check for address match */ addrmatch = 0; if (src->sin_addr.s_addr == ip.ip_dst.s_addr) addrmatch |= 1; if (dst->sin_addr.s_addr == ip.ip_src.s_addr) addrmatch |= 2; if (addrmatch != 3) return 0; /* martian filters on outer source - NOT done in ip_input! 
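 */

/*
 * Illustrative note: addrmatch above is a two-bit score - bit 0 is set
 * when the outer destination equals the configured tunnel source, and
 * bit 1 when the outer source equals the configured tunnel destination -
 * so only a packet that matches the tunnel in both directions
 * (addrmatch == 3) survives the check.
 */

/*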
*/ if (IN_MULTICAST(ntohl(ip.ip_src.s_addr))) return 0; switch ((ntohl(ip.ip_src.s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return 0; } /* reject packets with broadcast on source */ TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) return 0; } /* ingress filters on outer source */ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = ip.ip_src; rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " "due to ingress filter\n", if_name(&sc->gif_if), (u_int32_t)ntohl(sin.sin_addr.s_addr)); #endif if (rt) rtfree(rt); return 0; } rtfree(rt); } return 32 * 2; } Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 105193) +++ head/sys/netinet/ip_divert.c (revision 105194) @@ -1,568 +1,568 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipsec.h" #include "opt_mac.h" #ifndef INET #error "IPDIVERT requires INET." 
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw, see the divert(4) * manpage for features. * Internally, packets selected by ipfw in ip_input() or ip_output(), * and never diverted before, are passed to the input queue of the * divert socket with a given 'divert_port' number (as specified in * the matching ipfw rule), and they are tagged with a 16 bit cookie * (representing the rule number of the matching ipfw rule), which * is passed to the process reading from the socket. * * Packets written to the divert socket are again tagged with a cookie * (usually the same as above) and a destination address. * If the destination address is INADDR_ANY then the packet is * treated as outgoing and sent to ip_output(), otherwise it is * treated as incoming and sent to ip_input(). * In both cases, the packet is tagged with the cookie. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * ipfw processing will start at the rule number after the one * written in the cookie (so, tagging a packet with a cookie of 0 * will cause it to be effectively considered as a standard packet). */ /* Internal variables */ static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* Optimization: have this preinitialized */ static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET }; /* * Initialize divert connection block queue. */ void div_init(void) { INP_INFO_LOCK_INIT(&divcbinfo, "div"); LIST_INIT(&divcb); divcbinfo.listhead = &divcb; /* * XXX We don't use the hash list for divert IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); } /* * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets * with that protocol number to enter the system from the outside. */ void div_input(struct mbuf *m, int off) { ipstat.ips_noproto++; m_freem(m); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Set up generic address and protocol structures for the div_input * routine, then pass them along with the mbuf chain. */ void divert_packet(struct mbuf *m, int incoming, int port, int rule) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; /* Sanity check */ KASSERT(port != 0, ("%s: port=0", __func__)); divsrc.sin_port = rule; /* record matching rule */ /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* * Record receive interface address, if any. * But only for incoming packets.
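 */

/*
 * Illustrative userland sketch (not part of this commit; it assumes
 * only the divert(4) interface described above, and the port number is
 * an arbitrary example): bind a divert socket, read one diverted
 * packet and reinject it unchanged.
 */
#if 0
static void
divert_echo(u_short port)
{
	struct sockaddr_in sin;
	socklen_t slen = sizeof(sin);
	char buf[65535];
	ssize_t n;
	int fd;

	fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
	bzero(&sin, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);	/* must match the ipfw divert rule */
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
	n = recvfrom(fd, buf, sizeof(buf), 0,
	    (struct sockaddr *)&sin, &slen);
	/* sin now carries the matching rule number and receive iface */
	if (n > 0)
		sendto(fd, buf, n, 0, (struct sockaddr *)&sin, slen);
}
#endif

/*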
*/ divsrc.sin_addr.s_addr = 0; if (incoming) { struct ifaddr *ifa; /* Sanity check */ KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __func__)); /* Find IP address for receive interface */ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero)); if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero), "%s%d", m->m_pkthdr.rcvif->if_name, m->m_pkthdr.rcvif->if_unit); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)port); LIST_FOREACH(inp, &divcb, inp_list) { if (inp->inp_lport == nport) sa = inp->inp_socket; } if (sa) { if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) m_freem(m); else sorwakeup(sa); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { int error = 0; struct m_hdr divert_tag; /* * Prepare the tag for divert info. Note that a packet * with a 0 tag in mh_data is effectively untagged, * so we could optimize that case. */ divert_tag.mh_type = MT_TAG; divert_tag.mh_flags = PACKET_TAG_DIVERT; divert_tag.mh_next = m; divert_tag.mh_data = 0; /* the matching rule # */ m->m_pkthdr.rcvif = NULL; /* XXX is it necessary ? */ #ifdef MAC mac_create_mbuf_from_socket(so, m); #endif if (control) m_freem(control); /* XXX */ /* Loopback avoidance and state recovery */ if (sin) { int i; divert_tag.mh_data = (caddr_t)(int)sin->sin_port; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. 
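 */

/*
 * Illustrative round trip (hypothetical interface name): divert_packet()
 * above stuffed, say, "fxp0" into sin_zero; a daemon that echoes the
 * sockaddr back unchanged lets the scan below find that NUL-terminated
 * name, and ifunit("fxp0") then recovers the receive interface.
 */

/*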
*/ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct inpcb *const inp = sotoinpcb(so); struct ip *const ip = mtod(m, struct ip *); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; goto cantsend; } /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); /* Send packet to output processing */ ipstat.ips_rawout++; /* XXX */ error = ip_output((struct mbuf *)&divert_tag, inp->inp_options, &inp->inp_route, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, - inp->inp_moptions); + inp->inp_moptions, NULL); } else { if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; } /* Send packet to input processing */ ip_input((struct mbuf *)&divert_tag); } return error; cantsend: m_freem(m); return error; } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error, s; inp = sotoinpcb(so); if (inp) panic("div_attach"); if (td && (error = suser(td)) != 0) return error; error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; s = splnet(); error = in_pcballoc(so, &divcbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; /* The socket is always "connected" because we always know "where" to send the packet */ so->so_state |= SS_ISCONNECTED; return 0; } static int div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); if (inp == 0) panic("div_detach"); in_pcbdetach(inp); return 0; } static int div_abort(struct socket *so) { soisdisconnected(so); return div_detach(so); } static int div_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return div_abort(so); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s; int error; s = splnet(); inp = sotoinpcb(so); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't supply one, we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbbind * and should probably have its own family.
*/ if (nam->sa_family != AF_INET) error = EAFNOSUPPORT; else { ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; error = in_pcbbind(inp, nam, td); } splx(s); return error; } static int div_shutdown(struct socket *so) { socantsendmore(so); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ s = splnet(); gencnt = divcbinfo.ipi_gencnt; n = divcbinfo.ipi_count; splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->td, inp)) inp_list[i++] = inp; } splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); xig.xig_gen = divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = divcbinfo.ipi_count; splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int div_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &divcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. 
*/ static int div_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &divcbinfo)); } SYSCTL_DECL(_net_inet_divert); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); struct pr_usrreqs div_usrreqs = { div_abort, pru_accept_notsupp, div_attach, div_bind, pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, div_disconnect, pru_listen_notsupp, div_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, div_sockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet/ip_dummynet.c =================================================================== --- head/sys/netinet/ip_dummynet.c (revision 105193) +++ head/sys/netinet/ip_dummynet.c (revision 105194) @@ -1,1971 +1,1971 @@ /* * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * This module implements IP dummynet, a bandwidth limiter/delay emulator * used in conjunction with the ipfw package. * Description of the data structures used is in ip_dummynet.h * Here you mainly find the following blocks of code: * + variable declarations; * + heap management functions; * + scheduler and dummynet functions; * + configuration and initialization. * * NOTA BENE: critical sections are protected by splimp()/splx() * pairs. One would think that splnet() is enough as for most of * the netinet code, but it is not so because when used with * bridging, dummynet is invoked at splimp(). * * Most important Changes: * * 011004: KLDable * 010124: Fixed WF2Q behaviour * 010122: Fixed spl protection. * 000601: WF2Q support * 000106: large rewrite, use heaps to handle very many pipes. 
* 980513: initial release * * include files marked with XXX are probably not needed */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for struct arpcom */ #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timer.c) */ static dn_key curr_time = 0 ; /* current simulation time */ static int dn_hash_size = 64 ; /* default hash size */ /* statistics on number of queue searches and search steps */ static int searches, search_steps ; static int pipe_expire = 1 ; /* expire queue if empty */ static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ /* * Three heaps contain queues and pipes that the scheduler handles: * * ready_heap contains all dn_flow_queue related to fixed-rate pipes. * * wfq_ready_heap contains the pipes associated with WF2Q flows * * extract_heap contains pipes associated with delay lines. * */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; static int heap_init(struct dn_heap *h, int size) ; static int heap_insert (struct dn_heap *h, dn_key key1, void *p); static void heap_extract(struct dn_heap *h, void *obj); static void transmit_event(struct dn_pipe *pipe); static void ready_event(struct dn_flow_queue *q); static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ static struct callout_handle dn_timeout; #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time, CTLFLAG_RD, &curr_time, 0, "Current tick"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, CTLFLAG_RD, &searches, 0, "Number of queue searches"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, CTLFLAG_RW, &dn_max_ratio, 0, "Max ratio between dynamic queues and buckets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); #endif static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); static void rt_unref(struct rtentry *); static void dummynet(void *); static void dummynet_flush(void); void dummynet_drain(void); static ip_dn_io_t dummynet_io; static void dn_rule_delete(void *); int if_tx_rdy(struct ifnet 
*ifp); static void rt_unref(struct rtentry *rt) { if (rt == NULL) return ; if (rt->rt_refcnt <= 0) printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); RTFREE(rt); } /* * Heap management functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * XXX failure to allocate a new element is a pretty bad failure * as we basically stall a whole queue forever!! * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( 2*(x) + 1 ) #define HEAP_IS_LEFT(x) ( (x) & 1 ) #define HEAP_RIGHT(x) ( 2*(x) + 2 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_init(struct dn_heap *h, int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) { printf("heap_init, Bogus call, have %d want %d\n", h->size, new_size); return 0 ; } new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT ); if (p == NULL) { printf(" heap_init, resize %d failed\n", new_size ); return 1 ; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DUMMYNET); } h->p = p ; h->size = new_size ; return 0 ; } /* * Insert element in heap. Normally, p != NULL, we insert p in * a new position and bubble up. If p == NULL, then the element is * already in place, and key is the position where to start the * bubble-up. * Returns 1 on failure (cannot allocate new heap entry) * * If offset > 0 the position (index, int) of the element in the heap is * also stored in the element itself at the given offset in bytes. */ #define SET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; /* * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. */ #define RESET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; static int heap_insert(struct dn_heap *h, dn_key key1, void *p) { int son = h->elements ; if (p == NULL) /* data already there, set starting point */ son = key1 ; else { /* insert new element at the end, possibly resize */ son = h->elements ; if (son == h->size) /* need resize... */ if (heap_init(h, h->elements+1) ) return 1 ; /* failure... 
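 */

/*
 * Worked example (illustrative): with the macros above, the parent of
 * node 5 is (5 - 1) / 2 = 2, and node 2's children are 2*2+1 = 5 and
 * 2*2+2 = 6. HEAP_INCREMENT = 15 rounds allocations up to multiples
 * of 16: heap_init(h, 17) allocates (17 + 15) & ~15 = 32 entries.
 */

/*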
*/ h->p[son].object = p ; h->p[son].key = key1 ; h->elements++ ; } while (son > 0) { /* bubble up */ int father = HEAP_FATHER(son) ; struct dn_heap_entry tmp ; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break ; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp) ; SET_OFFSET(h, son); son = father ; } SET_OFFSET(h, son); return 0 ; } /* * remove top element from heap, or obj if obj != NULL */ static void heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1 ; if (max < 0) { printf("warning, extract from empty heap 0x%p\n", h); return ; } father = 0 ; /* default: move up smallest child */ if (obj != NULL) { /* extract specific element, index is at offset */ if (h->offset <= 0) panic("*** heap_extract from middle not supported on this heap!!!\n"); father = *((int *)((char *)obj + h->offset)) ; if (father < 0 || father >= h->elements) { printf("dummynet: heap_extract, father %d out of bound 0..%d\n", father, h->elements); panic("heap_extract"); } } RESET_OFFSET(h, father); child = HEAP_LEFT(father) ; /* left child */ while (child <= max) { /* valid entry */ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child = child+1 ; /* take right child, otherwise left */ h->p[father] = h->p[child] ; SET_OFFSET(h, father); father = child ; child = HEAP_LEFT(child) ; /* left child for next loop */ } h->elements-- ; if (father != max) { /* * Fill hole with last entry and bubble up, reusing the insert code */ h->p[father] = h->p[max] ; heap_insert(h, father, NULL); /* this one cannot fail */ } } #if 0 /* * change object position and update references * XXX this one is never used! */ static void heap_move(struct dn_heap *h, dn_key new_key, void *object) { int temp; int i ; int max = h->elements-1 ; struct dn_heap_entry buf ; if (h->offset <= 0) panic("cannot move items on this heap"); i = *((int *)((char *)object + h->offset)); if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ h->p[i].key = new_key ; for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; i = temp ) { /* bubble up */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } } else { /* must move down */ h->p[i].key = new_key ; while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) temp++ ; /* select child with min key */ if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } else break ; i = temp ; } } SET_OFFSET(h, i); } #endif /* heap_move, unused */ /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. */ static void heapify(struct dn_heap *h) { int i ; for (i = 0 ; i < h->elements ; i++ ) heap_insert(h, i , NULL) ; } /* * cleanup the heap and free data structure */ static void heap_free(struct dn_heap *h) { if (h->size >0 ) free(h->p, M_DUMMYNET); bzero(h, sizeof(*h) ); } /* * --- end of heap management functions --- */ /* * Scheduler functions: * * transmit_event() is called when the delay-line needs to enter * the scheduler, either because of existing pkts getting ready, * or new packets entering the queue. The event handled is the delivery * time of the packet. * * ready_event() does something similar with fixed-rate queues, and the * event handled is the finish time of the head pkt. 
* * wfq_ready_event() does something similar with WF2Q queues, and the * event handled is the start time of the head pkt. * * In all cases, we make sure that the data structures are consistent * before passing pkts out, because this might trigger recursive * invocations of the procedures. */ static void transmit_event(struct dn_pipe *pipe) { struct dn_pkt *pkt ; while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { /* * first unlink, then call procedures, since ip_input() can invoke * ip_output() and viceversa, thus causing nested calls */ pipe->head = DN_NEXT(pkt) ; /* * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf * (NOT A REAL one, just a small block of malloc'ed memory) with * m_type = MT_TAG, m_flags = PACKET_TAG_DUMMYNET * dn_m (m_next) = actual mbuf to be processed by ip_input/output * and some other fields. * The block IS FREED HERE because it contains parameters passed * to the called routine. */ switch (pkt->dn_dir) { case DN_TO_IP_OUT: - (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); + (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL, NULL); rt_unref (pkt->ro.ro_rt) ; break ; case DN_TO_IP_IN : ip_input((struct mbuf *)pkt) ; break ; case DN_TO_BDG_FWD : if (!BDG_LOADED) { /* somebody unloaded the bridge module. Drop pkt */ printf("-- dropping bridged packet trapped in pipe--\n"); m_freem(pkt->dn_m); break; } /* fallthrough */ case DN_TO_ETH_DEMUX: { struct mbuf *m = (struct mbuf *)pkt ; struct ether_header *eh; if (pkt->dn_m->m_len < ETHER_HDR_LEN && (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/bridge: pullup fail, dropping pkt\n"); break; } /* * same as ether_input, make eh be a pointer into the mbuf */ eh = mtod(pkt->dn_m, struct ether_header *); m_adj(pkt->dn_m, ETHER_HDR_LEN); /* * bdg_forward() wants a pointer to the pseudo-mbuf-header, but * on return it will supply the pointer to the actual packet * (originally pkt->dn_m, but could be something else now) if * it has not consumed it. */ if (pkt->dn_dir == DN_TO_BDG_FWD) { m = bdg_forward_ptr(m, eh, pkt->ifp); if (m) m_freem(m); } else ether_demux(NULL, eh, m); /* which consumes the mbuf */ } break ; case DN_TO_ETH_OUT: ether_output_frame(pkt->ifp, (struct mbuf *)pkt); break; default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(pkt->dn_m); break ; } free(pkt, M_DUMMYNET); } /* if there are leftover packets, put into the heap for next event */ if ( (pkt = pipe->head) ) heap_insert(&extract_heap, pkt->output_time, pipe ) ; /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ } /* * the following macro computes how many ticks we have to wait * before being able to transmit a packet. The credit is taken from * either a pipe (WF2Q) or a flow_queue (per-flow queueing) */ #define SET_TICKS(pkt, q, p) \ (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ p->bandwidth ; /* * extract pkt from queue, compute output time (could be now) * and put into delay line (p_queue) */ static void move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, struct dn_pipe *p, int len) { q->head = DN_NEXT(pkt) ; q->len-- ; q->len_bytes -= len ; pkt->output_time = curr_time + p->delay ; if (p->head == NULL) p->head = pkt; else DN_NEXT(p->tail) = pkt; p->tail = pkt; DN_NEXT(p->tail) = NULL; } /* * ready_event() is invoked every time the queue must enter the * scheduler, either because the first packet arrives, or because * a previously scheduled event fired. 
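 *
 * (Worked example with illustrative numbers: through the SET_TICKS
 * macro above, with hz = 100 and p->bandwidth = 1000000 bits/s, a
 * 1250-byte packet starting from zero credit costs
 * 1250 * 8 * 100 = 1000000 units, i.e. exactly one tick of 10 ms -
 * its wire time at 1 Mbit/s.)
 *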
* On invocation, drain as many pkts as possible (could be 0) and then * if there are leftover packets reinsert the pkt in the scheduler. */ static void ready_event(struct dn_flow_queue *q) { struct dn_pkt *pkt; struct dn_pipe *p = q->fs->pipe ; int p_was_empty ; if (p == NULL) { printf("ready_event- pipe is gone\n"); return ; } p_was_empty = (p->head == NULL) ; /* * schedule fixed-rate queues linked to this pipe: * Account for the bw accumulated since last scheduling, then * drain as many pkts as allowed by q->numbytes and move to * the delay line (in p) computing output time. * bandwidth==0 (no limit) means we can drain the whole queue, * setting len_scaled = 0 does the job. */ q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; while ( (pkt = q->head) != NULL ) { int len = pkt->dn_m->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; if (len_scaled > q->numbytes ) break ; q->numbytes -= len_scaled ; move_pkt(pkt, q, p, len); } /* * If we have more packets queued, schedule next ready event * (can only occur when bandwidth != 0, otherwise we would have * flushed the whole queue in the previous loop). * To this purpose we record the current time and compute how many * ticks to go for the finish time of the packet. */ if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ q->sched_time = curr_time ; heap_insert(&ready_heap, curr_time + t, (void *)q ); /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ } else /* RED needs to know when the queue becomes empty */ q->q_time = curr_time; /* * If the delay line was empty call transmit_event(p) now. * Otherwise, the scheduler will take care of it. */ if (p_was_empty) transmit_event(p); } /* * Called when we can transmit packets on WF2Q queues. Take pkts out of * the queues at their start time, and enqueue into the delay line. * Packets are drained until p->numbytes < 0. As long as * len_scaled >= p->numbytes, the packet goes into the delay line * with a deadline p->delay. For the last packet, if p->numbytes<0, * there is an additional delay. */ static void ready_event_wfq(struct dn_pipe *p) { int p_was_empty = (p->head == NULL) ; struct dn_heap *sch = &(p->scheduler_heap); struct dn_heap *neh = &(p->not_eligible_heap) ; if (p->if_name[0] == 0) /* tx clock is simulated */ p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; else { /* tx clock is for real, the ifq must be empty or this is a NOP */ if (p->ifp && p->ifp->if_snd.ifq_head != NULL) return ; else { DEB(printf("pipe %d ready from %s --\n", p->pipe_nr, p->if_name);) } } /* * While we have backlogged traffic AND credit, we need to do * something on the queue. */ while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { if (sch->elements > 0) { /* have some eligible pkts to send out */ struct dn_flow_queue *q = sch->p[0].object ; struct dn_pkt *pkt = q->head; struct dn_flow_set *fs = q->fs; u_int64_t len = pkt->dn_m->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; heap_extract(sch, NULL); /* remove queue from heap */ p->numbytes -= len_scaled ; move_pkt(pkt, q, p, len); p->V += (len<<MY_M) / p->sum ; /* update V */ q->S = q->F ; /* update start time */ if (q->len == 0) { /* Flow not backlogged any more */ fs->backlogged-- ; heap_insert(&(p->idle_heap), q->F, q); } else { /* still backlogged */ /* * update F and position in backlogged queue, then * put flow in not_eligible_heap (we will fix this later).
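 *
 * (Worked example with illustrative numbers: a flow of weight 2
 * sending a 1000-byte packet advances its finish time F by
 * (1000 << MY_M) / 2, half the advance of a weight-1 flow, so over
 * time it gets twice the weight-1 share of the link - the WF2Q+
 * virtual clock in a nutshell.)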
*/ len = (q->head)->dn_m->m_pkthdr.len; q->F += (len<<MY_M)/(u_int64_t) fs->weight ; if (DN_KEY_LEQ(q->S, p->V)) heap_insert(neh, q->S, q); else heap_insert(sch, q->F, q); } } /* * now compute V = max(V, min(S_i)). Remember that all elements in sch * have by definition S_i <= V so if sch is not empty, V is surely * the max and we must not update it. Conversely, if sch is empty * we only need to look at neh. */ if (sch->elements == 0 && neh->elements > 0) p->V = MAX64 ( p->V, neh->p[0].key ); /* move from neh to sch any packets that have become eligible */ while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { struct dn_flow_queue *q = neh->p[0].object ; heap_extract(neh, NULL); heap_insert(sch, q->F, q); } if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ p->numbytes = -1 ; /* mark not ready for I/O */ break ; } } if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0 && p->idle_heap.elements > 0) { /* * no traffic and no events scheduled. We can get rid of idle-heap. */ int i ; for (i = 0 ; i < p->idle_heap.elements ; i++) { struct dn_flow_queue *q = p->idle_heap.p[i].object ; q->F = 0 ; q->S = q->F + 1 ; } p->sum = 0 ; p->V = 0 ; p->idle_heap.elements = 0 ; } /* * If we are getting clocks from dummynet (not a real interface) and * if we are under credit, schedule the next ready event. * Also fix the delivery time of the last packet. */ if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ dn_key t=0 ; /* number of ticks i have to wait */ if (p->bandwidth > 0) t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; p->tail->output_time += t ; p->sched_time = curr_time ; heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ } /* * If the delay line was empty call transmit_event(p) now. * Otherwise, the scheduler will take care of it. */ if (p_was_empty) transmit_event(p); } /* * This is called once per tick, or HZ times per second. It is used to * increment the current tick counter and schedule expired events. */ static void dummynet(void * __unused unused) { void *p ; /* generic parameter to handler */ struct dn_heap *h ; int s ; struct dn_heap *heaps[3]; int i; struct dn_pipe *pe ; heaps[0] = &ready_heap ; /* fixed-rate queues */ heaps[1] = &wfq_ready_heap ; /* wfq queues */ heaps[2] = &extract_heap ; /* delay line */ s = splimp(); /* see note on top, splnet() is not enough */ curr_time++ ; for (i=0; i < 3 ; i++) { h = heaps[i]; while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { DDB(if (h->p[0].key > curr_time) printf("-- dummynet: warning, heap %d is %d ticks late\n", i, (int)(curr_time - h->p[0].key));) p = h->p[0].object ; /* store a copy before heap_extract */ heap_extract(h, NULL); /* need to extract before processing */ if (i == 0) ready_event(p) ; else if (i == 1) { struct dn_pipe *pipe = p; if (pipe->if_name[0] != '\0') printf("*** bad ready_event_wfq for pipe %s\n", pipe->if_name); else ready_event_wfq(p) ; } else transmit_event(p); } } /* sweep pipes trying to expire idle flow_queues */ for (pe = all_pipes; pe ; pe = pe->next ) if (pe->idle_heap.elements > 0 && DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) { struct dn_flow_queue *q = pe->idle_heap.p[0].object ; heap_extract(&(pe->idle_heap), NULL); q->S = q->F + 1 ; /* mark timestamp as invalid */ pe->sum -= q->fs->weight ; } splx(s); dn_timeout = timeout(dummynet, NULL, 1); } /* * called by an interface when tx_rdy occurs.
*/ int if_tx_rdy(struct ifnet *ifp) { struct dn_pipe *p; for (p = all_pipes; p ; p = p->next ) if (p->ifp == ifp) break ; if (p == NULL) { char buf[32]; sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); for (p = all_pipes; p ; p = p->next ) if (!strcmp(p->if_name, buf) ) { p->ifp = ifp ; DEB(printf("++ tx rdy from %s (now found)\n", buf);) break ; } } if (p != NULL) { DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, ifp->if_unit, ifp->if_snd.ifq_len);) p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p); } return 0; } /* * Unconditionally expire empty queues in case of shortage. * Returns the number of queues freed. */ static int expire_queues(struct dn_flow_set *fs) { struct dn_flow_queue *q, *prev ; int i, initial_elements = fs->rq_elements ; if (fs->last_expired == time_second) return 0 ; fs->last_expired = time_second ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) if (q->head != NULL || q->S != q->F+1) { prev = q ; q = q->next ; } else { /* entry is idle, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else fs->rq[i] = q = q->next ; fs->rq_elements-- ; free(old_q, M_DUMMYNET); } return initial_elements - fs->rq_elements ; } /* * If room, create a new queue and put at head of slot i; * otherwise, create or use the default queue. */ static struct dn_flow_queue * create_queue(struct dn_flow_set *fs, int i) { struct dn_flow_queue *q ; if (fs->rq_elements > fs->rq_size * dn_max_ratio && expire_queues(fs) == 0) { /* * No way to get room, use or create overflow queue. */ i = fs->rq_size ; if ( fs->rq[i] != NULL ) return fs->rq[i] ; } q = malloc(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (q == NULL) { printf("sorry, cannot allocate queue for new flow\n"); return NULL ; } q->fs = fs ; q->hash_slot = i ; q->next = fs->rq[i] ; q->S = q->F + 1; /* hack - mark timestamp as invalid */ fs->rq[i] = q ; fs->rq_elements++ ; return q ; } /* * Given a flow_set and a pkt in last_pkt, find a matching queue * after appropriate masking. The queue is moved to front * so that further searches take less time. 
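 */

/*
 * Worked example (illustrative): with a flow mask whose dst_ip is
 * 0xffffff00 and all other fields zero, every packet towards the same
 * /24 destination prefix is masked to the same ipfw_flow_id, hashes to
 * the same slot, and therefore shares one dn_flow_queue - one queue
 * (and one fair share of the pipe) per destination /24.
 */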
*/ static struct dn_flow_queue * find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) { int i = 0 ; /* we need i and q for new allocations */ struct dn_flow_queue *q, *prev; if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) q = fs->rq[0] ; else { /* first, do the masking */ id->dst_ip &= fs->flow_mask.dst_ip ; id->src_ip &= fs->flow_mask.src_ip ; id->dst_port &= fs->flow_mask.dst_port ; id->src_port &= fs->flow_mask.src_port ; id->proto &= fs->flow_mask.proto ; id->flags = 0 ; /* we don't care about this one */ /* then, hash function */ i = ( (id->dst_ip) & 0xffff ) ^ ( (id->dst_ip >> 15) & 0xffff ) ^ ( (id->src_ip << 1) & 0xffff ) ^ ( (id->src_ip >> 16 ) & 0xffff ) ^ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto ); i = i % fs->rq_size ; /* finally, scan the current list for a match */ searches++ ; for (prev=NULL, q = fs->rq[i] ; q ; ) { search_steps++; if (bcmp(id, &(q->id), sizeof(q->id) ) == 0) break ; /* found */ else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { /* entry is idle and not in any heap, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else fs->rq[i] = q = q->next ; fs->rq_elements-- ; free(old_q, M_DUMMYNET); continue ; } prev = q ; q = q->next ; } if (q && prev != NULL) { /* found and not in front */ prev->next = q->next ; q->next = fs->rq[i] ; fs->rq[i] = q ; } } if (q == NULL) { /* no match, need to allocate a new entry */ q = create_queue(fs, i); if (q != NULL) q->id = *id ; } return q ; } static int red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) { /* * RED algorithm * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. * */ int64_t p_b = 0; /* queue in bytes or packets ? */ u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) /* average queue size estimation */ if (q_size != 0) { /* * queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); q->avg += (int) v; } else { /* * queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = (curr_time - q->q_time) / fs->lookup_step; q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } DEB(printf("avg: %u ", SCALE_VAL(q->avg));) /* should i drop ? 
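 */

/*
 * A stand-alone sketch of the fixed-point EWMA just described,
 * avg <- avg + (q_size - avg) * w_q. The 16-bit shift mirrors the
 * SCALE()/SCALE_MUL() idiom used above, but the shift amount and
 * w_q value are assumptions of this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define SC		16
#define SCALE(x)	((int64_t)(x) << SC)
#define SCALE_MUL(a, b)	(((a) * (b)) >> SC)
#define SCALE_VAL(x)	((x) >> SC)

int
main(void)
{
	int64_t avg = 0;
	int64_t w_q = SCALE(1) / 500;	/* w_q = 0.002 */
	int samples[] = { 10, 12, 8, 0, 4 }, q_size;
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		q_size = samples[i];
		/* low-pass filter, all arithmetic in fixed point */
		avg += SCALE_MUL(SCALE(q_size) - avg, w_q);
		printf("q=%2d avg=%lld.%04lld\n", q_size,
		    (long long)SCALE_VAL(avg),
		    (long long)(((avg & (SCALE(1) - 1)) * 10000) >> SC));
	}
	return 0;
}

/*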
*/ if (q->avg < fs->min_th) { q->count = -1; return 0; /* accept packet ; */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->flags_fs & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than max_th the * packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; } else { q->count = -1; printf("- drop"); return 1 ; } } else if (q->avg > fs->min_th) { /* * we compute p_b using the linear dropping function p_b = c_1 * * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = * max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; } if (fs->flags_fs & DN_QSIZE_IS_BYTES) p_b = (p_b * len) / fs->max_pkt_size; if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { q->count = 0; DEB(printf("- red drop");) /* after a drop we calculate a new random value */ q->random = random() & 0xffff; return 1; /* drop */ } } /* end of RED algorithm */ return 0 ; /* accept */ } static __inline struct dn_flow_set * locate_flowset(int pipe_nr, struct ip_fw *rule) { #if IPFW2 struct dn_flow_set *fs; ipfw_insn *cmd = rule->cmd + rule->act_ofs; if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); fs = ((ipfw_insn_pipe *)cmd)->pipe_ptr; if (fs != NULL) return fs; if (cmd->opcode == O_QUEUE) #else /* !IPFW2 */ struct dn_flow_set *fs = NULL ; if ( (rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) #endif /* !IPFW2 */ for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) ; else { struct dn_pipe *p1; for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) ; if (p1 != NULL) fs = &(p1->fs) ; } /* record for the future */ #if IPFW2 ((ipfw_insn_pipe *)cmd)->pipe_ptr = fs; #else if (fs != NULL) rule->pipe_ptr = fs; #endif return fs ; } /* * dummynet hook for packets. Below 'pipe' is a pipe or a queue * depending on whether WF2Q or fixed bw is used. * * pipe_nr pipe or queue the packet is destined for. * dir where shall we send the packet after dummynet. * m the mbuf with the packet * ifp the 'ifp' parameter from the caller. * NULL in ip_input, destination interface in ip_output, * real_dst in bdg_forward * ro route parameter (only used in ip_output, NULL otherwise) * dst destination address, only used by ip_output * rule matching rule, in case of multiple passes * flags flags from the caller, only used in ip_output * */ static int dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) { struct dn_pkt *pkt; struct dn_flow_set *fs; struct dn_pipe *pipe ; u_int64_t len = m->m_pkthdr.len ; struct dn_flow_queue *q = NULL ; int s = splimp(); int is_pipe; #if IPFW2 ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs; if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); is_pipe = (cmd->opcode == O_PIPE); #else is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE; #endif pipe_nr &= 0xffff ; /* * this is a dummynet rule, so we expect a O_PIPE or O_QUEUE rule */ fs = locate_flowset(pipe_nr, fwa->rule); if (fs == NULL) goto dropit ; /* this queue/pipe does not exist! 
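 */

/*
 * A worked, stand-alone sketch of the linear drop probability
 * p_b = c_1*avg - c_2 and the q->count de-randomization used by
 * red_drops(). The RED profile (min_th=5, max_th=15 packets,
 * max_p=0.1) and the 16-bit scale are assumed values for the sketch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define SC		16
#define SCALE(x)	((int64_t)(x) << SC)
#define SCALE_MUL(a, b)	(((a) * (b)) >> SC)

int
main(void)
{
	int64_t min_th = SCALE(5);
	int64_t max_p = SCALE(1) / 10;
	/* c_1 = max_p/(max_th-min_th), c_2 = max_p*min_th/(max_th-min_th) */
	int64_t c_1 = max_p / (15 - 5), c_2 = SCALE_MUL(c_1, min_th);
	int64_t avg, p_b;
	int count = 0;

	srandom(42);
	for (avg = min_th; avg <= SCALE(15); avg += SCALE(2)) {
		p_b = SCALE_MUL(c_1, avg) - c_2;
		count++;	/* packets since last drop */
		/* drop roughly once every 1/p_b arrivals */
		if (SCALE_MUL(p_b, SCALE((int64_t)count)) >
		    (random() & 0xffff)) {
			printf("avg=%2lld: drop (count was %d)\n",
			    (long long)(avg >> SC), count);
			count = 0;
		} else
			printf("avg=%2lld: accept\n", (long long)(avg >> SC));
	}
	return 0;
}

/*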
 */
    pipe = fs->pipe ;
    if (pipe == NULL) { /* must be a queue, try find a matching pipe */
	for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
		pipe = pipe->next)
	    ;
	if (pipe != NULL)
	    fs->pipe = pipe ;
	else {
	    printf("No pipe %d for queue %d, drop pkt\n",
		fs->parent_nr, fs->fs_nr);
	    goto dropit ;
	}
    }
    q = find_queue(fs, &(fwa->f_id));
    if ( q == NULL )
	goto dropit ;		/* cannot allocate queue */
    /*
     * update statistics, then check reasons to drop pkt
     */
    q->tot_bytes += len ;
    q->tot_pkts++ ;
    if ( fs->plr && random() < fs->plr )
	goto dropit ;		/* random pkt drop */
    if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
	if (q->len_bytes > fs->qsize)
	    goto dropit ;	/* queue size overflow */
    } else {
	if (q->len >= fs->qsize)
	    goto dropit ;	/* queue count overflow */
    }
    if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
	goto dropit ;

    /* XXX expensive to zero, see if we can remove it */
    pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_DUMMYNET, M_NOWAIT|M_ZERO);
    if ( pkt == NULL )
	goto dropit ;		/* cannot allocate packet header */
    /* ok, i can handle the pkt now... */
    /* build and enqueue packet + parameters */
    pkt->hdr.mh_type = MT_TAG;
    pkt->hdr.mh_flags = PACKET_TAG_DUMMYNET;
    pkt->rule = fwa->rule ;
    DN_NEXT(pkt) = NULL;
    pkt->dn_m = m;
    pkt->dn_dir = dir ;

    pkt->ifp = fwa->oif;
    if (dir == DN_TO_IP_OUT) {
	/*
	 * We need to copy *ro because for ICMP pkts (and maybe others)
	 * the caller passed a pointer into the stack; dst might also be
	 * a pointer into *ro so it needs to be updated.
	 */
	pkt->ro = *(fwa->ro);
	if (fwa->ro->ro_rt)
	    fwa->ro->ro_rt->rt_refcnt++ ;
	if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst)
	    /* dst points into ro */
	    fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;

	pkt->dn_dst = fwa->dst;
	pkt->flags = fwa->flags;
    }
    if (q->head == NULL)
	q->head = pkt;
    else
	DN_NEXT(q->tail) = pkt;
    q->tail = pkt;
    q->len++;
    q->len_bytes += len ;

    if ( q->head != pkt )	/* flow was not idle, we are done */
	goto done;
    /*
     * If we reach this point the flow was previously idle, so we need
     * to schedule it. This involves different actions for fixed-rate or
     * WF2Q queues.
     */
    if (is_pipe) {
	/*
	 * Fixed-rate queue: just insert into the ready_heap.
	 */
	dn_key t = 0 ;

	if (pipe->bandwidth)
	    t = SET_TICKS(pkt, q, pipe);
	q->sched_time = curr_time ;
	if (t == 0)	/* must process it now */
	    ready_event( q );
	else
	    heap_insert(&ready_heap, curr_time + t , q );
    } else {
	/*
	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
	 * set S to the virtual time V for the controlling pipe, and update
	 * the sum of weights for the pipe; otherwise, remove flow from
	 * idle_heap and set S to max(F,V).
	 * Second, compute finish time F = S + len/weight.
	 * Third, if pipe was idle, update V=max(S, V).
	 * Fourth, count one more backlogged flow.
	 */
	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
	    q->S = pipe->V ;
	    pipe->sum += fs->weight ; /* add weight of new queue */
	} else {
	    heap_extract(&(pipe->idle_heap), q);
	    q->S = MAX64(q->F, pipe->V ) ;
	}
	q->F = q->S + ( len << MY_M ) / (u_int64_t) fs->weight;

	if (pipe->not_eligible_heap.elements == 0 &&
		pipe->scheduler_heap.elements == 0)
	    pipe->V = MAX64 ( q->S, pipe->V );
	fs->backlogged++ ;
	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the scheduler_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
	 * and possibly invoke ready_event_wfq() right now if there is
	 * leftover credit.
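 */

/*
 * A worked, stand-alone example of the WF2Q timestamp update just
 * described: S from the pipe's virtual time (idle flow) or from the
 * previous finish time (backlogged flow), then F = S + len/weight in
 * fixed point. The MY_M value is an assumption of this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define MY_M	16	/* fixed-point shift, assumed here */
#define MAX64(x, y)	(((int64_t)((y) - (x))) > 0 ? (y) : (x))

int
main(void)
{
	uint64_t V = 1000;	/* pipe virtual time */
	uint64_t S, F = 0;	/* per-flow start/finish timestamps */
	uint64_t len = 1500;	/* packet length, bytes */
	int weight = 10;

	/* flow was idle: start at the pipe's virtual time */
	S = V;
	F = S + (len << MY_M) / (uint64_t)weight;
	printf("idle flow:  S=%llu F=%llu\n",
	    (unsigned long long)S, (unsigned long long)F);

	/* flow already backlogged: start when the previous pkt finished */
	S = MAX64(F, V);
	F = S + (len << MY_M) / (uint64_t)weight;
	printf("backlogged: S=%llu F=%llu\n",
	    (unsigned long long)S, (unsigned long long)F);
	return 0;
}

/*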
* Note that for all flows in scheduler_heap (SCH), S_i <= V, * and for all flows in not_eligible_heap (NEH), S_i > V . * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, * we only need to look into NEH. */ if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ if (pipe->scheduler_heap.elements == 0) printf("++ ouch! not eligible but empty scheduler!\n"); heap_insert(&(pipe->not_eligible_heap), q->S, q); } else { heap_insert(&(pipe->scheduler_heap), q->F, q); if (pipe->numbytes >= 0) { /* pipe is idle */ if (pipe->scheduler_heap.elements != 1) printf("*** OUCH! pipe should have been idle!\n"); DEB(printf("Waking up pipe %d at %d\n", pipe->pipe_nr, (int)(q->F >> MY_M)); ) pipe->sched_time = curr_time ; ready_event_wfq(pipe); } } } done: splx(s); return 0; dropit: splx(s); if (q) q->drops++ ; m_freem(m); return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS); } /* * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) * Doing this would probably save us the initial bzero of dn_pkt */ #define DN_FREE_PKT(pkt) { \ struct dn_pkt *n = pkt ; \ rt_unref ( n->ro.ro_rt ) ; \ m_freem(n->dn_m); \ pkt = DN_NEXT(n) ; \ free(n, M_DUMMYNET) ; } /* * Dispose all packets and flow_queues on a flow_set. * If all=1, also remove red lookup table and other storage, * including the descriptor itself. * For the one in dn_pipe MUST also cleanup ready_heap... */ static void purge_flow_set(struct dn_flow_set *fs, int all) { struct dn_pkt *pkt ; struct dn_flow_queue *q, *qn ; int i ; for (i = 0 ; i <= fs->rq_size ; i++ ) { for (q = fs->rq[i] ; q ; q = qn ) { for (pkt = q->head ; pkt ; ) DN_FREE_PKT(pkt) ; qn = q->next ; free(q, M_DUMMYNET); } fs->rq[i] = NULL ; } fs->rq_elements = 0 ; if (all) { /* RED - free lookup table */ if (fs->w_q_lookup) free(fs->w_q_lookup, M_DUMMYNET); if (fs->rq) free(fs->rq, M_DUMMYNET); /* if this fs is not part of a pipe, free it */ if (fs->pipe && fs != &(fs->pipe->fs) ) free(fs, M_DUMMYNET); } } /* * Dispose all packets queued on a pipe (not a flow_set). * Also free all resources associated to a pipe, which is about * to be deleted. */ static void purge_pipe(struct dn_pipe *pipe) { struct dn_pkt *pkt ; purge_flow_set( &(pipe->fs), 1 ); for (pkt = pipe->head ; pkt ; ) DN_FREE_PKT(pkt) ; heap_free( &(pipe->scheduler_heap) ); heap_free( &(pipe->not_eligible_heap) ); heap_free( &(pipe->idle_heap) ); } /* * Delete all pipes and heaps returning memory. Must also * remove references from all ipfw rules to all pipes. */ static void dummynet_flush() { struct dn_pipe *curr_p, *p ; struct dn_flow_set *fs, *curr_fs; int s ; s = splimp() ; /* remove all references to pipes ...*/ flush_pipe_ptrs(NULL); /* prevent future matches... */ p = all_pipes ; all_pipes = NULL ; fs = all_flow_sets ; all_flow_sets = NULL ; /* and free heaps so we don't have unwanted events */ heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); splx(s) ; /* * Now purge all queued pkts and delete all pipes */ /* scan and purge all flow_sets. 
*/ for ( ; fs ; ) { curr_fs = fs ; fs = fs->next ; purge_flow_set(curr_fs, 1); } for ( ; p ; ) { purge_pipe(p); curr_p = p ; p = p->next ; free(curr_p, M_DUMMYNET); } } extern struct ip_fw *ip_fw_default_rule ; static void dn_rule_delete_fs(struct dn_flow_set *fs, void *r) { int i ; struct dn_flow_queue *q ; struct dn_pkt *pkt ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ for (q = fs->rq[i] ; q ; q = q->next ) for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } /* * when a firewall rule is deleted, scan all queues and remove the flow-id * from packets matching this rule. */ void dn_rule_delete(void *r) { struct dn_pipe *p ; struct dn_pkt *pkt ; struct dn_flow_set *fs ; /* * If the rule references a queue (dn_flow_set), then scan * the flow set, otherwise scan pipes. Should do either, but doing * both does not harm. */ for ( fs = all_flow_sets ; fs ; fs = fs->next ) dn_rule_delete_fs(fs, r); for ( p = all_pipes ; p ; p = p->next ) { fs = &(p->fs) ; dn_rule_delete_fs(fs, r); for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } } /* * setup RED parameters */ static int config_red(struct dn_flow_set *p, struct dn_flow_set * x) { int i; x->w_q = p->w_q; x->min_th = SCALE(p->min_th); x->max_th = SCALE(p->max_th); x->max_p = p->max_p; x->c_1 = p->max_p / (p->max_th - p->min_th); x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); if (x->flags_fs & DN_IS_GENTLE_RED) { x->c_3 = (SCALE(1) - p->max_p) / p->max_th; x->c_4 = (SCALE(1) - 2 * p->max_p); } /* if the lookup table already exist, free and create it again */ if (x->w_q_lookup) { free(x->w_q_lookup, M_DUMMYNET); x->w_q_lookup = NULL ; } if (red_lookup_depth == 0) { printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); free(x, M_DUMMYNET); return EINVAL; } x->lookup_depth = red_lookup_depth; x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int), M_DUMMYNET, M_DONTWAIT); if (x->w_q_lookup == NULL) { printf("sorry, cannot allocate red lookup table\n"); free(x, M_DUMMYNET); return ENOSPC; } /* fill the lookup table with (1 - w_q)^x */ x->lookup_step = p->lookup_step ; x->lookup_weight = p->lookup_weight ; x->w_q_lookup[0] = SCALE(1) - x->w_q; for (i = 1; i < x->lookup_depth; i++) x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); if (red_avg_pkt_size < 1) red_avg_pkt_size = 512 ; x->avg_pkt_size = red_avg_pkt_size ; if (red_max_pkt_size < 1) red_max_pkt_size = 1500 ; x->max_pkt_size = red_max_pkt_size ; return 0 ; } static int alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) { if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ int l = pfs->rq_size; if (l == 0) l = dn_hash_size; if (l < 4) l = 4; else if (l > DN_MAX_HASH_SIZE) l = DN_MAX_HASH_SIZE; x->rq_size = l; } else /* one is enough for null mask */ x->rq_size = 1; x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (x->rq == NULL) { printf("sorry, cannot allocate queue\n"); return ENOSPC; } x->rq_elements = 0; return 0 ; } static void set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) { x->flags_fs = src->flags_fs; x->qsize = src->qsize; x->plr = src->plr; x->flow_mask = src->flow_mask; if (x->flags_fs & DN_QSIZE_IS_BYTES) { if (x->qsize > 1024*1024) x->qsize = 1024*1024 ; } else { if (x->qsize == 0) x->qsize = 50 ; if (x->qsize > 100) x->qsize = 50 ; } /* configuring RED */ if ( x->flags_fs & DN_IS_RED ) config_red(src, x) ; /* XXX should check errors */ } /* * 
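 */

/*
 * A stand-alone sketch of the idle-decay table config_red() above
 * precomputes: tab[t] ~ (1 - w_q)^(t+1), built by repeated fixed-point
 * multiplication. For simplicity this sketch assumes lookup_step = 1,
 * so the per-entry weight is just (1 - w_q); the w_q value and table
 * depth are also assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define SC		16
#define SCALE(x)	((int64_t)(x) << SC)
#define SCALE_MUL(a, b)	(((a) * (b)) >> SC)

int
main(void)
{
	int64_t w_q = SCALE(1) / 500;	/* w_q = 0.002 */
	int64_t tab[8];
	int i;

	tab[0] = SCALE(1) - w_q;	/* (1 - w_q) */
	for (i = 1; i < 8; i++)		/* same recurrence as w_q_lookup[] */
		tab[i] = SCALE_MUL(tab[i - 1], tab[0]);
	for (i = 0; i < 8; i++)
		printf("tab[%d] = %.6f\n", i, (double)tab[i] / SCALE(1));
	return 0;
}

/*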
setup pipe or queue parameters. */ static int config_pipe(struct dn_pipe *p) { int s ; struct dn_flow_set *pfs = &(p->fs); /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes */ p->delay = ( p->delay * hz ) / 1000 ; /* We need either a pipe number or a flow_set number */ if (p->pipe_nr == 0 && pfs->fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && pfs->fs_nr != 0) return EINVAL ; if (p->pipe_nr != 0) { /* this is a pipe */ struct dn_pipe *x, *a, *b; /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ x = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (x == NULL) { printf("ip_dummynet.c: no memory for new pipe\n"); return ENOSPC; } x->pipe_nr = p->pipe_nr; x->fs.pipe = x ; /* idle_heap is the only one from which we extract from the middle. */ x->idle_heap.size = x->idle_heap.elements = 0 ; x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); } else x = b; x->bandwidth = p->bandwidth ; x->numbytes = 0; /* just in case... */ bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); x->ifp = NULL ; /* reset interface ptr */ x->delay = p->delay ; set_fs_parms(&(x->fs), pfs); if ( x->fs.rq == NULL ) { /* a new pipe */ s = alloc_hash(&(x->fs), pfs) ; if (s) { free(x, M_DUMMYNET); return s ; } s = splimp() ; x->next = b ; if (a == NULL) all_pipes = x ; else a->next = x ; splx(s); } } else { /* config queue */ struct dn_flow_set *x, *a, *b ; /* locate flow_set */ for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; a = b , b = b->next) ; if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ if (pfs->parent_nr == 0) /* need link to a pipe */ return EINVAL ; x = malloc(sizeof(struct dn_flow_set),M_DUMMYNET,M_DONTWAIT|M_ZERO); if (x == NULL) { printf("ip_dummynet.c: no memory for new flow_set\n"); return ENOSPC; } x->fs_nr = pfs->fs_nr; x->parent_nr = pfs->parent_nr; x->weight = pfs->weight ; if (x->weight == 0) x->weight = 1 ; else if (x->weight > 100) x->weight = 100 ; } else { /* Change parent pipe not allowed; must delete and recreate */ if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) return EINVAL ; x = b; } set_fs_parms(x, pfs); if ( x->rq == NULL ) { /* a new flow_set */ s = alloc_hash(x, pfs) ; if (s) { free(x, M_DUMMYNET); return s ; } s = splimp() ; x->next = b; if (a == NULL) all_flow_sets = x; else a->next = x; splx(s); } } return 0 ; } /* * Helper function to remove from a heap queues which are linked to * a flow_set about to be deleted. */ static void fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) { int i = 0, found = 0 ; for (; i < h->elements ;) if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { h->elements-- ; h->p[i] = h->p[h->elements] ; found++ ; } else i++ ; if (found) heapify(h); } /* * helper function to remove a pipe from a heap (can be there at most once) */ static void pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) { if (h->elements > 0) { int i = 0 ; for (i=0; i < h->elements ; i++ ) { if (h->p[i].object == p) { /* found it */ h->elements-- ; h->p[i] = h->p[h->elements] ; heapify(h); break ; } } } } /* * drain all queues. Called in case of severe mbuf shortage. 
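 */

/*
 * A tiny stand-alone illustration of the delay conversion config_pipe()
 * above performs, delay_ticks = delay_ms * hz / 1000. Note the integer
 * truncation: at an assumed hz of 100, anything below one tick (10 ms)
 * becomes zero.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 100;	/* assumed kernel tick rate */
	int delay_ms[] = { 5, 10, 25, 1000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%4d ms -> %3d ticks\n", delay_ms[i],
		    delay_ms[i] * hz / 1000);
	return 0;
}

/*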
*/ void dummynet_drain() { struct dn_flow_set *fs; struct dn_pipe *p; struct dn_pkt *pkt; heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) purge_flow_set(fs, 0); for (p = all_pipes; p; p= p->next ) { purge_flow_set(&(p->fs), 0); for (pkt = p->head ; pkt ; ) DN_FREE_PKT(pkt) ; p->head = p->tail = NULL ; } } /* * Fully delete a pipe or a queue, cleaning up associated info. */ static int delete_pipe(struct dn_pipe *p) { int s ; if (p->pipe_nr == 0 && p->fs.fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && p->fs.fs_nr != 0) return EINVAL ; if (p->pipe_nr != 0) { /* this is an old-style pipe */ struct dn_pipe *a, *b; struct dn_flow_set *fs; /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || (b->pipe_nr != p->pipe_nr) ) return EINVAL ; /* not found */ s = splimp() ; /* unlink from list of pipes */ if (a == NULL) all_pipes = b->next ; else a->next = b->next ; /* remove references to this pipe from the ip_fw rules. */ flush_pipe_ptrs(&(b->fs)); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) if (fs->pipe == b) { printf("++ ref to pipe %d from fs %d\n", p->pipe_nr, fs->fs_nr); fs->pipe = NULL ; purge_flow_set(fs, 0); } fs_remove_from_heap(&ready_heap, &(b->fs)); purge_pipe(b); /* remove all data associated to this pipe */ /* remove reference to here from extract_heap and wfq_ready_heap */ pipe_remove_from_heap(&extract_heap, b); pipe_remove_from_heap(&wfq_ready_heap, b); splx(s); free(b, M_DUMMYNET); } else { /* this is a WF2Q queue (dn_flow_set) */ struct dn_flow_set *a, *b; /* locate set */ for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; a = b , b = b->next) ; if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) return EINVAL ; /* not found */ s = splimp() ; if (a == NULL) all_flow_sets = b->next ; else a->next = b->next ; /* remove references to this flow_set from the ip_fw rules. */ flush_pipe_ptrs(b); if (b->pipe != NULL) { /* Update total weight on parent pipe and cleanup parent heaps */ b->pipe->sum -= b->weight * b->backlogged ; fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); fs_remove_from_heap(&(b->pipe->scheduler_heap), b); #if 1 /* XXX should i remove from idle_heap as well ? */ fs_remove_from_heap(&(b->pipe->idle_heap), b); #endif } purge_flow_set(b, 1); splx(s); } return 0 ; } /* * helper function used to copy data from kernel in DUMMYNET_GET */ static char * dn_copy_set(struct dn_flow_set *set, char *bp) { int i, copied = 0 ; struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; for (i = 0 ; i <= set->rq_size ; i++) for (q = set->rq[i] ; q ; q = q->next, qp++ ) { if (q->hash_slot != i) printf("++ at %d: wrong slot (have %d, " "should be %d)\n", copied, q->hash_slot, i); if (q->fs != set) printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", i, q->fs, set); copied++ ; bcopy(q, qp, sizeof( *q ) ); /* cleanup pointers */ qp->next = NULL ; qp->head = qp->tail = NULL ; qp->fs = NULL ; } if (copied != set->rq_elements) printf("++ wrong count, have %d should be %d\n", copied, set->rq_elements); return (char *)qp ; } static int dummynet_get(struct sockopt *sopt) { char *buf, *bp ; /* bp is the "copy-pointer" */ size_t size ; struct dn_flow_set *set ; struct dn_pipe *p ; int s, error=0 ; s = splimp(); /* * compute size of data structures: list of pipes and flow_sets. 
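 */

/*
 * A stand-alone sketch of the two-pass "measure, then flatten" pattern
 * dummynet_get() uses: one walk to add up the space, one walk to copy
 * every element into a single buffer with its pointers cleaned. The
 * struct rec type is illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rec { struct rec *next; char name[8]; };

int
main(void)
{
	struct rec c = { NULL, "gamma" }, b = { &c, "beta" }, a = { &b, "alpha" };
	struct rec *p;
	char *buf, *bp;
	size_t size = 0;

	/* pass 1: compute the total size needed */
	for (p = &a; p != NULL; p = p->next)
		size += sizeof(*p);
	if ((buf = malloc(size)) == NULL)
		return 1;
	/* pass 2: flatten each element, clearing kernel-only pointers */
	for (p = &a, bp = buf; p != NULL; p = p->next) {
		memcpy(bp, p, sizeof(*p));
		((struct rec *)bp)->next = NULL;
		bp += sizeof(*p);
	}
	printf("copied %zu bytes\n", (size_t)(bp - buf));
	free(buf);
	return 0;
}

/*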
*/ for (p = all_pipes, size = 0 ; p ; p = p->next ) size += sizeof( *p ) + p->fs.rq_elements * sizeof(struct dn_flow_queue); for (set = all_flow_sets ; set ; set = set->next ) size += sizeof ( *set ) + set->rq_elements * sizeof(struct dn_flow_queue); buf = malloc(size, M_TEMP, M_DONTWAIT); if (buf == 0) { splx(s); return ENOBUFS ; } for (p = all_pipes, bp = buf ; p ; p = p->next ) { struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; /* * copy pipe descriptor into *bp, convert delay back to ms, * then copy the flow_set descriptor(s) one at a time. * After each flow_set, copy the queue descriptor it owns. */ bcopy(p, bp, sizeof( *p ) ); pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; /* * XXX the following is a hack based on ->next being the * first field in dn_pipe and dn_flow_set. The correct * solution would be to move the dn_flow_set to the beginning * of struct dn_pipe. */ pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ; /* clean pointers */ pipe_bp->head = pipe_bp->tail = NULL ; pipe_bp->fs.next = NULL ; pipe_bp->fs.pipe = NULL ; pipe_bp->fs.rq = NULL ; bp += sizeof( *p ) ; bp = dn_copy_set( &(p->fs), bp ); } for (set = all_flow_sets ; set ; set = set->next ) { struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; bcopy(set, bp, sizeof( *set ) ); /* XXX same hack as above */ fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ; fs_bp->pipe = NULL ; fs_bp->rq = NULL ; bp += sizeof( *set ) ; bp = dn_copy_set( set, bp ); } splx(s); error = sooptcopyout(sopt, buf, size); free(buf, M_TEMP); return error ; } /* * Handler for the various dummynet socket options (get, flush, config, del) */ static int ip_dn_ctl(struct sockopt *sopt) { int error = 0 ; struct dn_pipe *p, tmp_pipe; /* Disallow sets in really-really secure mode. */ if (sopt->sopt_dir == SOPT_SET) { #if __FreeBSD_version >= 500034 error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); #else if (securelevel >= 3) return (EPERM); #endif } switch (sopt->sopt_name) { default : printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); return EINVAL ; case IP_DUMMYNET_GET : error = dummynet_get(sopt); break ; case IP_DUMMYNET_FLUSH : dummynet_flush() ; break ; case IP_DUMMYNET_CONFIGURE : p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if (error) break ; error = config_pipe(p); break ; case IP_DUMMYNET_DEL : /* remove a pipe or queue */ p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if (error) break ; error = delete_pipe(p); break ; } return error ; } static void ip_dn_init(void) { printf("DUMMYNET initialized (011031)\n"); all_pipes = NULL ; all_flow_sets = NULL ; ready_heap.size = ready_heap.elements = 0 ; ready_heap.offset = 0 ; wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; wfq_ready_heap.offset = 0 ; extract_heap.size = extract_heap.elements = 0 ; extract_heap.offset = 0 ; ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; ip_dn_ruledel_ptr = dn_rule_delete; bzero(&dn_timeout, sizeof(struct callout_handle)); dn_timeout = timeout(dummynet, NULL, 1); } static int dummynet_modevent(module_t mod, int type, void *data) { int s; switch (type) { case MOD_LOAD: s = splimp(); if (DUMMYNET_LOADED) { splx(s); printf("DUMMYNET already loaded\n"); return EEXIST ; } ip_dn_init(); splx(s); break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else s = splimp(); untimeout(dummynet, NULL, dn_timeout); dummynet_flush(); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; ip_dn_ruledel_ptr = NULL; splx(s); #endif break ; 
default: break ; } return 0 ; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL }; DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(dummynet, ipfw, 1, 1, 1); MODULE_VERSION(dummynet, 1); Index: head/sys/netinet/ip_encap.c =================================================================== --- head/sys/netinet/ip_encap.c (revision 105193) +++ head/sys/netinet/ip_encap.c (revision 105194) @@ -1,522 +1,510 @@ /* $FreeBSD$ */ /* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * My grandfather said that there's a devil inside tunnelling technology... * * We have surprisingly many protocols that want packets with IP protocol * #4 or #41. Here's a list of protocols that want protocol #41: * RFC1933 configured tunnel * RFC1933 automatic tunnel * RFC2401 IPsec tunnel * RFC2473 IPv6 generic packet tunnelling * RFC2529 6over4 tunnel * mobile-ip6 (uses RFC2473) * RFC3056 6to4 tunnel * isatap tunnel * Here's a list of protocol that want protocol #4: * RFC1853 IPv4-in-IPv4 tunnelling * RFC2003 IPv4 encapsulation within IPv4 * RFC2344 reverse tunnelling for mobile-ip4 * RFC2401 IPsec tunnel * Well, what can I say. They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has * AH/ESP/IPComp header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. */ /* XXX is M_NETADDR correct? 
*/ #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void encap_add(struct encaptab *); static int mask_match(const struct encaptab *, const struct sockaddr *, const struct sockaddr *); static void encap_fillarg(struct mbuf *, const struct encaptab *); #ifndef LIST_HEAD_INITIALIZER /* rely upon BSS initialization */ LIST_HEAD(, encaptab) encaptab; #else LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); #endif void encap_init() { static int initialized = 0; if (initialized) return; initialized++; #if 0 /* * we cannot use LIST_INIT() here, since drivers may want to call * encap_attach(), on driver attach. encap_init() will be called * on AF_INET{,6} initialization, which happens after driver * initialization - using LIST_INIT() here can nuke encap_attach() * from drivers. */ LIST_INIT(&encaptab); #endif } #ifdef INET void encap4_input(m, off) struct mbuf *m; int off; { struct ip *ip; int proto; struct sockaddr_in s, d; const struct protosw *psw; struct encaptab *ep, *match; int prio, matchprio; ip = mtod(m, struct ip *); proto = ip->ip_p; bzero(&s, sizeof(s)); s.sin_family = AF_INET; s.sin_len = sizeof(struct sockaddr_in); s.sin_addr = ip->ip_src; bzero(&d, sizeof(d)); d.sin_family = AF_INET; d.sin_len = sizeof(struct sockaddr_in); d.sin_addr = ip->ip_dst; match = NULL; matchprio = 0; LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET) continue; if (ep->proto >= 0 && ep->proto != proto) continue; if (ep->func) prio = (*ep->func)(m, off, proto, ep->arg); else { /* * it's inbound traffic, we need to match in reverse * order */ prio = mask_match(ep, (struct sockaddr *)&d, (struct sockaddr *)&s); } /* * We prioritize the matches by using bit length of the * matches. mask_match() and user-supplied matching function * should return the bit length of the matches (for example, * if both src/dst are matched for IPv4, 64 should be returned). * 0 or negative return value means "it did not match". * * The question is, since we have two "mask" portion, we * cannot really define total order between entries. * For example, which of these should be preferred? * mask_match() returns 48 (32 + 16) for both of them. * src=3ffe::/16, dst=3ffe:501::/32 * src=3ffe:501::/32, dst=3ffe::/16 * * We need to loop through all the possible candidates * to get the best match - the search takes O(n) for * n attachments (i.e. interfaces). 
*/ if (prio <= 0) continue; if (prio > matchprio) { matchprio = prio; match = ep; } } if (match) { /* found a match, "match" has the best one */ psw = match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); (*psw->pr_input)(m, off); } else m_freem(m); return; } /* last resort: inject to raw socket */ rip_input(m, off); } #endif #ifdef INET6 int encap6_input(mp, offp, proto) struct mbuf **mp; int *offp; int proto; { struct mbuf *m = *mp; struct ip6_hdr *ip6; struct sockaddr_in6 s, d; const struct ip6protosw *psw; struct encaptab *ep, *match; int prio, matchprio; ip6 = mtod(m, struct ip6_hdr *); bzero(&s, sizeof(s)); s.sin6_family = AF_INET6; s.sin6_len = sizeof(struct sockaddr_in6); s.sin6_addr = ip6->ip6_src; bzero(&d, sizeof(d)); d.sin6_family = AF_INET6; d.sin6_len = sizeof(struct sockaddr_in6); d.sin6_addr = ip6->ip6_dst; match = NULL; matchprio = 0; LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != proto) continue; if (ep->func) prio = (*ep->func)(m, *offp, proto, ep->arg); else { /* * it's inbound traffic, we need to match in reverse * order */ prio = mask_match(ep, (struct sockaddr *)&d, (struct sockaddr *)&s); } /* see encap4_input() for issues here */ if (prio <= 0) continue; if (prio > matchprio) { matchprio = prio; match = ep; } } if (match) { /* found a match */ psw = (const struct ip6protosw *)match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); return (*psw->pr_input)(mp, offp, proto); } else { m_freem(m); return IPPROTO_DONE; } } /* last resort: inject to raw socket */ return rip6_input(mp, offp, proto); } #endif static void encap_add(ep) struct encaptab *ep; { LIST_INSERT_HEAD(&encaptab, ep, chain); } /* * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. * length of mask (sm and dm) is assumed to be same as sp/dp. * Return value will be necessary as input (cookie) for encap_detach(). 
*/ const struct encaptab * encap_attach(af, proto, sp, sm, dp, dm, psw, arg) int af; int proto; const struct sockaddr *sp, *sm; const struct sockaddr *dp, *dm; const struct protosw *psw; void *arg; { struct encaptab *ep; int error; int s; s = splnet(); /* sanity check on args */ if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) { error = EINVAL; goto fail; } if (sp->sa_len != dp->sa_len) { error = EINVAL; goto fail; } if (af != sp->sa_family || af != dp->sa_family) { error = EINVAL; goto fail; } /* check if anyone have already attached with exactly same config */ LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != af) continue; if (ep->proto != proto) continue; if (ep->src.ss_len != sp->sa_len || bcmp(&ep->src, sp, sp->sa_len) != 0 || bcmp(&ep->srcmask, sm, sp->sa_len) != 0) continue; if (ep->dst.ss_len != dp->sa_len || bcmp(&ep->dst, dp, dp->sa_len) != 0 || bcmp(&ep->dstmask, dm, dp->sa_len) != 0) continue; error = EEXIST; goto fail; } ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ if (ep == NULL) { error = ENOBUFS; goto fail; } bzero(ep, sizeof(*ep)); ep->af = af; ep->proto = proto; bcopy(sp, &ep->src, sp->sa_len); bcopy(sm, &ep->srcmask, sp->sa_len); bcopy(dp, &ep->dst, dp->sa_len); bcopy(dm, &ep->dstmask, dp->sa_len); ep->psw = psw; ep->arg = arg; encap_add(ep); error = 0; splx(s); return ep; fail: splx(s); return NULL; } const struct encaptab * encap_attach_func(af, proto, func, psw, arg) int af; int proto; int (*func)(const struct mbuf *, int, int, void *); const struct protosw *psw; void *arg; { struct encaptab *ep; int error; int s; s = splnet(); /* sanity check on args */ if (!func) { error = EINVAL; goto fail; } ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ if (ep == NULL) { error = ENOBUFS; goto fail; } bzero(ep, sizeof(*ep)); ep->af = af; ep->proto = proto; ep->func = func; ep->psw = psw; ep->arg = arg; encap_add(ep); error = 0; splx(s); return ep; fail: splx(s); return NULL; } int encap_detach(cookie) const struct encaptab *cookie; { const struct encaptab *ep = cookie; struct encaptab *p; LIST_FOREACH(p, &encaptab, chain) { if (p == ep) { LIST_REMOVE(p, chain); free(p, M_NETADDR); /*XXX*/ return 0; } } return EINVAL; } static int mask_match(ep, sp, dp) const struct encaptab *ep; const struct sockaddr *sp; const struct sockaddr *dp; { struct sockaddr_storage s; struct sockaddr_storage d; int i; const u_int8_t *p, *q; u_int8_t *r; int matchlen; if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) return 0; if (sp->sa_family != ep->af || dp->sa_family != ep->af) return 0; if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) return 0; matchlen = 0; p = (const u_int8_t *)sp; q = (const u_int8_t *)&ep->srcmask; r = (u_int8_t *)&s; for (i = 0 ; i < sp->sa_len; i++) { r[i] = p[i] & q[i]; /* XXX estimate */ matchlen += (q[i] ? 8 : 0); } p = (const u_int8_t *)dp; q = (const u_int8_t *)&ep->dstmask; r = (u_int8_t *)&d; for (i = 0 ; i < dp->sa_len; i++) { r[i] = p[i] & q[i]; /* XXX rough estimate */ matchlen += (q[i] ? 
8 : 0); } /* need to overwrite len/family portion as we don't compare them */ s.ss_len = sp->sa_len; s.ss_family = sp->sa_family; d.ss_len = dp->sa_len; d.ss_family = dp->sa_family; if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { return matchlen; } else return 0; } static void encap_fillarg(m, ep) struct mbuf *m; const struct encaptab *ep; { -#if 0 - m->m_pkthdr.aux = ep->arg; -#else - struct mbuf *n; + struct m_tag *tag; - n = m_aux_add(m, AF_INET, IPPROTO_IPV4); - if (n) { - *mtod(n, void **) = ep->arg; - n->m_len = sizeof(void *); + tag = m_tag_get(PACKET_TAG_ENCAP, sizeof (void*), M_NOWAIT); + if (tag) { + *(void**)(tag+1) = ep->arg; + m_tag_prepend(m, tag); } -#endif } void * encap_getarg(m) struct mbuf *m; { - void *p; -#if 0 - p = m->m_pkthdr.aux; - m->m_pkthdr.aux = NULL; - return p; -#else - struct mbuf *n; + void *p = NULL; + struct m_tag *tag; - p = NULL; - n = m_aux_find(m, AF_INET, IPPROTO_IPV4); - if (n) { - if (n->m_len == sizeof(void *)) - p = *mtod(n, void **); - m_aux_delete(m, n); + tag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); + if (tag) { + p = *(void**)(tag+1); + m_tag_delete(m, tag); } return p; -#endif } Index: head/sys/netinet/ip_fw2.c =================================================================== --- head/sys/netinet/ip_fw2.c (revision 105193) +++ head/sys/netinet/ip_fw2.c (revision 105194) @@ -1,2770 +1,2770 @@ /* * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * Implement IP packet firewall (new version) */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #define IPFW2 1 #if IPFW2 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for ETHERTYPE_IP */ #include /* XXX for in_cksum */ /* * XXX This one should go in sys/mbuf.h. It is used to avoid that * a firewall-generated packet loops forever through the firewall. 
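 */

/*
 * The diff above switches encap_fillarg()/encap_getarg() from mbuf aux
 * data to m_tag storage: a tag header with its payload living directly
 * behind it, hence the *(void **)(tag + 1) accesses. A user-space toy
 * version of that layout (struct tag, struct pkt, tag_get are
 * illustrative, not the kernel API):
 */
#include <stdio.h>
#include <stdlib.h>

struct tag { struct tag *next; int type; };
struct pkt { struct tag *tags; };

static struct tag *
tag_get(int type, size_t len)	/* header plus len bytes of payload */
{
	struct tag *t = malloc(sizeof(*t) + len);

	if (t != NULL) {
		t->next = NULL;
		t->type = type;
	}
	return t;
}

int
main(void)
{
	struct pkt m = { NULL };
	struct tag *t;
	int arg = 42;

	/* attach: payload sits right after the header, hence (t + 1) */
	if ((t = tag_get(1, sizeof(void *))) == NULL)
		return 1;
	*(void **)(t + 1) = &arg;
	t->next = m.tags;	/* prepend, like m_tag_prepend() */
	m.tags = t;

	/* find and detach, as encap_getarg() does */
	for (t = m.tags; t != NULL; t = t->next)
		if (t->type == 1)
			break;
	if (t != NULL) {
		printf("arg = %d\n", **(int **)(t + 1));
		m.tags = t->next;
		free(t);
	}
	return 0;
}

/*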
*/ #ifndef M_SKIP_FIREWALL #define M_SKIP_FIREWALL 0x4000 #endif /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set * are disabled. Set 31 is reserved for the default rule * and CANNOT be disabled. */ static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout_handle ipfw_timeout_h; #define IPFW_DEFAULT_RULE 65535 /* * list of rules for layer 3 */ static struct ip_fw *layer3_chain; MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); static int fw_debug = 1; int fw_one_pass = 1; static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_SECURE, &fw_enable, 0, "Enable ipfw"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &autoinc_step, 0, "Rule number autincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE, &fw_verbose, 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is stored in dyn_count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ static ipfw_dyn_rule **ipfw_dyn_v = NULL; static u_int32_t dyn_buckets = 256; /* must be power of 2 */ static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ /* * Timeouts for various events in handing dynamic rules. */ static u_int32_t dyn_ack_lifetime = 300; static u_int32_t dyn_syn_lifetime = 20; static u_int32_t dyn_fin_lifetime = 1; static u_int32_t dyn_rst_lifetime = 1; static u_int32_t dyn_udp_lifetime = 10; static u_int32_t dyn_short_lifetime = 5; /* * Keepalives are sent if dyn_keepalive is set. 
 * They are sent every dyn_keepalive_period seconds, in the last
 * dyn_keepalive_interval seconds of lifetime of a rule.
 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
 * than dyn_keepalive_period.
 */
static u_int32_t dyn_keepalive_interval = 20;
static u_int32_t dyn_keepalive_period = 5;
static u_int32_t dyn_keepalive = 1;	/* do send keepalives */

static u_int32_t static_count;	/* # of static rules */
static u_int32_t static_len;	/* size in bytes of static rules */
static u_int32_t dyn_count;	/* # of dynamic rules */
static u_int32_t dyn_max = 4096;	/* max # of dynamic rules */

SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
    &dyn_buckets, 0, "Number of dyn. buckets");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
    &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
    &dyn_count, 0, "Number of dyn. rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
    &dyn_max, 0, "Max number of dyn. rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
#endif /* SYSCTL_NODE */

static ip_fw_chk_t	ipfw_chk;

ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL;	/* hook into dummynet */

/*
 * This macro maps an ip pointer into a layer3 header pointer of type T
 */
#define	L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))

static __inline int
icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int type = L3HDR(struct icmp,ip)->icmp_type;

	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
}

#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )

static int
is_icmp_query(struct ip *ip)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;

	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
}
#undef TT

/*
 * The following checks use two arrays of 8 or 16 bits to store the
 * bits that we want set or clear, respectively. They are in the
 * low and high half of cmd->arg1 or cmd->d[0].
 *
 * We scan options and store the bits we find set. We succeed if
 *
 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
 *
 * The code is sometimes optimized not to store additional variables.
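 */

/*
 * A worked, stand-alone demo of the want_set/want_clear encoding just
 * described, equivalent to flags_match() below. The match() name and
 * the SYN/FIN-like bit values are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

static int
match(uint16_t arg1, uint8_t bits)
{
	uint8_t want_set = arg1 & 0xff;
	uint8_t want_clear = (arg1 >> 8) & 0xff;

	return (want_set & ~bits) == 0 &&
	    (want_clear & ~bits) == want_clear;
}

int
main(void)
{
	/* want 0x02 (SYN-like) set and 0x01 (FIN-like) clear */
	uint16_t arg1 = 0x02 | (0x01 << 8);

	printf("%d %d %d\n",
	    match(arg1, 0x02),		/* 1: SYN set, FIN clear */
	    match(arg1, 0x03),		/* 0: FIN also set */
	    match(arg1, 0x00));		/* 0: SYN missing */
	return 0;
}

/*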
*/ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; struct tcphdr *tcp = L3HDR(struct tcphdr,ip); u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; case TCPOPT_CC: case TCPOPT_CCNEW: case TCPOPT_CCECHO: bits |= IP_FW_TCPOPT_CC; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ /* Check unit number (-1 is wildcard) */ if (cmd->p.unit != -1 && cmd->p.unit != ifp->if_unit) return(0); /* Check name */ if (!strncmp(ifp->if_name, cmd->name, IFNAMSIZ)) return(1); } else { struct ifaddr *ia; TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return(1); /* match */ } } return(0); /* no match, fail ... */ } static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! 
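 */

/*
 * A stand-alone version of the option-walking loop ipopts_match() and
 * tcpopts_match() above share: EOL ends the scan, NOP advances by one,
 * anything else carries its own length byte which must be validated.
 * The option codes and the fabricated option area are illustrative.
 */
#include <stdio.h>

#define OPT_EOL	0
#define OPT_NOP	1

int
main(void)
{
	/* NOP, then option 5 with length 4, then EOL padding */
	unsigned char opts[] = { OPT_NOP, 5, 4, 0, 0, OPT_EOL, 0, 0 };
	unsigned char *cp = opts;
	int x = sizeof(opts), optlen;

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[0];

		if (opt == OPT_EOL)
			break;
		if (opt == OPT_NOP)
			optlen = 1;
		else {
			optlen = cp[1];
			if (optlen <= 0 || optlen > x)
				break;	/* invalid or truncated */
		}
		printf("opt %d len %d\n", opt, optlen);
	}
	return 0;
}

/*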
*/ static void ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh, struct mbuf *m, struct ifnet *oif) { char *action; int limit_reached = 0; char action2[40], proto[48], fragment[28]; fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (verbose_limit != 0 && norule_counter >= verbose_limit) return; norule_counter++; if (norule_counter == verbose_limit) limit_reached = verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(sa->sa.sin_addr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", ntohs(sa->sa.sin_port)); } break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { struct ip *ip = mtod(m, struct ip *); /* these three are all aliases to the same thing */ struct icmp *const icmp = L3HDR(struct icmp, ip); struct tcphdr *const tcp = (struct tcphdr *)icmp; struct udphdr *const udp = (struct udphdr *)icmp; int ip_off, offset, ip_len; int len; if (eh != NULL) { /* layer 2 packets are as on the wire */ ip_off = ntohs(ip->ip_off); ip_len = ntohs(ip->ip_len); } else { ip_off = ip->ip_off; ip_len = ip->ip_len; } offset = ip_off & IP_OFFMASK; switch (ip->ip_p) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", inet_ntoa(ip->ip_src)); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), inet_ntoa(ip->ip_dst), ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", inet_ntoa(ip->ip_src)); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), inet_ntoa(ip->ip_dst), ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; case IPPROTO_ICMP: if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", inet_ntoa(ip->ip_src)); snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p, inet_ntoa(ip->ip_src)); snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; } if (ip_off & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), offset << 3, (ip_off & IP_MF) 
? "+" : ""); } if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%d%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_name : m->m_pkthdr.rcvif->if_name, oif ? oif->if_unit : m->m_pkthdr.rcvif->if_unit, fragment); else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (curr_dyn_buckets - 1); return i; } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(printf("-- unlink entry 0x%08x %d -> 0x%08x %d, %d left\n", \ (q->id.src_ip), (q->id.src_port), \ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ dyn_count--; \ free(old_q, M_IPFW); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. */ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; if (ipfw_dyn_v == NULL || dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_second) return; last_remove = time_second; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < curr_dyn_buckets ; i++) { for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf( "OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_second )) goto next; } UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } /** * lookup a dynamic rule. */ static ipfw_dyn_rule * lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions. 
* Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; if (ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT) goto next; if (TIME_LEQ( q->expire, time_second)) { /* expire entry */ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } if ( pkt->proto == q->id.proto) { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ q->expire = time_second + dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } q->expire = time_second + dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (dyn_fin_lifetime >= dyn_keepalive_period) dyn_fin_lifetime = dyn_keepalive_period - 1; q->expire = time_second + dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (dyn_rst_lifetime >= dyn_keepalive_period) dyn_rst_lifetime = dyn_keepalive_period - 1; q->expire = time_second + dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_second + dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_second + dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } static void realloc_dynamic_table(void) { /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. */ if (dyn_buckets > 65536) dyn_buckets = 1024; if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ dyn_buckets = curr_dyn_buckets; /* reset */ return; } curr_dyn_buckets = dyn_buckets; if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); for (;;) { ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_DONTWAIT | M_ZERO); if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) break; curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). 
When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; if (ipfw_dyn_v == NULL || (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { realloc_dynamic_table(); if (ipfw_dyn_v == NULL) return NULL; /* failed ! */ } i = hash_packet(id); r = malloc(sizeof *r, M_IPFW, M_DONTWAIT | M_ZERO); if (r == NULL) { printf ("sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_second + dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = r; dyn_count++; DEB(printf("-- add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), dyn_count ); ) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. */ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; if (ipfw_dyn_v) { i = hash_packet( pkt ); for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port) { q->expire = time_second + dyn_short_lifetime; DEB(printf("lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ static int install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args) { static int last_log; ipfw_dyn_rule *q; DEB(printf("-- install state type %d 0x%08x %u -> 0x%08x %u\n", cmd->o.opcode, (args->f_id.src_ip), (args->f_id.src_port), (args->f_id.dst_ip), (args->f_id.dst_port) );) q = lookup_dyn_rule(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ if (last_log != time_second) { last_log = time_second; printf(" install_state: entry already present, done\n"); } return 0; } if (dyn_count >= dyn_max) /* * Run out of slots, try to remove any expired rule. 
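* (the (ipfw_dyn_rule *)1 below is just a non-NULL keep_me sentinel: it makes remove_dyn_rule() reclaim only expired entries, and it can never compare equal to a real rule)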
*/ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (dyn_count >= dyn_max) { if (last_log != time_second) { last_log = time_second; printf("install_state: Too many dynamic rules\n"); } return 1; /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: /* limit number of sessions */ { u_int16_t limit_mask = cmd->limit_mask; struct ipfw_flow_id id; ipfw_dyn_rule *parent; DEB(printf("installing dyn-limit rule %d\n", cmd->conn_limit);) id.dst_ip = id.src_ip = 0; id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; parent = lookup_dyn_parent(&id, rule); if (parent == NULL) { printf("add parent failed\n"); return 1; } if (parent->count >= cmd->conn_limit) { /* * See if we can remove some expired rule. */ remove_dyn_rule(rule, parent); if (parent->count >= cmd->conn_limit) { if (fw_verbose && last_log != time_second) { last_log = time_second; printf( "drop session, too many entries\n"); } return 1; } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); } break; default: printf("unknown dynamic rule type %u\n", cmd->o.opcode); return 1; } lookup_dyn_rule(&args->f_id, NULL, NULL); /* XXX just set lifetime */ return 0; } /* * Transmit a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_SYN selects * the direction (forward if set, reverse if clear; see below). */ static void send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; struct tcphdr *tcp; struct route sro; /* fake route */ MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == 0) return; m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); m->m_data += max_linkhdr; ip = mtod(m, struct ip *); bzero(ip, m->m_len); tcp = (struct tcphdr *)(ip + 1); /* no IP options */ ip->ip_p = IPPROTO_TCP; tcp->th_off = 5; /* * Assume we are sending a RST (or a keepalive in the reverse * direction), swap src and destination addresses and ports. */ ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); tcp->th_sport = htons(id->dst_port); tcp->th_dport = htons(id->src_port); if (flags & TH_RST) { /* we are sending a RST */ if (flags & TH_ACK) { tcp->th_seq = htonl(ack); tcp->th_ack = htonl(0); tcp->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; tcp->th_seq = htonl(0); tcp->th_ack = htonl(seq); tcp->th_flags = TH_RST | TH_ACK; } } else { /* * We are sending a keepalive. flags & TH_SYN determines * the direction, forward if set, reverse if clear. * NOTE: seq and ack are always assumed to be correct * as set by the caller. This may be confusing... */ if (flags & TH_SYN) { /* * we have to rewrite the correct addresses! */ ip->ip_dst.s_addr = htonl(id->dst_ip); ip->ip_src.s_addr = htonl(id->src_ip); tcp->th_dport = htons(id->dst_port); tcp->th_sport = htons(id->src_port); } tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; } /* * set ip_len to the payload size so we can compute * the tcp checksum on the pseudoheader * XXX check this, could save a couple of words ?
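* (the IP header was bzero'ed above, so in_cksum() over the whole mbuf sums exactly the pseudo-header fields -- src, dst, proto and TCP length -- plus the TCP header itself)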
*/ ip->ip_len = htons(sizeof(struct tcphdr)); tcp->th_sum = in_cksum(m, m->m_pkthdr.len); /* * now fill fields left out earlier */ ip->ip_ttl = ip_defttl; ip->ip_len = m->m_pkthdr.len; bzero (&sro, sizeof (sro)); ip_rtaddr(ip->ip_dst, &sro); m->m_flags |= M_SKIP_FIREWALL; - ip_output(m, NULL, &sro, 0, NULL); + ip_output(m, NULL, &sro, 0, NULL, NULL); if (sro.ro_rt) RTFREE(sro.ro_rt); } /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int offset, int ip_len) { if (code != ICMP_REJECT_RST) /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) send_pkt(&(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); m_freem(args->m); } else m_freem(args->m); args->m = NULL; } /** * * Given an ip_fw *, lookup_next_rule will return a pointer * to the next rule, which can be either the jump * target (for skipto instructions) or the next one in the list (in * all other cases including a missing jump target). * The result is also written in the "next_rule" field of the rule. * Backward jumps are not allowed, so start looking from the next * rule... * * This never returns NULL -- in case we do not have an exact match, * the next rule is returned. When the ruleset is changed, * pointers are flushed so we are always correct. */ static struct ip_fw * lookup_next_rule(struct ip_fw *me) { struct ip_fw *rule = NULL; ipfw_insn *cmd; /* look for action, in case it is a skipto */ cmd = ACTION_PTR(me); if ( cmd->opcode == O_SKIPTO ) for (rule = me->next; rule ; rule = rule->next) if (rule->rulenum >= cmd->arg1) break; if (rule == NULL) /* failure or not a skipto */ rule = me->next; me->next_rule = rule; return rule; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, or NULL for layer3 packet. * args->oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * * Return value: * * IP_FW_PORT_DENY_FLAG the packet must be dropped. * 0 The packet is to be accepted and routed normally OR * the packet was denied/rejected and has been dropped; * in the latter case, *m is equal to NULL upon return. * port Divert the packet to port, with these caveats: * * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead * of diverting it (ie, 'ipfw tee'). * * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower * 16 bits as a dummynet pipe number instead of diverting */ static int ipfw_chk(struct ip_fw_args *args) { /* * Local variables hold state during the processing of a packet. * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. 
* * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is simply an alias of the value of m, and it is kept * in sync with it (the packet is supposed to start with * the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, bdg_forward, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; /* * hlen The length of the IPv4 header. * hlen >0 means we have an IPv4 packet. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ u_int16_t ip_len=0; int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; if (m->m_flags & M_SKIP_FIREWALL) return 0; /* accept */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ if (args->eh == NULL || /* layer 3 packet */ ( m->m_pkthdr.len >= sizeof(struct ip) && ntohs(args->eh->ether_type) == ETHERTYPE_IP)) hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster matching. */ if (hlen == 0) { /* do not grab addresses for non-ip pkts */ proto = args->f_id.proto = 0; /* mark f_id invalid */ goto after_ip_checks; } proto = args->f_id.proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; if (args->eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { offset = ip->ip_off & IP_OFFMASK; ip_len = ip->ip_len; } #define PULLUP_TO(len) \ do { \ if ((m)->m_len < (len)) { \ args->m = m = m_pullup(m, (len)); \ if (m == 0) \ goto pullup_failed; \ ip = mtod(m, struct ip *); \ } \ } while (0) if (offset == 0) { switch (proto) { case IPPROTO_TCP: { struct tcphdr *tcp; PULLUP_TO(hlen + sizeof(struct tcphdr)); tcp = L3HDR(struct tcphdr, ip); dst_port = tcp->th_dport; src_port = tcp->th_sport; args->f_id.flags = tcp->th_flags; } break; case IPPROTO_UDP: { struct udphdr *udp; PULLUP_TO(hlen + sizeof(struct udphdr)); udp = L3HDR(struct udphdr, ip); dst_port = udp->uh_dport; src_port = udp->uh_sport; } break; case IPPROTO_ICMP: PULLUP_TO(hlen + 4); /* type, code and checksum. 
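* (4 bytes cover exactly these fields; only icmp_type is read below)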
*/ args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type; break; default: break; } #undef PULLUP_TO } args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); after_ip_checks: if (args->rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. * * If fw_one_pass != 0 then just accept it. * XXX should not happen here, but optimized out in * the caller. */ if (fw_one_pass) return 0; f = args->rule->next_rule; if (f == NULL) f = lookup_next_rule(args->rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ int skipto = args->divert_rule; f = layer3_chain; if (args->eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) return(IP_FW_PORT_DENY_FLAG); /* invalid */ while (f && f->rulenum <= skipto) f = f->next; if (f == NULL) /* drop packet */ return(IP_FW_PORT_DENY_FLAG); } } args->divert_rule = 0; /* reset to avoid confusion later */ /* * Now scan the rules, and parse microinstructions for each rule. */ for (; f; f = f->next) { int l, cmdlen; ipfw_insn *cmd; int skip_or; /* skip rest of OR block */ again: if (set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ check_body: cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: /* * We only check offset == 0 && proto != 0, * as this ensures that we have an IPv4 * packet with the ports info. */ if (offset!=0) break; { struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; if (proto == IPPROTO_TCP) { wildcard = 0; pi = &tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = 1; pi = &udbinfo; } else break; pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb == NULL || pcb->inp_socket == NULL) break; #if __FreeBSD_version < 500034 #define socheckuid(a,b) ((a)->so_cred->cr_uid == (b)) #endif if (cmd->opcode == O_UID) { match = socheckuid(pcb->inp_socket, (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); } else { match = groupmember( (uid_t)((ipfw_insn_u32 *)cmd)->d[0], pcb->inp_socket->so_cred); } } break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? 
oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t t = ntohs(args->eh->ether_type); u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (t>=p[0] && t<=p[1]); } break; case O_FRAG: match = (hlen > 0 && offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = (hlen > 0 && ((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_MASK: match = (hlen > 0 && ((ipfw_insn_ip *)cmd)->addr.s_addr == (src_ip.s_addr & ((ipfw_insn_ip *)cmd)->mask.s_addr)); break; case O_IP_SRC_ME: if (hlen > 0) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); } break; case O_IP_DST_SET: case O_IP_SRC_SET: if (hlen > 0) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.src_ip : args->f_id.dst_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = (hlen > 0 && ((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_MASK: match = (hlen > 0) && (((ipfw_insn_ip *)cmd)->addr.s_addr == (dst_ip.s_addr & ((ipfw_insn_ip *)cmd)->mask.s_addr)); break; case O_IP_DST_ME: if (hlen > 0) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); } break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have an IPv4 * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? 
src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ip, (ipfw_insn_u32 *)cmd) ); break; case O_IPOPT: match = (hlen > 0 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (hlen > 0 && cmd->arg1 == ip->ip_v); break; case O_IPTTL: match = (hlen > 0 && cmd->arg1 == ip->ip_ttl); break; case O_IPID: match = (hlen > 0 && cmd->arg1 == ntohs(ip->ip_id)); break; case O_IPLEN: match = (hlen > 0 && cmd->arg1 == ip_len); break; case O_IPPRECEDENCE: match = (hlen > 0 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (hlen > 0 && flags_match(cmd, ip->ip_tos)); break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, L3HDR(struct tcphdr,ip)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(ip, cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == L3HDR(struct tcphdr,ip)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == L3HDR(struct tcphdr,ip)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == L3HDR(struct tcphdr,ip)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (L3HDR(struct tcphdr,ip)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_LOG: if (fw_verbose) ipfw_log(f, hlen, args->eh, m, oif); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to do a 'goto done'). * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * ('goto next_rule', equivalent to a 'break 2'), * or to the SKIPTO target ('goto again' after * having set f, cmd and l), respectively. * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet * must be dropped * ('goto done' after setting retval); * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * ('goto check_body') if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found ('goto next_rule'). * The result of the lookup is cached so that * further instances of these opcodes are * effectively NOPs. */ case O_LIMIT: case O_KEEP_STATE: if (install_state(f, (ipfw_insn_limit *)cmd, args)) { retval = IP_FW_PORT_DENY_FLAG; goto done; /* error/limit violation */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir.
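* (MATCH_FORWARD or MATCH_REVERSE on a hit, MATCH_NONE on a miss)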
* The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? L3HDR(struct tcphdr, ip) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule. */ q->pcnt++; q->bcnt += ip_len; f = q->rule; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; goto check_body; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) goto next_rule; match = 1; break; case O_ACCEPT: retval = 0; /* accept */ goto done; case O_PIPE: case O_QUEUE: args->rule = f; /* report matching rule */ retval = cmd->arg1 | IP_FW_PORT_DYNT_FLAG; goto done; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; args->divert_rule = f->rulenum; retval = (cmd->opcode == O_DIVERT) ? cmd->arg1 : cmd->arg1 | IP_FW_PORT_TEE_FLAG; goto done; case O_COUNT: case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += ip_len; f->timestamp = time_second; if (cmd->opcode == O_COUNT) goto next_rule; /* handle skipto */ if (f->next_rule == NULL) lookup_next_rule(f); f = f->next_rule; goto again; case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && (proto != IPPROTO_ICMP || is_icmp_query(ip)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(dst_ip.s_addr)) { send_reject(args, cmd->arg1, offset,ip_len); m = args->m; } /* FALLTHROUGH */ case O_DENY: retval = IP_FW_PORT_DENY_FLAG; goto done; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) args->next_hop = &((ipfw_insn_sa *)cmd)->sa; retval = 0; goto done; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner for, scan opcodes */ next_rule:; /* try next rule */ } /* end of outer for, scan rules */ printf("+++ ipfw: ouch!, skip past end of rules, denying packet\n"); return(IP_FW_PORT_DENY_FLAG); done: /* Update statistics */ f->pcnt++; f->bcnt += ip_len; f->timestamp = time_second; return retval; pullup_failed: if (fw_verbose) printf("pullup failed\n"); return(IP_FW_PORT_DENY_FLAG); } /* * When a rule is added/deleted, clear the next_rule pointers in all rules. * These will be reconstructed on the fly as packets are matched. * Must be called at splimp(). */ static void flush_rule_ptrs(void) { struct ip_fw *rule; for (rule = layer3_chain; rule; rule = rule->next) rule->next_rule = NULL; } /* * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given * pipe/queue, or to all of them (match == NULL). * Must be called at splimp(). */ void flush_pipe_ptrs(struct dn_flow_set *match) { struct ip_fw *rule; for (rule = layer3_chain; rule; rule = rule->next) { ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule); if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE) continue; if (match == NULL || cmd->pipe_ptr == match) cmd->pipe_ptr = NULL; } } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. 
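* (a rulenum of 0 requests auto-numbering: the rule gets the highest number in use before the default rule, plus autoinc_step when there is room below IPFW_DEFAULT_RULE)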
* Update the rule_number in the input struct so the caller knows it as well. */ static int add_rule(struct ip_fw **head, struct ip_fw *input_rule) { struct ip_fw *rule, *f, *prev; int s; int l = RULESIZE(input_rule); if (*head == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) return (EINVAL); rule = malloc(l, M_IPFW, M_DONTWAIT | M_ZERO); if (rule == NULL) return (ENOSPC); bcopy(input_rule, rule, l); rule->next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; s = splimp(); if (*head == NULL) { /* default rule */ *head = rule; goto done; } /* * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ if (autoinc_step < 1) autoinc_step = 1; else if (autoinc_step > 1000) autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default */ for (f = *head; f; f = f->next) { if (f->rulenum == IPFW_DEFAULT_RULE) break; rule->rulenum = f->rulenum; } if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) rule->rulenum += autoinc_step; input_rule->rulenum = rule->rulenum; } /* * Now insert the new rule in the right place in the sorted list. */ for (prev = NULL, f = *head; f; prev = f, f = f->next) { if (f->rulenum > rule->rulenum) { /* found the location */ if (prev) { rule->next = f; prev->next = rule; } else { /* head insert */ rule->next = *head; *head = rule; } break; } } flush_rule_ptrs(); done: static_count++; static_len += l; splx(s); DEB(printf("++ installed rule %d, static count now %d\n", rule->rulenum, static_count);) return (0); } /** * Free storage associated with a static rule (including derived * dynamic rules). * The caller is in charge of clearing rule pointers to avoid * dangling pointers. * @return a pointer to the next entry. * Arguments are not checked, so they better be correct. * Must be called at splimp(). */ static struct ip_fw * delete_rule(struct ip_fw **head, struct ip_fw *prev, struct ip_fw *rule) { struct ip_fw *n; int l = RULESIZE(rule); n = rule->next; remove_dyn_rule(rule, NULL /* force removal */); if (prev == NULL) *head = n; else prev->next = n; static_count--; static_len -= l; if (DUMMYNET_LOADED) ip_dn_ruledel_ptr(rule); free(rule, M_IPFW); return n; } /* * Deletes all rules from a chain (including the default rule * if the second argument is set). * Must be called at splimp(). */ static void free_chain(struct ip_fw **chain, int kill_default) { struct ip_fw *rule; flush_rule_ptrs(); /* more efficient to do outside the loop */ while ( (rule = *chain) != NULL && (kill_default || rule->rulenum != IPFW_DEFAULT_RULE) ) delete_rule(chain, NULL, rule); } /** * Remove all rules with given number, and also do set manipulation. * * The argument is an u_int32_t. 
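* (packed as (cmd << 24) | (new_set << 16) | number, i.e.:)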
The low 16 bits are the rule or set number, * the next 8 bits are the new set, the top 8 bits are the command: * * 0 delete rules with given number * 1 delete rules with given set number * 2 move rules with given number to new set * 3 move rules with given set number to new set * 4 swap sets with given numbers */ static int del_entry(struct ip_fw **chain, u_int32_t arg) { struct ip_fw *prev, *rule; int s; u_int16_t rulenum; u_int8_t cmd, new_set; rulenum = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 4) return EINVAL; if (new_set > 30) return EINVAL; if (cmd == 0 || cmd == 2) { if (rulenum == IPFW_DEFAULT_RULE) return EINVAL; } else { if (rulenum > 30) return EINVAL; } switch (cmd) { case 0: /* delete rules with given number */ /* * locate first rule to delete */ for (prev = NULL, rule = *chain; rule && rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) return EINVAL; s = splimp(); /* no access to rules while removing */ /* * flush pointers outside the loop, then delete all matching * rules. prev remains the same throughout the cycle. */ flush_rule_ptrs(); while (rule && rule->rulenum == rulenum) rule = delete_rule(chain, prev, rule); splx(s); break; case 1: /* delete all rules with given set number */ s = splimp(); flush_rule_ptrs(); for (prev = NULL, rule = *chain; rule ; ) if (rule->set == rulenum) rule = delete_rule(chain, prev, rule); else { prev = rule; rule = rule->next; } splx(s); break; case 2: /* move rules with given number to new set */ s = splimp(); for (rule = *chain; rule ; rule = rule->next) if (rule->rulenum == rulenum) rule->set = new_set; splx(s); break; case 3: /* move rules with given set number to new set */ s = splimp(); for (rule = *chain; rule ; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; splx(s); break; case 4: /* swap two sets */ s = splimp(); for (rule = *chain; rule ; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; else if (rule->set == new_set) rule->set = rulenum; splx(s); break; } return 0; } /* * Clear counters for a specific rule. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * @arg rulenum is 0 to clear all entries, or contains a specific * rule number. * @arg log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(int rulenum, int log_only) { struct ip_fw *rule; int s; char *msg; if (rulenum == 0) { s = splimp(); norule_counter = 0; for (rule = layer3_chain; rule; rule = rule->next) clear_counters(rule, log_only); splx(s); msg = log_only ? "ipfw: All logging counts reset.\n" : "ipfw: Accounting cleared.\n"; } else { int cleared = 0; /* * We can have multiple rules with the same number, so we * need to clear them all. */ for (rule = layer3_chain; rule; rule = rule->next) if (rule->rulenum == rulenum) { s = splimp(); while (rule && rule->rulenum == rulenum) { clear_counters(rule, log_only); rule = rule->next; } splx(s); cleared = 1; break; } if (!cleared) /* we did not find any matching rules */ return (EINVAL); msg = log_only ? "ipfw: Entry %d logging count reset.\n" : "ipfw: Entry %d cleared.\n"; } if (fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } /* * Check validity of the structure before insert.
* Fortunately rules are simple, so this mostly needs to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } DEB(printf("ipfw: opcode %d\n", cmd->opcode);) switch (cmd->opcode) { case O_NOP: case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_IPOPT: case O_IPLEN: case O_IPID: case O_IPTOS: case O_IPPRECEDENCE: case O_IPTTL: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_UID: case O_GID: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (cmdlen != F_INSN_SIZE(ipfw_insn_ip)) goto bad_size; if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) { printf("ipfw: opcode %d, useless rule\n", cmd->opcode); return EINVAL; } break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe)) goto bad_size; goto check_action; case O_FORWARD_IP: if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: case O_SKIPTO: case O_DIVERT: case O_TEE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /** * {set|get}sockopt parser.
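* (supported options: IP_FW_GET, IP_FW_FLUSH, IP_FW_ADD, IP_FW_DEL, IP_FW_ZERO and IP_FW_RESETLOG; see the switch below)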
*/ static int ipfw_ctl(struct sockopt *sopt) { int error, s, rulenum; size_t size; struct ip_fw *bp , *buf, *rule; static u_int32_t rule_buf[255]; /* we copy the data here */ /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { #if __FreeBSD_version >= 500034 error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); #else /* FreeBSD 4.x */ if (securelevel >= 3) return (EPERM); #endif } error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. */ s = splimp(); size = static_len; /* size of static rules */ if (ipfw_dyn_v) /* add size of dyn.rules */ size += (dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know * how much room is needed, do not bother filling up the * buffer, just jump to the sooptcopyout. */ buf = malloc(size, M_TEMP, M_WAITOK); if (buf == 0) { splx(s); error = ENOBUFS; break; } bp = buf; for (rule = layer3_chain; rule ; rule = rule->next) { int i = RULESIZE(rule); bcopy(rule, bp, i); /* * abuse 'next_rule' to store the set_disable word */ (u_int32_t)(((struct ip_fw *)bp)->next_rule) = set_disable; bp = (struct ip_fw *)((char *)bp + i); } if (ipfw_dyn_v) { int i; ipfw_dyn_rule *p, *dst, *last = NULL; dst = (ipfw_dyn_rule *)bp; for (i = 0 ; i < curr_dyn_buckets ; i++ ) for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next, dst++ ) { bcopy(p, dst, sizeof *p); (int)dst->rule = p->rule->rulenum ; /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ dst->next = dst ; last = dst ; dst->expire = TIME_LEQ(dst->expire, time_second) ? 0 : dst->expire - time_second ; } if (last != NULL) /* mark last dynamic rule */ last->next = NULL; } splx(s); error = sooptcopyout(sopt, buf, size); free(buf, M_TEMP); break; case IP_FW_FLUSH: /* * Normally we cannot release the lock on each iteration. * We could do it here only because we start from the head all * the times so there is no risk of missing some entries. * On the other hand, the risk is that we end up with * a very inconsistent ruleset, so better keep the lock * around the whole cycle. * * XXX this code can be improved by resetting the head of * the list to point to the default rule, and then freeing * the old list without the need for a lock. */ s = splimp(); free_chain(&layer3_chain, 0 /* keep default rule */); splx(s); break; case IP_FW_ADD: rule = (struct ip_fw *)rule_buf; /* XXX do a malloc */ error = sooptcopyin(sopt, rule, sizeof(rule_buf), sizeof(struct ip_fw) ); size = sopt->sopt_valsize; if (error || (error = check_ipfw_struct(rule, size))) break; error = add_rule(&layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. 
* first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. */ error = sooptcopyin(sopt, rule_buf, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ error = del_entry(&layer3_chain, rule_buf[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ set_disable = (set_disable | rule_buf[0]) & ~rule_buf[1] & ~(1<<31); /* set 31 always enabled */ else error = EINVAL; break; case IP_FW_ZERO: case IP_FW_RESETLOG: /* argument is an int, the rule number */ rulenum=0; if (sopt->sopt_val != 0) { error = sooptcopyin(sopt, &rulenum, sizeof(int), sizeof(int)); if (error) break; } error = zero_entry(rulenum, sopt->sopt_name == IP_FW_RESETLOG); break; default: printf("ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); } /** * dummynet needs a reference to the default rule, because rules can be * deleted while packets hold a reference to them. When this happens, * dummynet changes the reference to the default rule (it could well be a * NULL pointer, but this way we do not need to check for the special * case, plus here we have info on the default behaviour). */ struct ip_fw *ip_fw_default_rule; /* * This procedure is only used to handle keepalives. It is invoked * every dyn_keepalive_period seconds. */ static void ipfw_tick(void * __unused unused) { int i; int s; ipfw_dyn_rule *q; if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) goto done; s = splimp(); for (i = 0 ; i < curr_dyn_buckets ; i++) { for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; if (TIME_LEQ( time_second+dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_second)) continue; /* too late, rule expired */ send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0); } } splx(s); done: ipfw_timeout_h = timeout(ipfw_tick, NULL, dyn_keepalive_period*hz); } static void ipfw_init(void) { struct ip_fw default_rule; ip_fw_chk_ptr = ipfw_chk; ip_fw_ctl_ptr = ipfw_ctl; layer3_chain = NULL; bzero(&default_rule, sizeof default_rule); default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = 31; default_rule.cmd[0].len = 1; default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT 1 ? O_ACCEPT : #endif O_DENY; add_rule(&layer3_chain, &default_rule); ip_fw_default_rule = layer3_chain; printf("ipfw2 initialized, divert %s, " "rule-based forwarding enabled, default to %s, logging ", #ifdef IPDIVERT "enabled", #else "disabled", #endif default_rule.cmd[0].opcode == O_ACCEPT ?
"accept" : "deny"); #ifdef IPFIREWALL_VERBOSE fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif if (fw_verbose == 0) printf("disabled\n"); else if (verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", verbose_limit); bzero(&ipfw_timeout_h, sizeof(struct callout_handle)); ipfw_timeout_h = timeout(ipfw_tick, NULL, hz); } static int ipfw_modevent(module_t mod, int type, void *unused) { int s; int err = 0; switch (type) { case MOD_LOAD: s = splimp(); if (IPFW_LOADED) { splx(s); printf("IP firewall already loaded\n"); err = EEXIST; } else { ipfw_init(); splx(s); } break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("ipfw statically compiled, cannot unload\n"); err = EBUSY; #else s = splimp(); untimeout(ipfw_tick, NULL, ipfw_timeout_h); ip_fw_chk_ptr = NULL; ip_fw_ctl_ptr = NULL; free_chain(&layer3_chain, 1 /* kill default rule */); splx(s); printf("IP firewall unloaded\n"); #endif break; default: break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(ipfw, 1); #endif /* IPFW2 */ Index: head/sys/netinet/ip_icmp.c =================================================================== --- head/sys/netinet/ip_icmp.c (revision 105193) +++ head/sys/netinet/ip_icmp.c (revision 105194) @@ -1,876 +1,876 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif #include /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ static struct icmpstat icmpstat; SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, &icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, &icmpmaskrepl, 0, ""); static int drop_redirect = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, &drop_redirect, 0, ""); static int log_redirect = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, &log_redirect, 0, ""); static int icmplim = 200; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, &icmplim, 0, ""); static int icmplim_output = 1; SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, &icmplim_output, 0, ""); /* * ICMP broadcast echo sysctl */ static int icmpbmcastecho = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, &icmpbmcastecho, 0, ""); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *, struct route *); static int ip_next_mtu(int, int); extern struct protosw inetsw[]; /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(n, type, code, dest, destifp) struct mbuf *n; int type, code; n_long dest; struct ifnet *destifp; { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) icmpstat.icps_error++; /* * Don't send error if not the first fragment of message. * Don't error if the old packet protocol was ICMP * error message, only known informational types. */ if (oip->ip_off &~ (IP_MF|IP_DF)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiplen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { icmpstat.icps_oldicmp++; goto freeit; } /* Don't send error in response to a multicast or broadcast packet */ if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; /* * First, formulate icmp message */ m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) goto freeit; #ifdef MAC mac_create_mbuf_netlayer(n, m); #endif icmplen = min(oiplen + 8, oip->ip_len); if (icmplen < sizeof(struct ip)) panic("icmp_error: bad length"); m->m_len = icmplen + ICMP_MINLEN; MH_ALIGN(m, m->m_len); icp = mtod(m, struct icmp *); if ((u_int)type > ICMP_MAXTYPE) panic("icmp_error"); icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * zeroed icmp_void field. 
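* (icmp_pptr and icmp_nextmtu overlay icmp_void in the icmp_hun union of struct icmp, so zeroing icmp_void clears them all)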
*/ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && destifp) { icp->icmp_nextmtu = htons(destifp->if_mtu); } } icp->icmp_code = code; m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Convert fields to network representation. */ nip->ip_len = htons(nip->ip_len); nip->ip_off = htons(nip->ip_off); /* * Now, copy old ip header (without options) * in front of icmp message. */ if (m->m_data - sizeof(struct ip) < m->m_pktdat) panic("icmp len"); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = m->m_len; nip->ip_vhl = IP_VHL_BORING; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; icmp_reflect(m); freeit: m_freem(n); } static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; /* * Process a received ICMP message. */ void icmp_input(m, off) register struct mbuf *m; int off; { int hlen = off; register struct icmp *icp; register struct ip *ip = mtod(m, struct ip *); int icmplen = ip->ip_len; register int i; struct in_ifaddr *ia; void (*ctlfunc)(int, struct sockaddr *, void *); int code; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_src)); printf("icmp_input from %s to %s, len %d\n", buf, inet_ntoa(ip->ip_dst), icmplen); } #endif if (icmplen < ICMP_MINLEN) { icmpstat.icps_tooshort++; goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == 0) { icmpstat.icps_tooshort++; return; } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { icmpstat.icps_checksum++; goto freeit; } m->m_len += hlen; m->m_data -= hlen; if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { /* * Deliver very specific ICMP type only. */ switch (icp->icmp_type) { case ICMP_UNREACH: case ICMP_TIMXCEED: break; default: goto freeit; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; icmpstat.icps_inhist[icp->icmp_type]++; code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. 
* Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; goto deliver; case ICMP_SOURCEQUENCH: if (code) goto badcode; code = PRC_QUENCH; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { icmpstat.icps_badlen++; goto freeit; } icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; #if 1 /* * MTU discovery: * If we got a needfrag and there is a host route to the * original destination, and the MTU is not locked, then * set the MTU in the route to the suggested new value * (if given) and then notify as usual. The ULPs will * notice that the MTU has changed and adapt accordingly. * If no new MTU was suggested, then we guess a new one * less than the current value. If the new MTU is * unreasonably small (arbitrarily set at 296), then * we reset the MTU to the interface value and enable the * lock bit, indicating that we are no longer doing MTU * discovery. */ if (code == PRC_MSGSIZE) { struct rtentry *rt; int mtu; rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, RTF_CLONING | RTF_PRCLONING); if (rt && (rt->rt_flags & RTF_HOST) && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { mtu = ntohs(icp->icmp_nextmtu); if (!mtu) mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, 1); #ifdef DEBUG_MTUDISC printf("MTU for %s reduced to %d\n", inet_ntoa(icmpsrc.sin_addr), mtu); #endif if (mtu < 296) { /* rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; */ rt->rt_rmx.rmx_locks |= RTV_MTU; } else if (rt->rt_rmx.rmx_mtu > mtu) { rt->rt_rmx.rmx_mtu = mtu; } } if (rt) RTFREE(rt); } #endif /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: icmpstat.icps_badcode++; break; case ICMP_ECHO: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat.icps_bmcastecho++; break; } icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; else goto reflect; case ICMP_TSTAMP: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat.icps_bmcasttstamp++; break; } if (icmplen < ICMP_TSLEN) { icmpstat.icps_badlen++; break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; else goto reflect; case ICMP_MASKREQ: if (icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. 
*/ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == 0) break; if (ia->ia_ifp == 0) break; icp->icmp_type = ICMP_MASKREPLY; icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } reflect: ip->ip_len += hlen; /* since ip_input deducts this */ icmpstat.icps_reflect++; icmpstat.icps_outhist[icp->icmp_type]++; icmp_reflect(m); return; case ICMP_REDIRECT: if (log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } if (drop_redirect) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { icmpstat.icps_badlen++; break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); printf("redirect dst %s to %s\n", buf, inet_ntoa(icp->icmp_gwaddr)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, (struct rtentry **)0); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&icmpsrc); #endif break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: default: break; } raw: rip_input(m, off); return; freeit: m_freem(m); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(m) struct mbuf *m; { struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct in_ifaddr *ia; struct in_addr t; struct mbuf *opts = 0; int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); struct route *ro = NULL, rt; if (!in_canforward(ip->ip_src) && ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { m_freem(m); /* Bad return address */ icmpstat.icps_badaddr++; goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; ro = &rt; bzero(ro, sizeof(*ro)); /* * If the incoming packet was addressed directly to us, * use dst as the src for the reply. Otherwise (broadcast * or anonymous), use the address which corresponds * to the incoming interface. 
*/ LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) goto match; if (m->m_pkthdr.rcvif != NULL && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) goto match; } } ia = ip_rtaddr(ip->ip_dst, ro); /* We need a route to do anything useful. */ if (ia == NULL) { m_freem(m); icmpstat.icps_noroute++; goto done; } match: t = IA_SIN(ia)->sin_addr; ip->ip_src = t; ip->ip_ttl = ip_defttl; if (optlen > 0) { register u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute()) == 0 && (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } /* * Now strip out original options by copying rest of first * mbuf's data back, and adjust the IP length. */ ip->ip_len -= optlen; ip->ip_vhl = IP_VHL_BORING; m->m_len -= optlen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= optlen; optlen += sizeof(struct ip); bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), (unsigned)(m->m_len - sizeof(struct ip))); } m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts, ro); done: if (opts) (void)m_free(opts); if (ro && ro->ro_rt) RTFREE(ro->ro_rt); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(m, opts, rt) register struct mbuf *m; struct mbuf *opts; struct route *rt; { register struct ip *ip = mtod(m, struct ip *); register int hlen; register struct icmp *icp; hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); printf("icmp_send dst %s src %s\n", buf, inet_ntoa(ip->ip_src)); } #endif - (void) ip_output(m, opts, rt, 0, NULL); + (void) ip_output(m, opts, rt, 0, NULL, NULL); } n_time iptime() { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } #if 1 /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. 
*/ static int ip_next_mtu(mtu, dir) int mtu; int dir; { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, 68, 0 }; int i; for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) { if (mtu >= mtutab[i]) break; } if (dir < 0) { if (i == 0) { return 0; } else { return mtutab[i - 1]; } } else { if (mtutab[i] == 0) { return 0; } else if(mtu > mtutab[i]) { return mtutab[i]; } else { return mtutab[i + 1]; } } } #endif /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ int badport_bandlim(int which) { static int lticks[BANDLIM_MAX + 1]; static int lpackets[BANDLIM_MAX + 1]; int dticks; const char *bandlimittype[] = { "Limiting icmp unreach response", "Limiting icmp ping response", "Limiting icmp tstamp response", "Limiting closed port RST response", "Limiting open port RST response" }; /* * Return ok status if feature disabled or argument out of * ranage. */ if (icmplim <= 0 || which > BANDLIM_MAX || which < 0) return(0); dticks = ticks - lticks[which]; /* * reset stats when cumulative dt exceeds one second. */ if ((unsigned int)dticks > hz) { if (lpackets[which] > icmplim && icmplim_output) { printf("%s from %d to %d packets per second\n", bandlimittype[which], lpackets[which], icmplim ); } lticks[which] = ticks; lpackets[which] = 0; } /* * bump packet count */ if (++lpackets[which] > icmplim) { return(-1); } return(0); } Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 105193) +++ head/sys/netinet/ip_input.c (revision 105194) @@ -1,1984 +1,1984 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
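badport_bandlim() above is deliberately coarse: it is not a token bucket but a counter that resets whenever more than a second of ticks has elapsed, so a burst straddling a window boundary can briefly exceed the nominal rate. The same scheme extracted into a reusable helper (ratelim and now_ticks are illustrative names; in the kernel the clock is the global ticks/hz pair):

/*
 * Coarse per-second limiter in the style of badport_bandlim().
 * Returns 0 if the event may proceed, -1 if over the per-second
 * budget.  A sketch, not kernel code; error logging omitted.
 */
struct ratelim {
	int last_reset;		/* tick count at last window reset */
	int count;		/* events seen in current window */
};

static int
ratelim_check(struct ratelim *rl, int now_ticks, int hz, int limit)
{
	if (limit <= 0)				/* feature disabled */
		return 0;
	if ((unsigned)(now_ticks - rl->last_reset) > (unsigned)hz) {
		rl->last_reset = now_ticks;	/* >1s elapsed: new window */
		rl->count = 0;
	}
	if (++rl->count > limit)
		return -1;			/* over budget: suppress */
	return 0;
}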
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #define _IP_VHL #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_ipdivert.h" #include "opt_ipfilter.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_pfil_hooks.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif int rsvp_on = 0; int ipforwarding = 0; SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, &ipforwarding, 0, "Enable IP forwarding between interfaces"); static int ipsendredirects = 1; /* XXX */ SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, &ipsendredirects, 0, "Enable sending IP redirects"); int ip_defttl = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, &ip_defttl, 0, "Maximum TTL on IP packets"); static int ip_dosourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_RW, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); static int ip_keepfaith = 0; SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, &ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int ip_nfragpackets = 0; static int ip_maxfragpackets; /* initialized in ip_init() */ SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, &ip_maxfragpackets, 0, "Maximum number of IPv4 fragment reassembly queue entries"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. 
*/ static int ip_checkinterface = 1; SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); #ifdef DIAGNOSTIC static int ipprintfs = 0; #endif static int ipqmaxlen = IFQ_MAXLEN; extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; struct in_ifaddrhead in_ifaddrhead; /* first inet address */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_long in_ifaddrhmask; /* mask for hash table */ SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); struct ipstat ipstat; SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); /* Packet reassembly stuff */ #define IPREASS_NHASH_LOG2 6 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (IPREASS_NHASH - 1) #define IPREASS_HASH(x,y) \ (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static int nipq = 0; /* total # of reass queues */ static int maxnipq; #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH static int ipstealth = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, &ipstealth, 0, ""); #endif /* Firewall hooks */ ip_fw_chk_t *ip_fw_chk_ptr; int fw_enable = 1 ; /* Dummynet hooks */ ip_dn_io_t *ip_dn_io_ptr; /* * XXX this is ugly -- the following two global variables are * used to store packet state while it travels through the stack. * Note that the code even makes assumptions on the size and * alignment of fields inside struct ip_srcrt so e.g. adding some * fields will break the code. This needs to be fixed. * * We need to save the IP options in case a protocol wants to respond * to an incoming packet over the same route if the packet got here * using IP source routing. This allows connection establishment and * maintenance when the remote end is on a network that is not known * to us. */ static int ip_nhops = 0; static struct ip_srcrt { struct in_addr dst; /* final destination */ char nop; /* one NOP to align */ char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; } ip_srcrt; static void save_rte(u_char *, struct in_addr); static int ip_dooptions(struct mbuf *m, int, struct sockaddr_in *next_hop); static void ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop); static void ip_freef(struct ipqhead *, struct ipq *); static struct mbuf *ip_reass(struct mbuf *, struct ipqhead *, struct ipq *, u_int32_t *, u_int16_t *); static void ipintr(void); /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. 
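Fragments land in one of IPREASS_NHASH (64) TAILQ buckets chosen by IPREASS_HASH over the source address and IP ID; destination, protocol, and ID are then compared inside the bucket. Written out as a function the macro is easier to read — it packs two nibbles of the source address and XORs in the datagram ID (constants copied from the defines above; the function name is illustrative):

#include <stdint.h>

#define NHASH_LOG2 6			/* IPREASS_NHASH_LOG2 */
#define NHASH (1 << NHASH_LOG2)
#define HMASK (NHASH - 1)

/* Same mixing as IPREASS_HASH(x, y): nibble 0 and nibble 2 of the
 * source address, XORed with the IP ID, masked to 64 buckets. */
static unsigned
reass_bucket(uint32_t src, uint16_t id)
{
	unsigned h = (src & 0xF) | (((src >> 8) & 0xF) << 4);

	return (h ^ id) & HMASK;
}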
*/ void ip_init() { register struct protosw *pr; register int i; TAILQ_INIT(&in_ifaddrhead); in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask); pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) ip_protox[pr->pr_protocol] = pr - inetsw; for (i = 0; i < IPREASS_NHASH; i++) TAILQ_INIT(&ipq[i]); maxnipq = nmbclusters / 4; ip_maxfragpackets = nmbclusters / 4; #ifndef RANDOM_IP_ID ip_id = time_second & 0xffff; #endif ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); ipintrq_present = 1; register_netisr(NETISR_IP, ipintr); } /* * XXX watch out this one. It is perhaps used as a cache for * the most recently used route ? it is cleared in in_addroute() * when a new route is successfully created. */ struct route ipforward_rt; /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip; struct ipq *fp; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; int i, hlen, checkif; u_short sum; struct in_addr pkt_dst; u_int32_t divert_info = 0; /* packet divert/tee info */ struct ip_fw_args args; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m0; int rv; #endif /* PFIL_HOOKS */ args.eh = NULL; args.oif = NULL; args.rule = NULL; args.divert_rule = 0; /* divert cookie */ args.next_hop = NULL; /* Grab info from MT_TAG mbufs prepended to the chain. */ for (; m && m->m_type == MT_TAG; m = m->m_next) { - switch(m->m_tag_id) { + switch(m->_m_tag_id) { default: printf("ip_input: unrecognised MT_TAG tag %d\n", - m->m_tag_id); + m->_m_tag_id); break; case PACKET_TAG_DUMMYNET: args.rule = ((struct dn_pkt *)m)->rule; break; case PACKET_TAG_DIVERT: args.divert_rule = (intptr_t)m->m_hdr.mh_data & 0xffff; break; case PACKET_TAG_IPFORWARD: args.next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; break; } } KASSERT(m != NULL && (m->m_flags & M_PKTHDR) != 0, ("ip_input: no HDR")); if (args.rule) { /* dummynet already filtered us */ ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; goto iphack ; } ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; return; } ip = mtod(m, struct ip *); if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { ipstat.ips_badvers++; goto bad; } hlen = IP_VHL_HL(ip->ip_vhl) << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ ipstat.ips_badhlen++; goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == 0) { ipstat.ips_badhlen++; return; } ip = mtod(m, struct ip *); } /* 127/8 must not appear on wire - RFC1122 */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { ipstat.ips_badsum++; goto bad; } /* * Convert fields to host representation. 
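The entry checks in ip_input() above reduce to a fixed sequence over the raw header: the version nibble must be 4, the header-length nibble must describe at least 20 bytes that are actually present, and the header checksum must verify. A flat-buffer sketch of that sequence (ipv4_check_header is an illustrative name; rfc1071_cksum is the checksum sketch shown earlier in this section):

#include <stddef.h>
#include <stdint.h>

static uint16_t rfc1071_cksum(const void *, size_t);	/* sketch above */

/* Returns the header length in bytes, or -1 for a header ip_input()
 * would count as ips_badvers, ips_badhlen, or ips_badsum. */
static int
ipv4_check_header(const uint8_t *pkt, size_t caplen)
{
	unsigned hlen;

	if (caplen < 20)
		return -1;			/* truncated */
	if ((pkt[0] >> 4) != 4)
		return -1;			/* IP_VHL_V != IPVERSION */
	hlen = (unsigned)(pkt[0] & 0x0f) << 2;	/* IP_VHL_HL << 2 */
	if (hlen < 20 || hlen > caplen)
		return -1;			/* bad or missing options */
	if (rfc1071_cksum(pkt, hlen) != 0)
		return -1;			/* ips_badsum */
	return (int)hlen;
}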
*/ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { ipstat.ips_badlen++; goto bad; } ip->ip_off = ntohs(ip->ip_off); /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: ipstat.ips_tooshort++; goto bad; } if (m->m_pkthdr.len > ip->ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip->ip_len; m->m_pkthdr.len = ip->ip_len; } else m_adj(m, ip->ip_len - m->m_pkthdr.len); } #ifdef IPSEC if (ipsec_gethist(m, NULL)) goto pass; #endif /* * IpHack's section. * Right now when no processing on packet has done * and it is still fresh out of network we do our black * deals with it. * - Firewall: deny/allow/divert * - Xlate: translate packet's addr/port (NAT). * - Pipe: pass pkt through dummynet. * - Wrap: fake packet's addr/port * - Encapsulate: put it in another IP and send out. */ iphack: #ifdef PFIL_HOOKS /* * Run through list of hooks for input packets. If there are any * filters which require that additional packets in the flow are * not fast-forwarded, they must clear the M_CANFASTFWD flag. * Note that filters must _never_ set this flag, as another filter * in the list may have previously cleared it. */ m0 = m; pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, hlen, m->m_pkthdr.rcvif, 0, &m0); if (rv) return; m = m0; if (m == NULL) return; ip = mtod(m, struct ip *); } #endif /* PFIL_HOOKS */ if (fw_enable && IPFW_LOADED) { /* * If we've been forwarded from the output side, then * skip the firewall a second time */ if (args.next_hop) goto ours; args.m = m; i = ip_fw_chk_ptr(&args); m = args.m; if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ if (m) m_freem(m); return; } ip = mtod(m, struct ip *); /* just in case m changed */ if (i == 0 && args.next_hop == NULL) /* common case */ goto pass; if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { /* Send packet to the appropriate pipe */ ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args); return; } #ifdef IPDIVERT if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { /* Divert or tee packet */ divert_info = i; goto ours; } #endif if (i == 0 && args.next_hop != NULL) goto pass; /* * if we get here, the packet must be dropped */ m_freem(m); return; } pass: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ ip_nhops = 0; /* for source routed packets */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ pkt_dst = args.next_hop ? 
args.next_hop->sin_addr : ip->ip_dst; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. */ checkif = ip_checkinterface && (ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && (args.next_hop == NULL); /* * Check for exact addresses in the hash bucket. */ LIST_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) goto ours; } /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == pkt_dst.s_addr) goto ours; if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) goto ours; #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) goto ours; #endif } } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; if (ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { ipstat.ips_cantforward++; m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; ipstat.ips_forward++; } /* * See if we belong to the destination multicast group on the * arrival interface. */ IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); if (inm == NULL) { ipstat.ips_notmember++; m_freem(m); return; } goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * FAITH(Firewall Aided Internet Translator) */ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { if (ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (ipforwarding == 0) { ipstat.ips_cantforward++; m_freem(m); } else { #ifdef IPSEC /* * Enforce inbound IPsec SPD. */ if (ipsec4_in_reject(m, NULL)) { ipsecstat.in_polvio++; goto bad; } #endif /* IPSEC */ ip_forward(m, 0, args.next_hop); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. 
*/ if (ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1, args.next_hop)) return; #endif /* IPSTEALTH */ /* Count the packet in the ip address stats */ if (ia != NULL) { ia->ia_ifa.if_ipackets++; ia->ia_ifa.if_ibytes += m->m_pkthdr.len; } /* * If offset or IP_MF are set, must reassemble. * Otherwise, nothing need be done. * (We could look in the reassembly queue to see * if the packet was previously fragmented, * but it's not worth the time; just let them time out.) */ if (ip->ip_off & (IP_MF | IP_OFFMASK)) { sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, &ipq[sum], ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_fragment_match(m, fp) && #endif ip->ip_p == fp->ipq_p) goto found; fp = 0; /* check if there's a place for the new queue */ if (nipq > maxnipq) { /* * drop something from the tail of the current queue * before proceeding further */ struct ipq *q = TAILQ_LAST(&ipq[sum], ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); if (r) { ip_freef(&ipq[i], r); break; } } } else ip_freef(&ipq[sum], q); } found: /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len -= hlen; if (ip->ip_off & IP_MF) { /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { ipstat.ips_toosmall++; /* XXX */ goto bad; } m->m_flags |= M_FRAG; } else m->m_flags &= ~M_FRAG; ip->ip_off <<= 3; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf, and update * the divert info in divert_info and args.divert_rule. */ ipstat.ips_fragments++; m->m_pkthdr.header = ip; m = ip_reass(m, &ipq[sum], fp, &divert_info, &args.divert_rule); if (m == 0) return; ipstat.ips_reassembled++; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = IP_VHL_HL(ip->ip_vhl) << 2; #ifdef IPDIVERT /* Restore original checksum before diverting packet */ if (divert_info != 0) { ip->ip_len += hlen; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); ip->ip_off = ntohs(ip->ip_off); ip->ip_len = ntohs(ip->ip_len); ip->ip_len -= hlen; } #endif } else ip->ip_len -= hlen; #ifdef IPDIVERT /* * Divert or tee packet to the divert protocol if required. */ if (divert_info != 0) { struct mbuf *clone = NULL; /* Clone packet if we're doing a 'tee' */ if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0) clone = m_dup(m, M_DONTWAIT); /* Restore packet header fields to original values */ ip->ip_len += hlen; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* Deliver packet to divert input routine */ divert_packet(m, 1, divert_info & 0xffff, args.divert_rule); ipstat.ips_delivered++; /* If 'tee', continue with original packet */ if (clone == NULL) return; m = clone; ip = mtod(m, struct ip *); ip->ip_len += hlen; /* * Jump backwards to complete processing of the * packet. But first clear divert_info to avoid * entering this block again. * We do not need to clear args.divert_rule * or args.next_hop as they will not be used. */ divert_info = 0; goto pass; } #endif #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. 
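Before a fragment is queued above, its length is sanity-checked: any fragment with IP_MF set must carry a non-zero payload that is a multiple of 8 bytes, because the offset field counts 8-byte units — hence the ip_off <<= 3 that follows. The check in isolation (constants mirror IP_MF and IP_OFFMASK from netinet/ip.h; names are suffixed to keep this sketch standalone):

#include <stdint.h>

#define IP_MF_FLAG   0x2000	/* more fragments (IP_MF) */
#define IP_OFFMASK_  0x1FFF	/* fragment offset, 8-byte units */

/*
 * Validate fragment fields (host byte order; `payload_len' already
 * excludes the header, as after ip_input()'s ip_len -= hlen).
 * Returns the payload's byte offset, or -1 for a bogus fragment.
 */
static int
frag_byte_offset(uint16_t ip_off, uint16_t payload_len)
{
	int more = (ip_off & IP_MF_FLAG) != 0;
	int off8 = ip_off & IP_OFFMASK_;

	if (more && (payload_len == 0 || (payload_len & 0x7) != 0))
		return -1;		/* ips_toosmall: bad MF fragment */
	return off8 << 3;		/* cf. ip_off <<= 3 */
}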
* note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 && ipsec4_in_reject(m, NULL)) { ipsecstat.in_polvio++; goto bad; } #endif /* * Switch out to protocol's input routine. */ ipstat.ips_delivered++; if (args.next_hop && ip->ip_p == IPPROTO_TCP) { /* TCP needs IPFORWARD info if available */ struct m_hdr tag; tag.mh_type = MT_TAG; tag.mh_flags = PACKET_TAG_IPFORWARD; tag.mh_data = (caddr_t)args.next_hop; tag.mh_next = m; (*inetsw[ip_protox[ip->ip_p]].pr_input)( (struct mbuf *)&tag, hlen); } else (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); return; bad: m_freem(m); } /* * IP software interrupt routine - to go away sometime soon */ static void ipintr(void) { struct mbuf *m; while (1) { IF_DEQUEUE(&ipintrq, m); if (m == 0) return; ip_input(m); } } /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If a chain for reassembly of this datagram already * exists, then it is given as fp; otherwise have to make a chain. * * When IPDIVERT enabled, keep additional state with each packet that * tells us if we need to divert or tee the packet we're building. * In particular, *divinfo includes the port and TEE flag, * *divert_rule is the number of the matching rule. */ static struct mbuf * ip_reass(struct mbuf *m, struct ipqhead *head, struct ipq *fp, u_int32_t *divinfo, u_int16_t *divert_rule) { struct ip *ip = mtod(m, struct ip *); register struct mbuf *p, *q, *nq; struct mbuf *t; int hlen = IP_VHL_HL(ip->ip_vhl) << 2; int i, next; /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == 0) { /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly; * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ if ((ip_maxfragpackets >= 0) && (ip_nfragpackets >= ip_maxfragpackets)) goto dropfrag; ip_nfragpackets++; if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) goto dropfrag; fp = mtod(t, struct ipq *); #ifdef MAC mac_init_ipq(fp); mac_create_ipq(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); nipq++; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; m->m_nextpkt = NULL; #ifdef IPDIVERT fp->ipq_div_info = 0; fp->ipq_div_cookie = 0; #endif goto inserted; } else { #ifdef MAC mac_update_ipq(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (GETIP(q)->ip_off > ip->ip_off) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the the preceding * segment, then it's checksum is invalidated. */ if (p) { i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; if (i > 0) { if (i >= ip->ip_len) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off += i; ip->ip_len -= i; } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. 
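The trim against the preceding fragment above works purely on (offset, length) pairs; the mbuf surgery (m_adj() and the checksum-flag reset) hangs off a three-way decision. A sketch of that decision alone, with illustrative names — drop the new fragment, keep it untouched, or advance it past the bytes the earlier fragment already supplied:

/* Outcome of fitting a new fragment after an existing one. */
enum frag_fit { FIT_KEEP, FIT_TRIMMED, FIT_DROP };

/*
 * `prev_off'/`prev_len' describe the fragment that begins before the
 * new one; `*off'/`*len' describe the new fragment and are adjusted
 * in place, mirroring ip_reass()'s m_adj() plus ip_off/ip_len update.
 */
static enum frag_fit
trim_against_prev(int prev_off, int prev_len, int *off, int *len)
{
	int i = prev_off + prev_len - *off;	/* bytes of overlap */

	if (i <= 0)
		return FIT_KEEP;		/* no overlap at all */
	if (i >= *len)
		return FIT_DROP;		/* fully covered: dropfrag */
	*off += i;				/* shed duplicated bytes */
	*len -= i;
	return FIT_TRIMMED;
}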
*/ for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; q = nq) { i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; if (i < GETIP(q)->ip_len) { GETIP(q)->ip_len -= i; GETIP(q)->ip_off += i; m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; m_freem(q); } inserted: #ifdef IPDIVERT /* * Transfer firewall instructions to the fragment structure. * Only trust info in the fragment at offset 0. */ if (ip->ip_off == 0) { fp->ipq_div_info = *divinfo; fp->ipq_div_cookie = *divert_rule; } *divinfo = 0; *divert_rule = 0; #endif /* * Check for complete reassembly. */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) return (0); next += GETIP(q)->ip_len; } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) return (0); /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) { ipstat.ips_toolong++; ip_freef(head, fp); return (0); } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = 0; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = 0; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_cat(m, q); } #ifdef MAC mac_create_datagram_from_ipq(fp, m); mac_destroy_ipq(fp); #endif #ifdef IPDIVERT /* * Extract firewall instructions from the fragment structure. */ *divinfo = fp->ipq_div_info; *divert_rule = fp->ipq_div_cookie; #endif /* * Create header for new ip packet by * modifying header of first packet; * dequeue and discard fragment reassembly header. * Make header visible. */ ip->ip_len = next; ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); nipq--; (void) m_free(dtom(fp)); ip_nfragpackets--; m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2); m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); return (m); dropfrag: #ifdef IPDIVERT *divinfo = 0; *divert_rule = 0; #endif ipstat.ips_fragdropped++; m_freem(m); return (0); #undef GETIP } /* * Free a fragment reassembly header and all * associated datagrams. */ static void ip_freef(fhp, fp) struct ipqhead *fhp; struct ipq *fp; { register struct mbuf *q; while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); (void) m_free(dtom(fp)); ip_nfragpackets--; nipq--; } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo() { register struct ipq *fp; int s = splnet(); int i; for (i = 0; i < IPREASS_NHASH; i++) { for(fp = TAILQ_FIRST(&ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { ipstat.ips_fragtimeout++; ip_freef(&ipq[i], fpp); } } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ for (i = 0; i < IPREASS_NHASH; i++) { if (ip_maxfragpackets >= 0) { while (ip_nfragpackets > ip_maxfragpackets && !TAILQ_EMPTY(&ipq[i])) { ipstat.ips_fragdropped++; ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); } } } ipflow_slowtimo(); splx(s); } /* * Drain off all datagram fragments. 
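Reassembly above is declared complete only when the sorted fragment list has no holes and the final fragment arrived without IP_MF. Over plain (offset, length, more-fragments) records, the same walk looks like this (struct and function names are illustrative):

#include <stddef.h>

struct frag_rec {
	int off;	/* payload byte offset */
	int len;	/* payload bytes */
	int mf;		/* IP_MF was set (M_FRAG on the mbuf) */
};

/* Returns total datagram payload length if complete, else -1. */
static int
reass_complete(const struct frag_rec *f, size_t n)
{
	int next = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (f[i].off != next)	/* hole before this fragment */
			return -1;
		next += f[i].len;
	}
	if (n == 0 || f[n - 1].mf)	/* last piece must end the datagram */
		return -1;
	return next;
}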
*/ void ip_drain() { int i; for (i = 0; i < IPREASS_NHASH; i++) { while(!TAILQ_EMPTY(&ipq[i])) { ipstat.ips_fragdropped++; ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); } } in_rtqdrain(); } /* * Do option processing on a datagram, * possibly discarding it if bad options are encountered, * or forwarding it if source-routed. * The pass argument is used when operating in the IPSTEALTH * mode to tell what options to process: * [LS]SRR (pass 0) or the others (pass 1). * The reason for as many as two passes is that when doing IPSTEALTH, * non-routing options should be processed only if the packet is for us. * Returns 1 if packet has been forwarded/freed, * 0 if the packet should be processed further. */ static int ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; n_time ntime; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. * Find interface with current destination address. * If none on this machine then drop if strictly routed, * or do nothing if loosely routed. * Record interface address and bring up next address * component. If strictly routed make sure next * address is on directly accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; ia = (struct in_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&ipaddr); if (ia == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!ip_acceptsourceroute) goto nosourcerouting; save_rte(cp, ip->ip_src); break; } #ifdef IPSTEALTH if (ipstealth) goto dropit; #endif if (!ip_dosourceroute) { if (ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* * Acting as a router, so generate ICMP */ nosourcerouting: strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_WARNING, "attempted source route from %s to %s\n", inet_ntoa(ip->ip_src), buf); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so silently drop. 
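The option loop at the top of ip_dooptions() above (and the nearly identical one in icmp_reflect()) follows a single pattern: EOL terminates the list, NOP is one byte, and every other option carries a length octet that must stay within the remaining option space. A standalone walker with the same bounds checks, which answer ICMP_PARAMPROB in the kernel (the callback is an illustrative addition):

#include <stdint.h>

#define OPT_EOL 0			/* IPOPT_EOL */
#define OPT_NOP 1			/* IPOPT_NOP */

/*
 * Walk IPv4 options, calling cb(type, bytes, len) per option.
 * Returns 0 for a well-formed list, -1 where ip_dooptions()
 * would generate an ICMP parameter problem.
 */
static int
walk_ip_options(const uint8_t *cp, int cnt,
    void (*cb)(int, const uint8_t *, int))
{
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == OPT_EOL)
			break;
		if (opt == OPT_NOP) {
			optlen = 1;
			continue;
		}
		if (cnt < 2)			/* no room for length octet */
			return -1;
		optlen = cp[1];
		if (optlen < 2 || optlen > cnt)	/* runs past option space */
			return -1;
		cb(opt, cp, optlen);
	}
	return 0;
}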
*/ #ifdef IPSTEALTH dropit: #endif ipstat.ips_cantforward++; m_freem(m); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) ia = (INA)ifa_ifwithnet((SA)&ipaddr); } else ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt); if (ia == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } ip->ip_dst = ipaddr.sin_addr; (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * locate outgoing interface; if we're the destination, * use the incoming interface (should be same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && (ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt)) == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(n_time) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == 0) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(n_time) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(n_time)); cp[IPOPT_OFFSET] += sizeof(n_time); } } if (forward && ipforwarding) { ip_forward(m, 1, next_hop); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); ipstat.ips_badoptions++; return (1); } /* * Given address of next destination (final or next hop), * return internet address info of interface to be used to get there. 
*/ struct in_ifaddr * ip_rtaddr(dst, rt) struct in_addr dst; struct route *rt; { register struct sockaddr_in *sin; sin = (struct sockaddr_in *)&rt->ro_dst; if (rt->ro_rt == 0 || !(rt->ro_rt->rt_flags & RTF_UP) || dst.s_addr != sin->sin_addr.s_addr) { if (rt->ro_rt) { RTFREE(rt->ro_rt); rt->ro_rt = 0; } sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; rtalloc_ign(rt, RTF_PRCLONING); } if (rt->ro_rt == 0) return ((struct in_ifaddr *)0); return (ifatoia(rt->ro_rt->rt_ifa)); } /* * Save incoming source route for use in replies, * to be picked up later by ip_srcroute if the receiver is interested. */ static void save_rte(option, dst) u_char *option; struct in_addr dst; { unsigned olen; olen = option[IPOPT_OLEN]; #ifdef DIAGNOSTIC if (ipprintfs) printf("save_rte: olen %d\n", olen); #endif if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) return; bcopy(option, ip_srcrt.srcopt, olen); ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); ip_srcrt.dst = dst; } /* * Retrieve incoming source route for use in replies, * in the same form used by setsockopt. * The first hop is placed before the options, will be removed later. */ struct mbuf * ip_srcroute() { register struct in_addr *p, *q; register struct mbuf *m; if (ip_nhops == 0) return ((struct mbuf *)0); m = m_get(M_DONTWAIT, MT_HEADER); if (m == 0) return ((struct mbuf *)0); #define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; #ifdef DIAGNOSTIC if (ipprintfs) printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); #endif /* * First save first hop for return route */ p = &ip_srcrt.route[ip_nhops - 1]; *(mtod(m, struct in_addr *)) = *p--; #ifdef DIAGNOSTIC if (ipprintfs) printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr)); #endif /* * Copy option fields and padding (nop) to mbuf. */ ip_srcrt.nop = IPOPT_NOP; ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &ip_srcrt.nop, OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, * reversing the path (pointers are now aligned). */ while (p >= ip_srcrt.route) { #ifdef DIAGNOSTIC if (ipprintfs) printf(" %lx", (u_long)ntohl(q->s_addr)); #endif *q++ = *p--; } /* * Last hop goes to final destination. */ *q = ip_srcrt.dst; #ifdef DIAGNOSTIC if (ipprintfs) printf(" %lx\n", (u_long)ntohl(q->s_addr)); #endif return (m); } /* * Strip out IP options, at higher * level protocol in the kernel. * Second argument is buffer to which options * will be moved, and return value is their length. * XXX should be deleted; last arg currently ignored. */ void ip_stripoptions(m, mopt) register struct mbuf *m; struct mbuf *mopt; { register int i; struct ip *ip = mtod(m, struct ip *); register caddr_t opts; int olen; olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); opts = (caddr_t)(ip + 1); i = m->m_len - (sizeof (struct ip) + olen); bcopy(opts + olen, opts, (unsigned)i); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. 
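ip_srcroute() above turns the recorded incoming route into the option a reply needs by walking the saved addresses backwards and finishing with the saved destination; the first recorded hop becomes the reply's last. The pointer dance reduces to an array reversal — a sketch with illustrative names:

#include <stddef.h>
#include <stdint.h>

/*
 * Build a reply route from a recorded source route: the hops in
 * arrival order plus the original destination, reversed for the
 * return path.  out[] receives nhops+1 addresses; out[0] is the
 * reply's first hop, out[nhops] the final destination.
 */
static void
reverse_srcroute(const uint32_t *hops, size_t nhops, uint32_t dst,
    uint32_t *out)
{
	size_t i;

	for (i = 0; i < nhops; i++)	/* last recorded hop goes first */
		out[i] = hops[nhops - 1 - i];
	out[nhops] = dst;		/* cf. ip_srcrt.dst */
}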
If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ static void ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) { struct ip *ip = mtod(m, struct ip *); struct rtentry *rt; int error, type = 0, code = 0; struct mbuf *mcopy; n_long dest; struct in_addr pkt_dst; struct ifnet *destifp; #ifdef IPSEC struct ifnet dummyifp; #endif dest = 0; /* * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst; #ifdef DIAGNOSTIC if (ipprintfs) printf("forward: src %lx dst %lx ttl %x\n", (u_long)ip->ip_src.s_addr, (u_long)pkt_dst.s_addr, ip->ip_ttl); #endif if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(pkt_dst) == 0) { ipstat.ips_cantforward++; m_freem(m); return; } #ifdef IPSTEALTH if (!ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); return; } #ifdef IPSTEALTH } #endif if (ip_rtaddr(pkt_dst, &ipforward_rt) == 0) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); return; } else rt = ipforward_rt.ro_rt; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ MGET(mcopy, M_DONTWAIT, m->m_type); if (mcopy != NULL) { M_COPY_PKTHDR(mcopy, m); mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8, (int)ip->ip_len); m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); #ifdef MAC /* * XXXMAC: This will eventually become an explicit * labeling point. */ mac_create_mbuf_from_mbuf(m, mcopy); #endif } #ifdef IPSTEALTH if (!ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. 
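The copy made above for a possible ICMP error keeps only the offending IP header plus the first 8 payload bytes — the minimum RFC 792 requires, and enough to recover the transport's port numbers (and the start of a TCP sequence number) at the sender. The length computation on its own, as a sketch:

/* Bytes of the offending datagram to quote in an ICMP error:
 * full IP header plus 8 data bytes, clipped to the datagram size
 * (cf. the imin() setting mcopy->m_len above). */
static int
icmp_quote_len(int ip_hlen, int ip_total_len)
{
	int want = ip_hlen + 8;

	return want < ip_total_len ? want : ip_total_len;
}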
*/ if (rt->rt_ifp == m->m_pkthdr.rcvif && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0 && ipsendredirects && !srcrt && !next_hop) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest = pkt_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; #ifdef DIAGNOSTIC if (ipprintfs) printf("redirect (%d) to %lx\n", code, (u_long)dest); #endif } } { struct m_hdr tag; if (next_hop) { /* Pass IPFORWARD info if available */ tag.mh_type = MT_TAG; tag.mh_flags = PACKET_TAG_IPFORWARD; tag.mh_data = (caddr_t)next_hop; tag.mh_next = m; m = (struct mbuf *)&tag; } error = ip_output(m, (struct mbuf *)0, &ipforward_rt, - IP_FORWARDING, 0); + IP_FORWARDING, 0, NULL); } if (error) ipstat.ips_cantforward++; else { ipstat.ips_forward++; if (type) ipstat.ips_redirectsent++; else { if (mcopy) { ipflow_create(&ipforward_rt, mcopy); m_freem(mcopy); } return; } } if (mcopy == NULL) return; destifp = NULL; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; -#ifndef IPSEC - if (ipforward_rt.ro_rt) - destifp = ipforward_rt.ro_rt->rt_ifp; -#else +#ifdef IPSEC /* * If the packet is routed over IPsec tunnel, tell the * originator the tunnel MTU. * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * XXX quickhack!!! */ if (ipforward_rt.ro_rt) { struct secpolicy *sp = NULL; int ipsecerror; int ipsechdr; struct route *ro; sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp == NULL) destifp = ipforward_rt.ro_rt->rt_ifp; else { /* count IPsec header size */ ipsechdr = ipsec4_hdrsiz(mcopy, IPSEC_DIR_OUTBOUND, NULL); /* * find the correct route for outer IPv4 * header, compute tunnel MTU. * * XXX BUG ALERT * The "dummyifp" code relies upon the fact * that icmp_error() touches only ifp->if_mtu. */ /*XXX*/ destifp = NULL; if (sp->req != NULL && sp->req->sav != NULL && sp->req->sav->sah != NULL) { ro = &sp->req->sav->sah->sa_route; if (ro->ro_rt && ro->ro_rt->rt_ifp) { dummyifp.if_mtu = ro->ro_rt->rt_ifp->if_mtu; dummyifp.if_mtu -= ipsechdr; destifp = &dummyifp; } } key_freesp(sp); } } +#else + if (ipforward_rt.ro_rt) + destifp = ipforward_rt.ro_rt->rt_ifp; #endif /*IPSEC*/ ipstat.ips_cantfrag++; break; case ENOBUFS: type = ICMP_SOURCEQUENCH; code = 0; break; case EACCES: /* ipfw denied packet */ m_freem(mcopy); return; } icmp_error(mcopy, type, code, dest, destifp); } void ip_savecontrol(inp, mp, ip, m) register struct inpcb *inp; register struct mbuf **mp; register struct ip *ip; register struct mbuf *m; { if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. 
*/ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t) opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t) ip_srcroute(), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if (((ifp = m->m_pkthdr.rcvif)) && ( ifp->if_index && (ifp->if_index <= if_index))) { sdp = (struct sockaddr_dl *) (ifaddr_byindex(ifp->if_index)->ifa_addr); /* * Change our mind and don't try copy. */ if ((sdp->sdl_family != AF_LINK) || (sdp->sdl_len > sizeof(sdlbuf))) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } /* * XXX these routines are called from the upper part of the kernel. * They need to be locked when we remove Giant. * * They could also be moved to ip_mroute.c, since all the RSVP * handling is done there already. */ static int ip_rsvp_on; struct socket *ip_rsvpd; int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (ip_rsvpd != NULL) return EADDRINUSE; ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!ip_rsvp_on) { ip_rsvp_on = 1; rsvp_on++; } return 0; } int ip_rsvp_done(void) { ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (ip_rsvp_on) { ip_rsvp_on = 0; rsvp_on--; } return 0; } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 105193) +++ head/sys/netinet/ip_mroute.c (revision 105194) @@ -1,2257 +1,2257 @@ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * * MROUTING Revision: 3.5 * $FreeBSD$ */ #include "opt_mrouting.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef MROUTING extern u_long _ip_mcast_src(int vifi); extern int _ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); extern int _ip_mrouter_done(void); extern int _ip_mrouter_get(struct socket *so, struct sockopt *sopt); extern int _ip_mrouter_set(struct socket *so, struct sockopt *sopt); extern int _mrt_ioctl(int req, caddr_t data); /* * Dummy routines and globals used when multicast routing is not compiled in. 
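The control mbufs ip_savecontrol() builds above surface in userland as ancillary data on recvmsg(). A sketch of the consumer side for IP_RECVDSTADDR on a UDP socket — a standard BSD sockets idiom rather than anything specific to this commit; error handling is omitted for brevity:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

/* Receive one datagram and return the destination address the
 * kernel recorded for it (0.0.0.0 if none was delivered). */
static struct in_addr
recv_with_dstaddr(int s, char *buf, size_t buflen)
{
	struct in_addr dst = { 0 };
	union {				/* aligned cmsg buffer */
		char buf[CMSG_SPACE(sizeof(struct in_addr))];
		struct cmsghdr align;
	} cbuf;
	struct iovec iov = { buf, buflen };
	struct msghdr msg;
	struct cmsghdr *cm;
	int on = 1;

	setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR, &on, sizeof(on));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf.buf;
	msg.msg_controllen = sizeof(cbuf.buf);
	recvmsg(s, &msg, 0);
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == IPPROTO_IP &&
		    cm->cmsg_type == IP_RECVDSTADDR)
			memcpy(&dst, CMSG_DATA(cm), sizeof(dst));
	return dst;
}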
*/ struct socket *ip_mrouter = NULL; u_int rsvpdebug = 0; int _ip_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { return(EOPNOTSUPP); } int (*ip_mrouter_set)(struct socket *, struct sockopt *) = _ip_mrouter_set; int _ip_mrouter_get(so, sopt) struct socket *so; struct sockopt *sopt; { return(EOPNOTSUPP); } int (*ip_mrouter_get)(struct socket *, struct sockopt *) = _ip_mrouter_get; int _ip_mrouter_done() { return(0); } int (*ip_mrouter_done)(void) = _ip_mrouter_done; int _ip_mforward(ip, ifp, m, imo) struct ip *ip; struct ifnet *ifp; struct mbuf *m; struct ip_moptions *imo; { return(0); } int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *) = _ip_mforward; int _mrt_ioctl(int req, caddr_t data) { return EOPNOTSUPP; } int (*mrt_ioctl)(int, caddr_t) = _mrt_ioctl; void rsvp_input(m, off) /* XXX must fixup manually */ struct mbuf *m; int off; { /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); return; } /* Drop the packet */ m_freem(m); } int (*legal_vif_num)(int) = 0; /* * This should never be called, since IP_MULTICAST_VIF should fail, but * just in case it does get called, the code a little lower in ip_output * will assign the packet a local address. */ u_long _ip_mcast_src(int vifi) { return INADDR_ANY; } u_long (*ip_mcast_src)(int) = _ip_mcast_src; int ip_rsvp_vif_init(so, sopt) struct socket *so; struct sockopt *sopt; { return(EINVAL); } int ip_rsvp_vif_done(so, sopt) struct socket *so; struct sockopt *sopt; { return(EINVAL); } void ip_rsvp_force_done(so) struct socket *so; { return; } #else /* MROUTING */ #define M_HASCL(m) ((m)->m_flags & M_EXT) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); #ifndef MROUTE_KLD /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter = NULL; #endif #if defined(MROUTING) || defined(MROUTE_KLD) static struct mrtstat mrtstat; SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); #endif static struct mfc *mfctable[MFCTBLSIZ]; static u_char nexpire[MFCTBLSIZ]; static struct vif viftable[MAXVIFS]; static u_int mrtdebug = 0; /* debug level */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 static u_int tbfdebug = 0; /* tbf debug level */ static u_int rsvpdebug = 0; /* rsvp debug level */ static struct callout_handle expire_upcalls_ch; #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Define the token bucket filter structures * tbftable -> each vif has one of these for storing info */ static struct tbf tbftable[MAXVIFS]; #define TBF_REPROCESS (hz / 100) /* 100x / second */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. 
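 *
 * (Concretely: add_vif() points a tunnel vif's v_ifp at one of these
 * fake ifnets, and mroute_encap_input() stamps that same fake ifnet
 * into m_pkthdr.rcvif on decapsulated packets, so the parent-vif
 * check in ip_mdq() -- viftable[vifi].v_ifp != ifp -- works for
 * tunnels exactly as it does for physical interfaces.)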
*/ static struct ifnet multicast_decap_if[MAXVIFS]; #define ENCAP_TTL 64 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */ /* prototype IP hdr for encapsulated packets */ static struct ip multicast_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, ENCAP_PROTO, 0, /* checksum */ }; /* * Private variables. */ static vifi_t numvifs = 0; static const struct encaptab *encap_cookie = NULL; /* * one-back cache used by mroute_encapcheck to locate a tunnel's vif * given a datagram's src ip address. */ static u_long last_encap_src; static struct vif *last_encap_vif; static u_long X_ip_mcast_src(int vifi); static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); static int X_mrt_ioctl(int cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static int ip_mrouter_init(struct socket *, int); static int add_vif(struct vifctl *); static int del_vif(vifi_t); static int add_mfc(struct mfcctl *); static int del_mfc(struct mfcctl *); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static int set_assert(int); static void expire_upcalls(void *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static void encap_send(struct ip *, struct vif *, struct mbuf *); static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); static void tbf_queue(struct vif *, struct mbuf *); static void tbf_process_q(struct vif *); static void tbf_reprocess_q(void *); static int tbf_dq_sel(struct vif *, struct ip *); static void tbf_send_packet(struct vif *, struct mbuf *); static void tbf_update_tokens(struct vif *); static int priority(struct vif *, struct ip *); /* * whether or not special PIM assert processing is enabled. */ static int pim_assert; /* * Rate limit for assert notification messages, in usec */ #define ASSERT_MSG_TIME 3000000 /* * Hash function for a source, group entry */ #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ ((g) >> 20) ^ ((g) >> 10) ^ (g)) /* * Find a route for a given origin IP address and Multicast group address * Type of service parameter to be added in the future!!! 
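 *
 * Typical use (sketch, mirroring X_ip_mforward() below):
 *
 *	struct mfc *rt;
 *	s = splnet();
 *	MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt);
 *	if (rt != NULL)
 *		... entry exists, forward via ip_mdq() ...
 *	else
 *		... queue the packet, send an IGMPMSG_NOCACHE upcall ...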
*/ #define MFCFIND(o, g, rt) { \ register struct mfc *_rt = mfctable[MFCHASH(o,g)]; \ rt = NULL; \ ++mrtstat.mrts_mfc_lookups; \ while (_rt) { \ if ((_rt->mfc_origin.s_addr == o) && \ (_rt->mfc_mcastgrp.s_addr == g) && \ (_rt->mfc_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mfc_next; \ } \ if (rt == NULL) { \ ++mrtstat.mrts_mfc_misses; \ } \ } /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) { \ register int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING u_long upcall_data[51]; static void collate(struct timeval *); #endif /* UPCALL_TIMING */ /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ static int X_ip_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl mfc; if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) return (EPERM); error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc); if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; default: error = EOPNOTSUPP; break; } return (error); } #ifndef MROUTE_KLD int (*ip_mrouter_set)(struct socket *, struct sockopt *) = X_ip_mrouter_set; #endif /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(so, sopt) struct socket *so; struct sockopt *sopt; { int error; static int version = 0x0305; /* !!! why is this here? 
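 * (0x0305 is presumably MROUTING revision 3.5, per the header above,
 * encoded as major/minor bytes for the MRT_VERSION sockopt handled
 * below; a shared header file would be a better home for it.)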
XXX */ switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &version, sizeof version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); break; default: error = EOPNOTSUPP; break; } return (error); } #ifndef MROUTE_KLD int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; #endif /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(cmd, data) int cmd; caddr_t data; { int error = 0; switch (cmd) { case (SIOCGETVIFCNT): return (get_vif_cnt((struct sioc_vif_req *)data)); break; case (SIOCGETSGCNT): return (get_sg_cnt((struct sioc_sg_req *)data)); break; default: return (EINVAL); break; } return error; } #ifndef MROUTE_KLD int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; #endif /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(req) register struct sioc_sg_req *req; { register struct mfc *rt; int s; s = splnet(); MFCFIND(req->src.s_addr, req->grp.s_addr, rt); splx(s); if (rt != NULL) { req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; } else req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(req) register struct sioc_vif_req *req; { register vifi_t vifi = req->vifi; if (vifi >= numvifs) return EINVAL; req->icount = viftable[vifi].v_pkt_in; req->ocount = viftable[vifi].v_pkt_out; req->ibytes = viftable[vifi].v_bytes_in; req->obytes = viftable[vifi].v_bytes_out; return 0; } /* * Enable multicast routing */ static int ip_mrouter_init(so, version) struct socket *so; int version; { if (mrtdebug) log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; if (ip_mrouter != NULL) return EADDRINUSE; ip_mrouter = so; bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); pim_assert = 0; expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init\n"); return 0; } /* * Disable multicast routing */ static int X_ip_mrouter_done() { vifi_t vifi; int i; struct ifnet *ifp; struct ifreq ifr; struct mfc *rt; struct rtdetq *rte; int s; s = splnet(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_lcl_addr.s_addr != 0 && !(viftable[vifi].v_flags & VIFF_TUNNEL)) { ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; ifp = viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)tbftable, sizeof(tbftable)); bzero((caddr_t)viftable, sizeof(viftable)); numvifs = 0; pim_assert = 0; untimeout(expire_upcalls, (caddr_t)NULL, expire_upcalls_ch); /* * Free all multicast forwarding cache entries. 
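 * Each entry may still hold packets queued on its mfc_stall list for
 * an upcall that was never answered; those mbufs (and the rtdetq
 * records holding them) are freed here as well so nothing leaks.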
*/ for (i = 0; i < MFCTBLSIZ; i++) { for (rt = mfctable[i]; rt != NULL; ) { struct mfc *nr = rt->mfc_next; for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } free(rt, M_MRTABLE); rt = nr; } } bzero((caddr_t)mfctable, sizeof(mfctable)); /* * Reset de-encapsulation cache */ last_encap_src = 0; last_encap_vif = NULL; if (encap_cookie) { encap_detach(encap_cookie); encap_cookie = NULL; } ip_mrouter = NULL; splx(s); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_done\n"); return 0; } #ifndef MROUTE_KLD int (*ip_mrouter_done)(void) = X_ip_mrouter_done; #endif /* * Set PIM assert processing global */ static int set_assert(i) int i; { if ((i != 1) && (i != 0)) return EINVAL; pim_assert = i; return 0; } /* * Decide if a packet is from a tunnelled peer. * Return 0 if not, 64 if so. */ static int mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; register struct vif *vifp; /* * don't claim the packet if it's not to a multicast destination or if * we don't have an encapsulating tunnel with the source. * Note: This code assumes that the remote site IP address * uniquely identifies the tunnel (i.e., that this site has * at most one tunnel with the remote site). */ if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) { return 0; } if (ip->ip_src.s_addr != last_encap_src) { register struct vif *vife; vifp = viftable; vife = vifp + numvifs; last_encap_src = ip->ip_src.s_addr; last_encap_vif = 0; for ( ; vifp < vife; ++vifp) if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL) last_encap_vif = vifp; break; } } if ((vifp = last_encap_vif) == 0) { last_encap_src = 0; return 0; } return 64; } /* * De-encapsulate a packet and feed it back through ip input (this * routine is called whenever IP gets a packet that mroute_encap_func() * claimed). */ static void mroute_encap_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; if (hlen > sizeof(struct ip)) ip_stripoptions(m, (struct mbuf *) 0); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m->m_pkthdr.len -= sizeof(struct ip); m->m_pkthdr.rcvif = last_encap_vif->v_ifp; (void) IF_HANDOFF(&ipintrq, m, NULL); /* * normally we would need a "schednetisr(NETISR_IP)" * here but we were called by ip_input and it is going * to loop back & try to dequeue the packet we just * queued as soon as we return so we avoid the * unnecessary software interrrupt. */ } extern struct domain inetdomain; static struct protosw mroute_encap_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, mroute_encap_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; /* * Add a vif to the vif table */ static int add_vif(vifcp) register struct vifctl *vifcp; { register struct vif *vifp = viftable + vifcp->vifc_vifi; static struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error, s; struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL; if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE; /* Find the interface with an address in AF_INET family */ sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == 0) return EADDRNOTAVAIL; ifp = ifa->ifa_ifp; if (vifcp->vifc_flags & VIFF_TUNNEL) { if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { /* * An encapsulating tunnel is wanted. 
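 * (The encap_attach_func() call below registers, once, with the
 * generic IP-in-IP encapsulation dispatcher: mroute_encapcheck()
 * above scores candidate datagrams, claiming ours with 64, and
 * winning packets are handed to mroute_encap_input().)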
Tell * mroute_encap_input() to start paying attention * to encapsulated packets. */ if (encap_cookie == NULL) { encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, mroute_encapcheck, (struct protosw *)&mroute_encap_protosw, NULL); if (encap_cookie == NULL) { printf("ip_mroute: unable to attach encap\n"); return (EIO); /* XXX */ } for (s = 0; s < MAXVIFS; ++s) { multicast_decap_if[s].if_name = "mdecap"; multicast_decap_if[s].if_unit = s; } } /* * Set interface to fake encapsulator interface */ ifp = &multicast_decap_if[vifcp->vifc_vifi]; /* * Prepare cached route entry */ bzero(&vifp->v_route, sizeof(vifp->v_route)); } else { log(LOG_ERR, "source routed tunnels not supported\n"); return EOPNOTSUPP; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return EOPNOTSUPP; /* Enable promiscuous reception of all IP multicasts from the if */ s = splnet(); error = if_allmulti(ifp, 1); splx(s); if (error) return error; } s = splnet(); /* define parameters for the tbf structure */ vifp->v_tbf = v_tbf; GET_TIME(vifp->v_tbf->tbf_last_pkt_t); vifp->v_tbf->tbf_n_tok = 0; vifp->v_tbf->tbf_q_len = 0; vifp->v_tbf->tbf_max_q_len = MAXQSIZE; vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* scaling up here allows division by 1024 in critical code */ vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; vifp->v_rsvp_on = 0; vifp->v_rsvpd = NULL; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; splx(s); /* Adjust numvifs up if the vifi is higher than numvifs */ if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; if (mrtdebug) log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", vifcp->vifc_vifi, (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), vifcp->vifc_threshold, vifcp->vifc_rate_limit); return 0; } /* * Delete a vif from the vif table */ static int del_vif(vifi) vifi_t vifi; { register struct vif *vifp = &viftable[vifi]; register struct mbuf *m; struct ifnet *ifp; struct ifreq ifr; int s; if (vifi >= numvifs) return EINVAL; if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL; s = splnet(); if (!(vifp->v_flags & VIFF_TUNNEL)) { ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; ifp = vifp->v_ifp; if_allmulti(ifp, 0); } if (vifp == last_encap_vif) { last_encap_vif = 0; last_encap_src = 0; } /* * Free packets queued at the interface */ while (vifp->v_tbf->tbf_q) { m = vifp->v_tbf->tbf_q; vifp->v_tbf->tbf_q = m->m_act; m_freem(m); } bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); bzero((caddr_t)vifp, sizeof (*vifp)); if (mrtdebug) log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); /* Adjust numvifs down */ for (vifi = numvifs; vifi > 0; vifi--) if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break; numvifs = vifi; splx(s); return 0; } /* * Add an mfc entry */ static int add_mfc(mfccp) struct mfcctl *mfccp; { struct mfc *rt; u_long hash; struct rtdetq *rte; register u_short nstl; int s; int i; MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); s = splnet(); rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; splx(s); return 0; } /* * Find the entry for which the upcall was made and update */ s = splnet(); hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", "multiple kernel entries", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; rt->mfc_expire = 0; /* Don't clean this guy up */ nexpire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE); rte = n; } rt->mfc_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), 
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; if (rt->mfc_expire) nexpire[hash]--; rt->mfc_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; rt->mfc_expire = 0; rt->mfc_stall = NULL; /* link into table */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; } } splx(s); return 0; } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(t) register struct timeval *t; { register u_long d; register struct timeval tp; register u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > 50) d = 50; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_mfc(mfccp) struct mfcctl *mfccp; { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; struct mfc **nptr; u_long hash; int s; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); s = splnet(); nptr = &mfctable[hash]; while ((rt = *nptr) != NULL) { if (origin.s_addr == rt->mfc_origin.s_addr && mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && rt->mfc_stall == NULL) break; nptr = &rt->mfc_next; } if (rt == NULL) { splx(s); return EADDRNOTAVAIL; } *nptr = rt->mfc_next; free(rt, M_MRTABLE); splx(s); return 0; } /* * Send a message to mrouted on the multicast routing socket */ static int socket_send(s, mm, src) struct socket *s; struct mbuf *mm; struct sockaddr_in *src; { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return 0; } } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. 
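 *
 * Call pattern from the output side (sketch; compare the multicast
 * branch of ip_output() in ip_output.c below):
 *
 *	if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
 *		if (ip_mforward(ip, ifp, m, imo) != 0) {
 *			m_freem(m);	-- mrouter said drop
 *			goto done;
 *		}
 *	}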
*/ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(ip, ifp, m, imo) register struct ip *ip; struct ifnet *ifp; struct mbuf *m; struct ip_moptions *imo; { register struct mfc *rt; register u_char *ipoptions; static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static int srctun = 0; register struct mbuf *mm; int s; vifi_t vifi; struct vif *vifp; if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), (void *)ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ if ((srctun++ % 1000) == 0) log(LOG_ERR, "ip_mforward: received source-routed packet from %lx\n", (u_long)ntohl(ip->ip_src.s_addr)); return 1; } if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { if (ip->ip_ttl < 255) ip->ip_ttl++; /* compensate for -1 in *_send routines */ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { vifp = viftable + vifi; printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr), vifi, (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", vifp->v_ifp->if_name, vifp->v_ifp->if_unit); } return (ip_mdq(m, ifp, NULL, vifi)); } if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr)); if(!imo) printf("In fact, no options were specified at all\n"); } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) return 0; /* * Determine forwarding vifs from the forwarding cache table */ s = splnet(); MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt); /* Entry exists, so forward if necessary */ if (rt != NULL) { splx(s); return (ip_mdq(m, ifp, rt, -1)); } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & * send message to routing daemon */ register struct mbuf *mb0; register struct rtdetq *rte; register u_long hash; int hlen = ip->ip_hl << 2; #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif mrtstat.mrts_no_route++; if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { splx(s); return ENOBUFS; } mb0 = m_copy(m, 0, M_COPYALL); if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); splx(s); return ENOBUFS; } /* is there an upcall waiting for this packet? 
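 * (i.e. a stalled cache entry, created by an earlier packet for the
 * same source/group, whose IGMPMSG_NOCACHE upcall mrouted has not
 * answered yet; if so we just append to its queue below)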
*/ hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) break; } if (rt == NULL) { int i; struct igmpmsg *im; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE); m_freem(mb0); splx(s); return ENOBUFS; } /* Make a copy of the header to send to the user level process */ mm = m_copy(mb0, 0, hlen); if (mm == NULL) { free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* * Send message to routing daemon to install * a route into the kernel table */ k_igmpsrc.sin_addr = ip->ip_src; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; nexpire[hash]++; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = 0; rt->mfc_parent = -1; /* link into table */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; rt->mfc_stall = rte; } else { /* determine if q has overflowed */ int npkts = 0; struct rtdetq **p; for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) npkts++; if (npkts > MAX_UPQ) { mrtstat.mrts_upq_ovflw++; free(rte, M_MRTABLE); m_freem(mb0); splx(s); return 0; } /* Add this entry to the end of the queue */ *p = rte; } rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif rte->next = NULL; splx(s); return 0; } } #ifndef MROUTE_KLD int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *) = X_ip_mforward; #endif /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *unused) { struct rtdetq *rte; struct mfc *mfc, **nptr; int i; int s; s = splnet(); for (i = 0; i < MFCTBLSIZ; i++) { if (nexpire[i] == 0) continue; nptr = &mfctable[i]; for (mfc = *nptr; mfc != NULL; mfc = *nptr) { /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 && --mfc->mfc_expire == 0) { if (mrtdebug & DEBUG_EXPIRE) log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", (u_long)ntohl(mfc->mfc_origin.s_addr), (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ for (rte = mfc->mfc_stall; rte; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } ++mrtstat.mrts_cache_cleanups; nexpire[i]--; *nptr = mfc->mfc_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mfc_next; } } } splx(s); expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(m, ifp, rt, xmt_vif) register struct mbuf *m; register struct ifnet *ifp; register struct mfc *rt; register vifi_t xmt_vif; { register struct ip *ip = mtod(m, struct ip *); register vifi_t vifi; register struct vif *vifp; register int plen = ip->ip_len; /* * Macro to send packet on vif. 
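 * (Tunnel vifs wrap the datagram in a fresh IP header via
 * encap_send(); physical vifs go through phyint_send(); both feed
 * the vif's token bucket filter when a rate limit is configured.)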
Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC_SEND(ip,vifp,m) { \ if ((vifp)->v_flags & VIFF_TUNNEL) \ encap_send((ip), (vifp), (m)); \ else \ phyint_send((ip), (vifp), (m)); \ } /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < numvifs) { MC_SEND(ip, viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { /* came in the wrong interface */ if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); ++mrtstat.mrts_wrong_if; ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, and we are forwarding * packets on this interface, and it is a broadcast medium * interface (and not a tunnel), send a message to the routing daemon. */ if (pim_assert && rt->mfc_ttls[vifi] && (ifp->if_flags & IFF_BROADCAST) && !(viftable[vifi].v_flags & VIFF_TUNNEL)) { struct sockaddr_in k_igmpsrc; struct mbuf *mm; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct timeval now; register u_long delta; GET_TIME(now); TV_DELTA(rt->mfc_last_assert, now, delta); if (delta > ASSERT_MSG_TIME) { mm = m_copy(m, 0, hlen); if (mm && (M_HASCL(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) { return ENOBUFS; } rt->mfc_last_assert = now; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; k_igmpsrc.sin_addr = im->im_src; socket_send(ip_mrouter, mm, &k_igmpsrc); } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; } else { viftable[vifi].v_pkt_in++; viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { vifp->v_pkt_out++; vifp->v_bytes_out += plen; MC_SEND(ip, vifp, m); } return 0; } /* * check if a vif number is legal/ok. This is used by ip_output, to export * numvifs there, */ static int X_legal_vif_num(vif) int vif; { if (vif >= 0 && vif < numvifs) return(1); else return(0); } #ifndef MROUTE_KLD int (*legal_vif_num)(int) = X_legal_vif_num; #endif /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(vifi) int vifi; { if (vifi >= 0 && vifi < numvifs) return viftable[vifi].v_lcl_addr.s_addr; else return INADDR_ANY; } #ifndef MROUTE_KLD u_long (*ip_mcast_src)(int) = X_ip_mcast_src; #endif static void phyint_send(ip, vifp, m) struct ip *ip; struct vif *vifp; struct mbuf *m; { register struct mbuf *mb_copy; register int hlen = ip->ip_hl << 2; /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. 
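 * (m_copy() shares cluster data read-only between the original and
 * the copy, so the m_pullup() that follows is what actually gives us
 * a privately-owned, writable header of at least hlen bytes.)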
*/ mb_copy = m_copy(m, 0, M_COPYALL); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); } static void encap_send(ip, vifp, m) register struct ip *ip; register struct vif *vifp; register struct mbuf *m; { register struct mbuf *mb_copy; register struct ip *ip_copy; register int i, len = ip->ip_len; /* * copy the old packet & pullup its IP header into the * new mbuf so we can modify it. Try to fill the new * mbuf since if we don't the ethernet driver will. */ MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); if (mb_copy == NULL) return; mb_copy->m_data += max_linkhdr; mb_copy->m_len = sizeof(multicast_encap_iphdr); if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { m_freem(mb_copy); return; } i = MHLEN - M_LEADINGSPACE(mb_copy); if (i > len) i = len; mb_copy = m_pullup(mb_copy, i); if (mb_copy == NULL) return; mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); /* * fill in the encapsulating IP header. */ ip_copy = mtod(mb_copy, struct ip *); *ip_copy = multicast_encap_iphdr; #ifdef RANDOM_IP_ID ip_copy->ip_id = ip_randomid(); #else ip_copy->ip_id = htons(ip_id++); #endif ip_copy->ip_len += len; ip_copy->ip_src = vifp->v_lcl_addr; ip_copy->ip_dst = vifp->v_rmt_addr; /* * turn the encapsulated IP header back into a valid one. */ ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); --ip->ip_ttl; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; mb_copy->m_data += sizeof(multicast_encap_iphdr); ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); mb_copy->m_data -= sizeof(multicast_encap_iphdr); if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); } /* * Token bucket filter module */ static void tbf_control(vifp, m, ip, p_len) register struct vif *vifp; register struct mbuf *m; register struct ip *ip; register u_long p_len; { register struct tbf *t = vifp->v_tbf; if (p_len > MAX_BKT_SIZE) { /* drop if packet is too large */ mrtstat.mrts_pkt2large++; m_freem(m); return; } tbf_update_tokens(vifp); /* if there are enough tokens, * and the queue is empty, * send this packet out */ if (t->tbf_q_len == 0) { /* queue empty, send packet if enough tokens */ if (p_len <= t->tbf_n_tok) { t->tbf_n_tok -= p_len; tbf_send_packet(vifp, m); } else { /* queue packet and timeout till later */ tbf_queue(vifp, m); timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); } } else if (t->tbf_q_len < t->tbf_max_q_len) { /* finite queue length, so queue pkts and process queue */ tbf_queue(vifp, m); tbf_process_q(vifp); } else { /* queue length too much, try to dq and queue and process */ if (!tbf_dq_sel(vifp, ip)) { mrtstat.mrts_q_overflow++; m_freem(m); return; } else { tbf_queue(vifp, m); tbf_process_q(vifp); } } return; } /* * adds a packet to the queue at the interface */ static void tbf_queue(vifp, m) register struct vif *vifp; register struct mbuf *m; { register int s = splnet(); register struct tbf *t = vifp->v_tbf; if (t->tbf_t == NULL) { /* Queue was empty */ t->tbf_q = m; } else { /* Insert at tail */ t->tbf_t->m_act = m; } /* Set new tail pointer */ t->tbf_t = m; #ifdef DIAGNOSTIC /* Make sure we didn't get fed a bogus mbuf */ if (m->m_act) panic("tbf_queue: m_act"); #endif m->m_act = NULL; t->tbf_q_len++; splx(s); } /* * processes the queue at the interface */ static void tbf_process_q(vifp) 
register struct vif *vifp; { register struct mbuf *m; register int len; register int s = splnet(); register struct tbf *t = vifp->v_tbf; /* loop through the queue at the interface and send as many packets * as possible */ while (t->tbf_q_len > 0) { m = t->tbf_q; len = mtod(m, struct ip *)->ip_len; /* determine if the packet can be sent */ if (len <= t->tbf_n_tok) { /* if so, * reduce no of tokens, dequeue the packet, * send the packet. */ t->tbf_n_tok -= len; t->tbf_q = m->m_act; if (--t->tbf_q_len == 0) t->tbf_t = NULL; m->m_act = NULL; tbf_send_packet(vifp, m); } else break; } splx(s); } static void tbf_reprocess_q(xvifp) void *xvifp; { register struct vif *vifp = xvifp; if (ip_mrouter == NULL) return; tbf_update_tokens(vifp); tbf_process_q(vifp); if (vifp->v_tbf->tbf_q_len) timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); } /* function that will selectively discard a member of the queue * based on the precedence value and the priority */ static int tbf_dq_sel(vifp, ip) register struct vif *vifp; register struct ip *ip; { register int s = splnet(); register u_int p; register struct mbuf *m, *last; register struct mbuf **np; register struct tbf *t = vifp->v_tbf; p = priority(vifp, ip); np = &t->tbf_q; last = NULL; while ((m = *np) != NULL) { if (p > priority(vifp, mtod(m, struct ip *))) { *np = m->m_act; /* If we're removing the last packet, fix the tail pointer */ if (m == t->tbf_t) t->tbf_t = last; m_freem(m); /* it's impossible for the queue to be empty, but * we check anyway. */ if (--t->tbf_q_len == 0) t->tbf_t = NULL; splx(s); mrtstat.mrts_drop_sel++; return(1); } np = &m->m_act; last = m; } splx(s); return(0); } static void tbf_send_packet(vifp, m) register struct vif *vifp; register struct mbuf *m; { struct ip_moptions imo; int error; static struct route ro; int s = splnet(); if (vifp->v_flags & VIFF_TUNNEL) { /* If tunnel options */ ip_output(m, (struct mbuf *)0, &vifp->v_route, - IP_FORWARDING, (struct ip_moptions *)0); + IP_FORWARDING, (struct ip_moptions *)0, NULL); } else { imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, (struct mbuf *)0, &ro, - IP_FORWARDING, &imo); + IP_FORWARDING, &imo, NULL); if (mrtdebug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on vif %d err %d\n", vifp - viftable, error); } splx(s); } /* determine the current time and then * the elapsed time (between the last time and time now) * in milliseconds & update the no. of tokens in the bucket */ static void tbf_update_tokens(vifp) register struct vif *vifp; { struct timeval tp; register u_long tm; register int s = splnet(); register struct tbf *t = vifp->v_tbf; GET_TIME(tp); TV_DELTA(tp, t->tbf_last_pkt_t, tm); /* * This formula is actually * "time in seconds" * "bytes/second". * * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) * * The (1000/1024) was introduced in add_vif to optimize * this divide into a shift. 
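 *
 * Worked example: for a vif configured at 1000 kbits/sec, add_vif()
 * stores v_rate_limit = 1000 * 1024 / 1000 = 1024.  After tm =
 * 1000000 usec (one second) the update below adds
 *
 *	1000000 * 1024 / 1024 / 8 = 125000 bytes,
 *
 * i.e. exactly 1,000,000 bits -- the configured rate.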
*/ t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; t->tbf_last_pkt_t = tp; if (t->tbf_n_tok > MAX_BKT_SIZE) t->tbf_n_tok = MAX_BKT_SIZE; splx(s); } static int priority(vifp, ip) register struct vif *vifp; register struct ip *ip; { register int prio; /* temporary hack; may add general packet classifier some day */ /* * The UDP port space is divided up into four priority ranges: * [0, 16384) : unclassified - lowest priority * [16384, 32768) : audio - highest priority * [32768, 49152) : whiteboard - medium priority * [49152, 65536) : video - low priority */ if (ip->ip_p == IPPROTO_UDP) { struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); switch (ntohs(udp->uh_dport) & 0xc000) { case 0x4000: prio = 70; break; case 0x8000: prio = 60; break; case 0xc000: prio = 55; break; default: prio = 50; break; } if (tbfdebug > 1) log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio); } else { prio = 50; } return prio; } /* * End of token bucket filter modifications */ int ip_rsvp_vif_init(so, sopt) struct socket *so; struct sockopt *sopt; { int error, i, s; if (rsvpdebug) printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; /* Check mbuf. */ error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) return (error); if (rsvpdebug) printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", i, rsvp_on); s = splnet(); /* Check vif. */ if (!legal_vif_num(i)) { splx(s); return EADDRNOTAVAIL; } /* Check if socket is available. */ if (viftable[i].v_rsvpd != NULL) { splx(s); return EADDRINUSE; } viftable[i].v_rsvpd = so; /* This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!viftable[i].v_rsvp_on) { viftable[i].v_rsvp_on = 1; rsvp_on++; } splx(s); return 0; } int ip_rsvp_vif_done(so, sopt) struct socket *so; struct sockopt *sopt; { int error, i, s; if (rsvpdebug) printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) return (error); s = splnet(); /* Check vif. */ if (!legal_vif_num(i)) { splx(s); return EADDRNOTAVAIL; } if (rsvpdebug) printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n", viftable[i].v_rsvpd, so); /* * XXX as an additional consistency check, one could make sure * that viftable[i].v_rsvpd == so, otherwise passing so as * first parameter is pretty useless. */ viftable[i].v_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (viftable[i].v_rsvp_on) { viftable[i].v_rsvp_on = 0; rsvp_on--; } splx(s); return 0; } void ip_rsvp_force_done(so) struct socket *so; { int vifi; register int s; /* Don't bother if it is not the right type of socket. */ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return; s = splnet(); /* The socket may be attached to more than one vif...this * is perfectly legal. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_rsvpd == so) { viftable[vifi].v_rsvpd = NULL; /* This may seem silly, but we need to be sure we don't * over-decrement the RSVP counter, in case something slips up. 
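 * rsvp_on is global: it counts every vif (plus the old-style
 * non-vif socket) with RSVP enabled, and rsvp_input() drops all
 * RSVP packets whenever it is zero.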
*/ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } } splx(s); return; } void rsvp_input(m, off) struct mbuf *m; int off; { int vifi; register struct ip *ip = mtod(m, struct ip *); static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; register int s; struct ifnet *ifp; if (rsvpdebug) printf("rsvp_input: rsvp_on %d\n",rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } s = splnet(); if (rsvpdebug) printf("rsvp_input: check vifs\n"); #ifdef DIAGNOSTIC if (!(m->m_flags & M_PKTHDR)) panic("rsvp_input no hdr"); #endif ifp = m->m_pkthdr.rcvif; /* Find which vif the packet arrived on. */ for (vifi = 0; vifi < numvifs; vifi++) if (viftable[vifi].v_ifp == ifp) break; if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { /* * If the old-style non-vif-associated socket is set, * then use it. Otherwise, drop packet since there * is no specific socket for this vif. */ if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ } else { if (rsvpdebug && vifi == numvifs) printf("rsvp_input: Can't find vif for packet.\n"); else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) printf("rsvp_input: No socket defined for vif %d\n",vifi); m_freem(m); } splx(s); return; } rsvp_src.sin_addr = ip->ip_src; if (rsvpdebug && m) printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { if (rsvpdebug) printf("rsvp_input: Failed to append to socket\n"); } else { if (rsvpdebug) printf("rsvp_input: send packet up\n"); } splx(s); } #ifdef MROUTE_KLD static int ip_mroute_modevent(module_t mod, int type, void *unused) { int s; switch (type) { static u_long (*old_ip_mcast_src)(int); static int (*old_ip_mrouter_set)(struct socket *, struct sockopt *); static int (*old_ip_mrouter_get)(struct socket *, struct sockopt *); static int (*old_ip_mrouter_done)(void); static int (*old_ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int (*old_mrt_ioctl)(int, caddr_t); static int (*old_legal_vif_num)(int); case MOD_LOAD: s = splnet(); /* XXX Protect against multiple loading */ old_ip_mcast_src = ip_mcast_src; ip_mcast_src = X_ip_mcast_src; old_ip_mrouter_get = ip_mrouter_get; ip_mrouter_get = X_ip_mrouter_get; old_ip_mrouter_set = ip_mrouter_set; ip_mrouter_set = X_ip_mrouter_set; old_ip_mrouter_done = ip_mrouter_done; ip_mrouter_done = X_ip_mrouter_done; old_ip_mforward = ip_mforward; ip_mforward = X_ip_mforward; old_mrt_ioctl = mrt_ioctl; mrt_ioctl = X_mrt_ioctl; old_legal_vif_num = legal_vif_num; legal_vif_num = X_legal_vif_num; splx(s); return 0; case MOD_UNLOAD: if (ip_mrouter) return EINVAL; s = splnet(); ip_mrouter_get = old_ip_mrouter_get; ip_mrouter_set = old_ip_mrouter_set; ip_mrouter_done = old_ip_mrouter_done; ip_mforward = old_ip_mforward; mrt_ioctl = old_mrt_ioctl; legal_vif_num = old_legal_vif_num; splx(s); return 0; default: break; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); #endif /* MROUTE_KLD */ #endif /* MROUTING */ Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 105193) +++ 
head/sys/netinet/ip_output.c (revision 105194) @@ -1,2065 +1,2059 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #define _IP_VHL #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_ipdivert.h" #include "opt_ipfilter.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_pfil_hooks.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); #ifdef IPSEC #include #include #ifdef IPSEC_DEBUG #include #else #define KEYDEBUG(lev,arg) #endif #endif /*IPSEC*/ #include #include #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); static int ip_getmoptions (struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf **, struct mbuf *); static int ip_setmoptions (struct sockopt *, struct ip_moptions **); int ip_optcopy(struct ip *, struct ip *); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. 
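 *
 * With this change the caller's inpcb (or NULL) is passed in
 * explicitly, replacing the old ipsec_getsocket()/ipsec_setsocket()
 * side channel that smuggled the socket through the mbuf.  Sketch of
 * the two call styles (flag values illustrative):
 *
 *	error = ip_output(m, opts, &ro, 0, imo, inp);	-- from a PCB
 *	error = ip_output(m, NULL, &ipforward_rt,
 *	    IP_FORWARDING, NULL, NULL);			-- forwarding path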
*/ int -ip_output(m0, opt, ro, flags, imo) +ip_output(m0, opt, ro, flags, imo, inp) struct mbuf *m0; struct mbuf *opt; struct route *ro; int flags; struct ip_moptions *imo; + struct inpcb *inp; { struct ip *ip, *mhip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m; int hlen = sizeof (struct ip); int len, off, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; #ifdef IPSEC struct route iproute; - struct socket *so = NULL; struct secpolicy *sp = NULL; + struct socket *so = inp ? inp->inp_socket : NULL; #endif struct ip_fw_args args; int src_was_INADDR_ANY = 0; /* as the name says... */ #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m1; int rv; #endif /* PFIL_HOOKS */ args.eh = NULL; args.rule = NULL; args.next_hop = NULL; args.divert_rule = 0; /* divert cookie */ /* Grab info from MT_TAG mbufs prepended to the chain. */ for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) { - switch(m0->m_tag_id) { + switch(m0->_m_tag_id) { default: printf("ip_output: unrecognised MT_TAG tag %d\n", - m0->m_tag_id); + m0->_m_tag_id); break; case PACKET_TAG_DUMMYNET: /* * the packet was already tagged, so part of the * processing was already done, and we need to go down. * Get parameters from the header. */ args.rule = ((struct dn_pkt *)m0)->rule; opt = NULL ; ro = & ( ((struct dn_pkt *)m0)->ro ) ; imo = NULL ; dst = ((struct dn_pkt *)m0)->dn_dst ; ifp = ((struct dn_pkt *)m0)->ifp ; flags = ((struct dn_pkt *)m0)->flags ; break; case PACKET_TAG_DIVERT: args.divert_rule = (intptr_t)m0->m_data & 0xffff; break; case PACKET_TAG_IPFORWARD: args.next_hop = (struct sockaddr_in *)m0->m_data; break; } } m = m0; KASSERT(!m || (m->m_flags & M_PKTHDR) != 0, ("ip_output: no HDR")); - KASSERT(ro != NULL, ("ip_output: no route, proto %d", - mtod(m, struct ip *)->ip_p)); - -#ifdef IPSEC - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); -#endif if (args.rule != NULL) { /* dummynet already saw us */ ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; if (ro->ro_rt) ia = ifatoia(ro->ro_rt->rt_ifa); goto sendit; } if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; /* * Fill in IP header. */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); ip->ip_off &= IP_DF; #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif ipstat.ips_localout++; } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != pkt_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if (ro->ro_rt == 0) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = pkt_dst; } /* * If routing to interface only, * short circuit routing lookup. 
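 * (IP_ROUTETOIF: the destination must be directly reachable on an
 * attached interface -- found via ifa_ifwithdstaddr() or
 * ifa_ifwithnet() -- and the TTL is forced to 1 so the packet
 * cannot travel past that first hop.)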
*/ if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the * ULP. This lets TCP do its thing, while not burdening * forwarding or ICMP with the overhead of cloning a route. * Of course, we still want to do any cloning requested by * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ if (ro->ro_rt == 0) rtalloc_ign(ro, RTF_PRCLONING); if (ro->ro_rt == 0) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src(imo->imo_multicast_vif); } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * XXX * delayed checksums are not currently * compatible with IP multicast routing */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } IN_LOOKUP_MULTI(pkt_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * Check if rsvp daemon is running. If not, don't * set ip_moptions. 
This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!rsvp_on) imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } #ifndef notdef /* * If the source address is not specified yet, use the address * of the outgoing interface. In that case, keep note we did that, so * if the firewall changes the next-hop causing the output * interface to change, we can fix that. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; src_was_INADDR_ANY = 1; } } #endif /* notdef */ /* * Verify that we have any chance at all of being able to queue * the packet or packet fragments */ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { error = ENOBUFS; ipstat.ips_odropped++; goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if ((u_short)ip->ip_len > ifp->if_mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC /* get SP for this packet */ if (so == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); else sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { ipsecstat.out_inval++; goto bad; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsecstat.out_polvio++; goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto bad; } break; case IPSEC_POLICY_ENTRUST: default: printf("ip_output: Invalid policy found. %d\n", sp->policy); } { struct ipsec_output_state state; bzero(&state, sizeof(state)); state.m = m; if (flags & IP_ROUTETOIF) { state.ro = &iproute; bzero(&iproute, sizeof(iproute)); } else state.ro = ro; state.dst = (struct sockaddr *)dst; ip->ip_sum = 0; /* * XXX * delayed checksums are not currently compatible with IPsec */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); error = ipsec4_output(&state, sp, flags); m = state.m; if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore * IP_ROUTETOIF. */ if (state.ro != &iproute || state.ro->ro_rt != NULL) { flags &= ~IP_ROUTETOIF; ro = state.ro; } } else ro = state.ro; dst = (struct sockaddr_in *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output.
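[Editorial sketch] As a reading aid, the contract of the IPsec output block above, restated compactly (all names are the ones used in the function):

	/*
	 * ipsec4_output() consumes state.m and may replace it (tunnel
	 * mode prepends an outer IP header); it may also repoint
	 * state.ro at a route to the tunnel endpoint, which is why
	 * IP_ROUTETOIF can be dropped afterwards.  On error the mbuf
	 * has already been freed, hence the error path that follows
	 * clears m0 before jumping to the shared exit, so nothing is
	 * freed twice.
	 */
	struct ipsec_output_state state;

	bzero(&state, sizeof(state));
	state.m = m;
	state.ro = ro;
	state.dst = (struct sockaddr *)dst;
	error = ipsec4_output(&state, sp, flags);
	m = state.m;		/* may differ from the mbuf we passed in */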
*/ m0 = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip4_output (ipsec): error code %d\n", error); /*fall through*/ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } } /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { printf("ip_output: " "can't update route after IPsec processing\n"); error = EHOSTUNREACH; /*XXX*/ goto bad; } } else { ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; } /* make it flipped, again. */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); skip_ipsec: #endif /*IPSEC*/ /* * IpHack's section. * - Xlate: translate packet's addr/port (NAT). * - Firewall: deny/allow/etc. * - Wrap: fake packet's addr/port * - Encapsulate: put it in another IP and send out. */ #ifdef PFIL_HOOKS /* * Run through list of hooks for output packets. */ m1 = m; pfh = pfil_hook_get(PFIL_OUT, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, hlen, ifp, 1, &m1); if (rv) { error = EHOSTUNREACH; goto done; } m = m1; if (m == NULL) goto done; ip = mtod(m, struct ip *); } #endif /* PFIL_HOOKS */ /* * Check with the firewall... * but not if we are already being fwd'd from a firewall. */ if (fw_enable && IPFW_LOADED && !args.next_hop) { struct sockaddr_in *old = dst; args.m = m; args.next_hop = dst; args.oif = ifp; off = ip_fw_chk_ptr(&args); m = args.m; dst = args.next_hop; /* * On return we must do the following: * m == NULL -> drop the pkt (old interface, deprecated) * (off & IP_FW_PORT_DENY_FLAG) -> drop the pkt (new interface) * 1<=off<= 0xffff -> DIVERT * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet * dst != old -> IPFIREWALL_FORWARD * off==0, dst==old -> accept * If some of the above modules are not compiled in, then * we should't have to check the corresponding condition * (because the ipfw control socket should not accept * unsupported rules), but better play safe and drop * packets in case of doubt. */ if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) { if (m) m_freem(m); error = EACCES; goto done; } ip = mtod(m, struct ip *); if (off == 0 && dst == old) /* common case */ goto pass; if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { /* * pass the pkt to dummynet. Need to include * pipe number, m, ifp, ro, dst because these are * not recomputed in the next pass. * All other parameters have been already used and * so they are not needed anymore. * XXX note: if the ifp or ro entry are deleted * while a pkt is in dummynet, we are in trouble! */ args.ro = ro; args.dst = dst; args.flags = flags; error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, &args); goto done; } #ifdef IPDIVERT if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { struct mbuf *clone = NULL; /* Clone packet if we're doing a 'tee' */ if ((off & IP_FW_PORT_TEE_FLAG) != 0) clone = m_dup(m, M_DONTWAIT); /* * XXX * delayed checksums are not currently compatible * with divert sockets. 
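[Editorial sketch] The return-value conventions spelled out in the comment above can be summarized in one dispatch sketch; this is a paraphrase of the block that follows the ip_fw_chk_ptr() call, not additional logic:

	off = ip_fw_chk_ptr(&args);	/* may rewrite args.m / args.next_hop */
	m = args.m;
	dst = args.next_hop;
	if (m == NULL || (off & IP_FW_PORT_DENY_FLAG)) {
		/* drop: old interface (m == NULL) or new DENY flag */
	} else if (off == 0 && dst == old) {
		/* accepted unchanged: the common case, goto pass */
	} else if (off & IP_FW_PORT_DYNT_FLAG) {
		/* dummynet pipe; (off & 0xffff) selects the pipe */
	} else if (off & IP_FW_PORT_TEE_FLAG) {
		/* tee: duplicate to a divert port, keep sending original */
	} else if (off != 0) {
		/* divert to the socket bound to port (off & 0xffff) */
	} else {
		/* dst != old: IPFIREWALL_FORWARD re-route */
	}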
*/ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* Restore packet header fields to original values */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* Deliver packet to divert input routine */ divert_packet(m, 0, off & 0xffff, args.divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { m = clone; ip = mtod(m, struct ip *); goto pass; } goto done; } #endif /* IPFIREWALL_FORWARD */ /* * Check dst to make sure it is directly reachable on the * interface we previously thought it was. * If it isn't (which may be likely in some situations) we have * to re-route it (ie, find a route for the next-hop and the * associated interface) and set them here. This is nested * forwarding which in most cases is undesirable, except where * such control is nigh impossible. So we do it here. * And I'm babbling. */ if (off == 0 && old != dst) { /* FORWARD, dst has changed */ #if 0 /* * XXX To improve readability, this block should be * changed into a function call as below: */ error = ip_ipforward(&m, &dst, &ifp); if (error) goto bad; if (m == NULL) /* ip_input consumed the mbuf */ goto done; #else struct in_ifaddr *ia; /* * XXX sro_fwd below is static, and a pointer * to it gets passed to routines downstream. * This could have surprisingly bad results in * practice, because its content is overwritten * by subsequent packets. */ /* There must be a better way to do this next line... */ static struct route sro_fwd; struct route *ro_fwd = &sro_fwd; #if 0 print_ip("IPFIREWALL_FORWARD: New dst ip: ", dst->sin_addr, "\n"); #endif /* * We need to figure out if we have been forwarded * to a local socket. If so, then we should somehow * "loop back" to ip_input, and get directed to the * PCB as if we had received this packet. This is * because it may be dificult to identify the packets * you want to forward until they are being output * and have selected an interface. (e.g. locally * initiated packets) If we used the loopback inteface, * we would not be able to control what happens * as the packet runs through ip_input() as * it is done through a ISR. */ LIST_FOREACH(ia, INADDR_HASH(dst->sin_addr.s_addr), ia_hash) { /* * If the addr to forward to is one * of ours, we pretend to * be the destination for this packet. */ if (IA_SIN(ia)->sin_addr.s_addr == dst->sin_addr.s_addr) break; } if (ia) { /* tell ip_input "dont filter" */ struct m_hdr tag; tag.mh_type = MT_TAG; tag.mh_flags = PACKET_TAG_IPFORWARD; tag.mh_data = (caddr_t)args.next_hop; tag.mh_next = m; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifunit("lo0"); if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m0->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip_input((struct mbuf *)&tag); goto done; } /* Some of the logic for this was * nicked from above. * * This rewrites the cached route in a local PCB. * Is this what we want to do? 
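[Editorial sketch] One idiom in the forward-to-local-socket path above deserves a note: when the packet is re-injected into ip_input() without ever touching a wire, a pending transmit checksum can simply be declared valid for the receive side. A distilled sketch of that marking:

	/*
	 * The data never leaves the machine, so instead of computing the
	 * TCP/UDP checksum we mark it pre-verified: CSUM_DATA_VALID plus
	 * CSUM_PSEUDO_HDR with csum_data = 0xffff tells the receive path
	 * the sum checked out.  The IP header sum is waved through the
	 * same way.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
		m->m_pkthdr.csum_data = 0xffff;
	}
	m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;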
*/ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); ro_fwd->ro_rt = 0; rtalloc_ign(ro_fwd, RTF_PRCLONING); if (ro_fwd->ro_rt == 0) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro_fwd->ro_rt->rt_ifa); ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *) ro_fwd->ro_rt->rt_gateway; if (ro_fwd->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); if (ro->ro_rt) RTFREE(ro->ro_rt); ro->ro_rt = ro_fwd->ro_rt; dst = (struct sockaddr_in *)&ro_fwd->ro_dst; #endif /* ... block to be put into a function */ /* * If we added a default src ip earlier, * which would have been gotten from the-then * interface, do it again, from the new one. */ if (src_was_INADDR_ANY) ip->ip_src = IA_SIN(ia)->sin_addr; goto pass ; } /* * if we get here, none of the above matches, and * we have to drop the pkt */ m_freem(m); error = EACCES; /* not sure this is the right error msg */ goto done; } pass: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ if ((u_short)ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { if (ip->ip_vhl == IP_VHL_BORING) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(m, hlen); } } /* Record statistics for this interface address. */ if (!(flags & IP_FORWARDING) && ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ if (ip->ip_off & IP_DF) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } ipstat.ips_cantfrag++; goto bad; } len = (ifp->if_mtu - hlen) &~ 7; if (len < 8) { error = EMSGSIZE; goto bad; } /* * if the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragement large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. 
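[Editorial sketch] The checksum-offload split in the "pass:" section above is compact enough to restate as a worked sketch: if_hwassist advertises what the NIC can compute, and whatever remains after masking must be done in software before the packet is handed down.

	m->m_pkthdr.csum_flags |= CSUM_IP;	/* we want an IP header sum */
	/* Bits the hardware cannot cover are ours to compute. */
	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
	if (sw_csum & CSUM_DELAY_DATA) {	/* no TCP/UDP offload here */
		in_delayed_cksum(m);
		sw_csum &= ~CSUM_DELAY_DATA;
	}
	/* Leave only the flags the driver has promised to honor. */
	m->m_pkthdr.csum_flags &= ifp->if_hwassist;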
*/ int newlen; struct mbuf *mtmp; for (mtmp = m, off = 0; mtmp && ((off + mtmp->m_len) <= ifp->if_mtu); mtmp = mtmp->m_next) { off += mtmp->m_len; } /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & ifp->if_mtu; if ((newlen + sizeof (struct ip)) > ifp->if_mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } /* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n", len, hlen, sizeof (struct ip), newlen, off);*/ len = newlen; } else { off = hlen + len; } { int mhlen, firstlen = off - hlen; struct mbuf **mnext = &m->m_nextpkt; int nfrags = 1; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof (struct ip); for (; off < (u_short)ip->ip_len; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == 0) { error = ENOBUFS; ipstat.ips_odropped++; goto sendorfree; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= (u_short)ip->ip_len) len = (u_short)ip->ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; /* ??? */ ipstat.ips_odropped++; goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_create_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { if (mhip->ip_vhl == IP_VHL_BORING) { mhip->ip_sum = in_cksum_hdr(mhip); } else { mhip->ip_sum = in_cksum(m, mhlen); } } *mnext = m; mnext = &m->m_nextpkt; nfrags++; } ipstat.ips_ofragments += nfrags; /* set first/last markers for fragment chain */ m->m_flags |= M_LASTFRAG; m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). */ m = m0; m_adj(m, hlen + firstlen - (u_short)ip->ip_len); m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { if (ip->ip_vhl == IP_VHL_BORING) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(m, hlen); } } sendorfree: for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif if (error == 0) { /* Record statistics for this interface address. 
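[Editorial sketch] A worked example may help with the fragment arithmetic in the loop above. Assume a 1500-byte MTU and a 20-byte header, so len = (1500 - 20) & ~7 = 1480 data bytes per non-final fragment; a 4000-byte datagram then yields three fragments whose ip_off fields (in 8-byte units) are 0, 185, and 370, the first two with IP_MF set:

	len = (ifp->if_mtu - hlen) & ~7;	/* 1480: payload, 8-byte aligned */
	/* Inside the loop, each fragment records its position: */
	mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;  /* (1500-20)/8 = 185 */
	if (off + len >= (u_short)ip->ip_len)
		len = (u_short)ip->ip_len - off;	/* final, short fragment */
	else
		mhip->ip_off |= IP_MF;			/* more fragments follow */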
*/ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ipstat.ips_fragmented++; } done: #ifdef IPSEC if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ip_output call free SP:%p\n", sp)); key_freesp(sp); } #endif /* IPSEC */ return (error); bad: m_freem(m); goto done; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = IP_VHL_HL(ip->ip_vhl) << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the existing chain instead of rearranging it. */ m = m_pullup(m, offset + sizeof(u_short)); } *(u_short *)(m->m_data + offset) = csum; } /* * Insert IP options into preformed packet. * Adjust IP destination as required for IP source routing, * as indicated by a non-zero in_addr at the start of the options. * * XXX This routine assumes that the packet has no options in place. */ static struct mbuf * ip_insertoptions(m, opt, phlen) register struct mbuf *m; struct mbuf *opt; int *phlen; { register struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; register struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n == 0) { *phlen = 0; return (m); } n->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_create_mbuf_from_mbuf(m, n); #endif n->m_pkthdr.len = m->m_pkthdr.len + optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); ip->ip_len += optlen; return (m); } /* * Copy options from ip to jp, * omitting those not copied during fragmentation. */ int ip_optcopy(ip, jp) struct ip *ip, *jp; { register u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. 
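[Editorial sketch] For readers tracing in_delayed_cksum() above: csum_data holds the offset of the checksum field within the transport header (16 for TCP, 6 for UDP), so with a 20-byte IP header a TCP checksum is stored at byte 20 + 16 = 36 of the packet. A condensed sketch of the store:

	offset = IP_VHL_HL(ip->ip_vhl) << 2;		/* 20 for a plain header */
	csum = in_cksum_skip(m, ip->ip_len, offset);	/* checksum the payload */
	if ((m->m_pkthdr.csum_flags & CSUM_UDP) && csum == 0)
		csum = 0xffff;			/* 0 means "no checksum" in UDP */
	offset += m->m_pkthdr.csum_data;		/* + 16 (TCP) or 6 (UDP) */
	*(u_short *)(m->m_data + offset) = csum;	/* patch it into place */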
*/ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* bogus lengths should have been caught by ip_dooptions */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * IP socket option processing. */ int ip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER); if (m == 0) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, m)); } case IP_TOS: case IP_TTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: case IP_FAITH: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; #define OPTSET(bit) \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; } break; #undef OPTSET case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_setmoptions(sopt, &inp->inp_moptions); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; size_t len = 0; int priv; struct mbuf *m; int optname; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; priv = (sopt->sopt_td != NULL && suser(sopt->sopt_td) != 0) ? 0 : 1; req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; error = ipsec4_set_policy(inp, optname, req, len, priv); m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_getmoptions(sopt, inp->inp_moptions); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Set up IP options in pcb for insertion in output packets. * Store in mbuf with pointer in pcbopt, adding pseudo-option * with destination address if source routed. */ static int ip_pcbopts(optname, pcbopt, m) int optname; struct mbuf **pcbopt; register struct mbuf *m; { register int cnt, optlen; register u_char *cp; u_char opt; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = 0; if (m == (struct mbuf *)0 || m->m_len == 0) { /* * Only turning off any previous options. */ if (m) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before * actual options; move other options back * and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); bzero(mtod(m, caddr_t), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * user process specifies route as: * ->A->B->C->D * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear in * actual IP option, but is stored before the options. */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), (caddr_t)&cp[IPOPT_OFFSET+1], (unsigned)cnt + sizeof(struct in_addr)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * XXX * The whole multicast option thing needs to be re-thought. 
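[Editorial sketch] To make the ->A->B->C->D comment in ip_pcbopts() above concrete, here is a hypothetical userland counterpart (the descriptor s and the in_addr variables hop_a/hop_b are assumed, and error handling uses err(3)): the application passes a raw LSRR option via IP_OPTIONS, and the kernel peels the first address off as the first-hop destination.

	u_char opts[12];			/* must be a multiple of 4 */

	opts[0] = IPOPT_LSRR;
	opts[1] = 11;				/* type + len + ptr + 2 addrs */
	opts[2] = IPOPT_MINOFF;			/* pointer starts at offset 4 */
	memcpy(&opts[3], &hop_a, 4);		/* "A": stripped by the kernel */
	memcpy(&opts[7], &hop_b, 4);		/* "B": stays in the header */
	opts[11] = IPOPT_EOL;			/* pad to a 4-byte boundary */
	if (setsockopt(s, IPPROTO_IP, IP_OPTIONS, opts, sizeof(opts)) == -1)
		err(1, "IP_OPTIONS");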
* Several of these options are equally applicable to non-multicast * transmission, and one (IP_MULTICAST_TTL) totally duplicates a * standard option (IP_TTL). */ /* * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. */ static struct ifnet * ip_multicast_if(a, ifindexp) struct in_addr *a; int *ifindexp; { int ifindex; struct ifnet *ifp; if (ifindexp) *ifindexp = 0; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; if (ifindex < 0 || if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); if (ifindexp) *ifindexp = ifindex; } else { INADDR_TO_IFP(*a, ifp); } return ifp; } /* * Set the IP multicast options in response to user setsockopt(). */ static int ip_setmoptions(sopt, imop) struct sockopt *sopt; struct ip_moptions **imop; { int error = 0; int i; struct in_addr addr; struct ip_mreq mreq; struct ifnet *ifp; struct ip_moptions *imo = *imop; struct route ro; struct sockaddr_in *dst; int ifindex; int s; if (imo == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); if (imo == NULL) return (ENOBUFS); *imop = imo; imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; } switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ case IP_MULTICAST_VIF: if (legal_vif_num == 0) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) break; if (!legal_vif_num(i) && (i != -1)) { error = EINVAL; break; } imo->imo_multicast_vif = i; break; case IP_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); if (error) break; /* * INADDR_ANY is used to remove a previous selection. * When no interface is selected, a default one is * chosen every time a multicast packet is sent. */ if (addr.s_addr == INADDR_ANY) { imo->imo_multicast_ifp = NULL; break; } /* * The selected interface is identified by its local * IP address. Find the interface and confirm that * it supports multicasting. */ s = splimp(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { splx(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_ifp = ifp; if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; splx(s); break; case IP_MULTICAST_TTL: /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char ttl; error = sooptcopyin(sopt, &ttl, 1, 1); if (error) break; imo->imo_multicast_ttl = ttl; } else { u_int ttl; error = sooptcopyin(sopt, &ttl, sizeof ttl, sizeof ttl); if (error) break; if (ttl > 255) error = EINVAL; else imo->imo_multicast_ttl = ttl; } break; case IP_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. 
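[Editorial sketch] The 0.0.0.0/8 convention implemented by ip_multicast_if() above (per RFC 1724, section 3.3) lets an application name an interface by index rather than by address. A hypothetical caller selecting interface index 3 (descriptor s assumed):

	struct in_addr ifsel;

	ifsel.s_addr = htonl(3);	/* high byte 0 => low 24 bits = ifindex */
	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF,
	    &ifsel, sizeof(ifsel)) == -1)
		err(1, "IP_MULTICAST_IF");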
*/ if (sopt->sopt_valsize == 1) { u_char loop; error = sooptcopyin(sopt, &loop, 1, 1); if (error) break; imo->imo_multicast_loop = !!loop; } else { u_int loop; error = sooptcopyin(sopt, &loop, sizeof loop, sizeof loop); if (error) break; imo->imo_multicast_loop = !!loop; } break; case IP_ADD_MEMBERSHIP: /* * Add a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If no interface address was provided, use the interface of * the route to the given multicast address. */ if (mreq.imr_interface.s_addr == INADDR_ANY) { bzero((caddr_t)&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_len = sizeof(*dst); dst->sin_family = AF_INET; dst->sin_addr = mreq.imr_multiaddr; rtalloc(&ro); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; splx(s); break; } ifp = ro.ro_rt->rt_ifp; rtfree(ro.ro_rt); } else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); } /* * See if we found an interface, and confirm that it * supports multicast. */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; splx(s); break; } /* * See if the membership already exists or if all the * membership slots are full. */ for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i < imo->imo_num_memberships) { error = EADDRINUSE; splx(s); break; } if (i == IP_MAX_MEMBERSHIPS) { error = ETOOMANYREFS; splx(s); break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ if ((imo->imo_membership[i] = in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { error = ENOBUFS; splx(s); break; } ++imo->imo_num_memberships; splx(s); break; case IP_DROP_MEMBERSHIP: /* * Drop a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq.imr_interface.s_addr == INADDR_ANY) ifp = NULL; else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); if (ifp == NULL) { error = EADDRNOTAVAIL; splx(s); break; } } /* * Find the membership in the membership array. */ for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i == imo->imo_num_memberships) { error = EADDRNOTAVAIL; splx(s); break; } /* * Give up the multicast address record to which the * membership points. */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; splx(s); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. */ if (imo->imo_multicast_ifp == NULL && imo->imo_multicast_vif == -1 && imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && imo->imo_num_memberships == 0) { free(*imop, M_IPMOPTS); *imop = NULL; } return (error); } /* * Return the IP multicast options in response to user getsockopt(). 
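[Editorial sketch] The membership bookkeeping above pairs with a small amount of userland code. A hypothetical join (descriptor s assumed), leaving imr_interface as INADDR_ANY so the kernel picks the interface from a route lookup on the group address, exactly as the IP_ADD_MEMBERSHIP case does:

	struct ip_mreq mreq;

	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");	/* group */
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);	/* kernel chooses */
	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
	    &mreq, sizeof(mreq)) == -1)
		err(1, "IP_ADD_MEMBERSHIP");	/* EADDRINUSE if already joined */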
*/ static int ip_getmoptions(sopt, imo) struct sockopt *sopt; register struct ip_moptions *imo; { struct in_addr addr; struct in_ifaddr *ia; int error, optval; u_char coptval; error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; else if (imo->imo_multicast_addr.s_addr) { /* return the value user has set */ addr = imo->imo_multicast_addr; } else { IFP_TO_IA(imo->imo_multicast_ifp, ia); addr.s_addr = (ia == NULL) ? INADDR_ANY : IA_SIN(ia)->sin_addr.s_addr; } error = sooptcopyout(sopt, &addr, sizeof addr); break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; default: error = ENOPROTOOPT; break; } return (error); } /* * Discard the IP multicast options. */ void ip_freemoptions(imo) register struct ip_moptions *imo; { register int i; if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) in_delmulti(imo->imo_membership[i]); free(imo, M_IPMOPTS); } } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(ifp, m, dst, hlen) struct ifnet *ifp; register struct mbuf *m; register struct sockaddr_in *dst; int hlen; { register struct ip *ip; struct mbuf *copym; copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (ip->ip_vhl == IP_VHL_BORING) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(copym, hlen); } /* * NB: * It's not clear whether there are any lingering * reentrancy problems in other areas which might * be exposed by using ip_input directly (in * particular, everything which modifies the packet * in-place). Yet another option is using the * protosw directly to deliver the looped back * packet. For the moment, we'll err on the side * of safety by using if_simloop(). 
*/ #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif #ifdef notdef copym->m_pkthdr.rcvif = ifp; ip_input(copym); #else /* if the checksum hasn't been computed, mark it as valid */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, dst->sin_family, 0); #endif } } Index: head/sys/netinet/ip_var.h =================================================================== --- head/sys/netinet/ip_var.h (revision 105193) +++ head/sys/netinet/ip_var.h (revision 105194) @@ -1,210 +1,211 @@ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IP_VAR_H_ #define _NETINET_IP_VAR_H_ #include #ifdef _KERNEL #include #endif /* * Overlay for ip header used by other protocols (tcp, udp). */ struct ipovly { u_char ih_x1[9]; /* (unused) */ u_char ih_pr; /* protocol */ u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ }; #ifdef _KERNEL /* * Ip reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. * They are timed out after ipq_ttl drops to 0, and may also * be reclaimed if memory becomes tight. 
*/ struct ipq { TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ u_char ipq_ttl; /* time for reass q to live */ u_char ipq_p; /* protocol of this fragment */ u_short ipq_id; /* sequence id for reassembly */ struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; #ifdef IPDIVERT u_int32_t ipq_div_info; /* ipfw divert port & flags */ u_int16_t ipq_div_cookie; /* ipfw divert cookie */ #endif struct label ipq_label; /* MAC label */ }; #endif /* _KERNEL */ /* * Structure stored in mbuf in inpcb.ip_options * and passed to ip_output when ip options are in use. * The actual length of the options (including ipopt_dst) * is in m_len. */ #define MAX_IPOPTLEN 40 struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ char ipopt_list[MAX_IPOPTLEN]; /* options proper */ }; /* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. */ struct ip_moptions { struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ u_short imo_num_memberships; /* no. memberships this socket */ struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; u_long imo_multicast_vif; /* vif num outgoing multicasts */ }; struct ipstat { u_long ips_total; /* total packets received */ u_long ips_badsum; /* checksum bad */ u_long ips_tooshort; /* packet too short */ u_long ips_toosmall; /* not enough data */ u_long ips_badhlen; /* ip header length < data size */ u_long ips_badlen; /* ip length < ip header length */ u_long ips_fragments; /* fragments received */ u_long ips_fragdropped; /* frags dropped (dups, out of space) */ u_long ips_fragtimeout; /* fragments timed out */ u_long ips_forward; /* packets forwarded */ u_long ips_fastforward; /* packets fast forwarded */ u_long ips_cantforward; /* packets rcvd for unreachable dest */ u_long ips_redirectsent; /* packets forwarded on same net */ u_long ips_noproto; /* unknown or unsupported protocol */ u_long ips_delivered; /* datagrams delivered to upper level*/ u_long ips_localout; /* total ip packets generated here */ u_long ips_odropped; /* lost packets due to nobufs, etc. */ u_long ips_reassembled; /* total packets reassembled ok */ u_long ips_fragmented; /* datagrams successfully fragmented */ u_long ips_ofragments; /* output fragments created */ u_long ips_cantfrag; /* don't fragment flag was set, etc. 
*/ u_long ips_badoptions; /* error in option processing */ u_long ips_noroute; /* packets discarded due to no route */ u_long ips_badvers; /* ip version != 4 */ u_long ips_rawout; /* total raw ip packets generated */ u_long ips_toolong; /* ip length > max ip packet size */ u_long ips_notmember; /* multicasts for unregistered grps */ u_long ips_nogif; /* no match gif found */ u_long ips_badaddr; /* invalid address on header */ }; #ifdef _KERNEL /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ #define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ struct ip; struct inpcb; struct route; struct sockopt; extern struct ipstat ipstat; #ifndef RANDOM_IP_ID extern u_short ip_id; /* ip packet ctr, for ids */ #endif extern int ip_defttl; /* default IP ttl */ extern int ipforwarding; /* ip forwarding */ extern struct route ipforward_rt; /* ip forwarding cached route */ extern u_char ip_protox[]; extern struct socket *ip_rsvpd; /* reservation protocol daemon */ extern struct socket *ip_mrouter; /* multicast routing daemon */ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); void ip_freemoptions(struct ip_moptions *); void ip_init(void); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int ip_output(struct mbuf *, - struct mbuf *, struct route *, int, struct ip_moptions *); + struct mbuf *, struct route *, int, struct ip_moptions *, + struct inpcb *); struct in_ifaddr * ip_rtaddr(struct in_addr, struct route *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_slowtimo(void); struct mbuf * ip_srcroute(void); void ip_stripoptions(struct mbuf *, struct mbuf *); #ifdef RANDOM_IP_ID u_int16_t ip_randomid(void); #endif int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); void rip_input(struct mbuf *, int); int rip_output(struct mbuf *, struct socket *, u_long); void ipip_input(struct mbuf *, int); void rsvp_input(struct mbuf *, int); int ip_rsvp_init(struct socket *); int ip_rsvp_done(void); int ip_rsvp_vif_init(struct socket *, struct sockopt *); int ip_rsvp_vif_done(struct socket *, struct sockopt *); void ip_rsvp_force_done(struct socket *); #ifdef IPDIVERT void div_init(void); void div_input(struct mbuf *, int); void divert_packet(struct mbuf *m, int incoming, int port, int rule); extern struct pr_usrreqs div_usrreqs; #endif void in_delayed_cksum(struct mbuf *m); #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 105193) +++ head/sys/netinet/raw_ip.c (revision 105194) @@ -1,732 +1,725 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr; ip_dn_ctl_t *ip_dn_ctl_ptr; /* * Nominal space allocated to a raw ip socket. */ #define RIPSNDQ 8192 #define RIPRCVQ 8192 /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ void rip_init() { INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); LIST_INIT(&ripcb); ripcbinfo.listhead = &ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); } static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. 
*/ void rip_input(m, off) struct mbuf *m; int off; { register struct ip *ip = mtod(m, struct ip *); register struct inpcb *inp; struct inpcb *last = 0; struct mbuf *opts = 0; int proto = ip->ip_p; ripsrc.sin_addr = ip->ip_src; LIST_FOREACH(inp, &ripcb, inp_list) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; if (inp->inp_laddr.s_addr && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); int policyfail = 0; if (n != NULL) { #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(n, last->inp_socket)) { policyfail = 1; ipsecstat.in_polvio++; /* do not inject data to pcb */ } #endif /*IPSEC*/ #ifdef MAC if (policyfail == 0 && mac_check_socket_deliver(last->inp_socket, n) != 0) policyfail = 1; #endif } if (policyfail) m_freem(n); else if (n) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip_savecontrol(last, &opts, ip, n); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); } else sorwakeup(last->inp_socket); opts = 0; } } last = inp; } if (last) { #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(m, last->inp_socket)) { m_freem(m); ipsecstat.in_polvio++; ipstat.ips_delivered--; /* do not inject data to pcb */ return; } #endif /*IPSEC*/ #ifdef MAC if (mac_check_socket_deliver(last->inp_socket, m) != 0) { m_freem(m); ipstat.ips_delivered--; return; } #endif if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip_savecontrol(last, &opts, ip, m); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); } else sorwakeup(last->inp_socket); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } } /* * Generate IP header and pass packet to ip_output. * Tack on options user may have setup with control call. */ int rip_output(m, so, dst) struct mbuf *m; struct socket *so; u_long dst; { register struct ip *ip; register struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; #ifdef MAC mac_create_mbuf_from_socket(so, m); #endif /* * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in.
*/ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_TRYWAIT); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; ip->ip_off = 0; ip->ip_p = inp->inp_ip_p; ip->ip_len = m->m_pkthdr.len; ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } ip = mtod(m, struct ip *); /* don't allow both user specified and setsockopt options, and don't allow packet length sizes that will crash */ if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2)) && inp->inp_options) || (ip->ip_len > m->m_pkthdr.len) || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) { m_freem(m); return EINVAL; } if (ip->ip_id == 0) #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; ipstat.ips_rawout++; } -#ifdef IPSEC - if (ipsec_setsocket(m, so) != 0) { - m_freem(m); - return ENOBUFS; - } -#endif /*IPSEC*/ - return (ip_output(m, inp->inp_options, &inp->inp_route, flags, - inp->inp_moptions)); + inp->inp_moptions, inp)); } /* * Raw IP socket option processing. */ int rip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: if (IPFW_LOADED) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_GET: if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: error = ip_mrouter_get(so, sopt); break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: if (IPFW_LOADED) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = ip_rsvp_done(); break; /* XXX - should be combined */ case IP_RSVP_VIF_ON: error = ip_rsvp_vif_init(so, sopt); break; case IP_RSVP_VIF_OFF: error = ip_rsvp_vif_done(so, sopt); break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: error = ip_mrouter_set(so, sopt); break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, * and calls in_ifadown() to remove all routes corresponding to that address. * It also receives the PRC_IFUP messages from if_up() and reinstalls the * interface routes. 
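[Editorial sketch] rip_ctloutput() above leans on the loadable-module hook pattern declared near the top of this file: ipfw and dummynet publish control functions through pointers when their modules load, and the socket-option path probes the pointer instead of linking against the module. A sketch, with the wrapper name ipfw_sockopt() invented purely for illustration:

	ip_fw_ctl_t *ip_fw_ctl_ptr;		/* NULL until ipfw registers */
	#define	IPFW_LOADED	(ip_fw_ctl_ptr != NULL)

	static int
	ipfw_sockopt(struct sockopt *sopt)
	{
		if (!IPFW_LOADED)		/* module absent: no such option */
			return (ENOPROTOOPT);
		return (ip_fw_ctl_ptr(sopt));
	}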
*/ void rip_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* * in_ifscrub kills the interface route. */ in_ifscrub(ia->ia_ifp, ia); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); break; } } break; case PRC_IFUP: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) return; flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; } } u_long rip_sendspace = RIPSNDQ; u_long rip_recvspace = RIPRCVQ; SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error, s; inp = sotoinpcb(so); if (inp) panic("rip_attach"); if (td && (error = suser(td)) != 0) return error; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; s = splnet(); error = in_pcballoc(so, &ripcbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = ip_defttl; return 0; } static int rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); if (inp == 0) panic("rip_detach"); if (so == ip_mrouter) ip_mrouter_done(); ip_rsvp_force_done(so); if (so == ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); return 0; } static int rip_abort(struct socket *so) { soisdisconnected(so); return rip_detach(so); } static int rip_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return rip_abort(so); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) && (addr->sin_family != AF_IMPLINK)) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) return EADDRNOTAVAIL; inp->inp_laddr = addr->sin_addr; return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if ((addr->sin_family != AF_INET) && (addr->sin_family != AF_IMPLINK)) return EAFNOSUPPORT; inp->inp_faddr = addr->sin_addr; soisconnected(so); return 0; } static int rip_shutdown(struct socket *so) { socantsendmore(so); return 0; } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp = sotoinpcb(so); register u_long dst; if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return EISCONN; } dst = inp->inp_faddr.s_addr; } else { if (nam == NULL) { m_freem(m); return ENOTCONN; } dst = 
((struct sockaddr_in *)nam)->sin_addr.s_addr; } return rip_output(m, so, dst); } static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ s = splnet(); gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { if (inp->inp_gencnt <= gencnt) { if (cr_canseesocket(req->td->td_ucred, inp->inp_socket)) continue; inp_list[i++] = inp; } } splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); xig.xig_gen = ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = ripcbinfo.ipi_count; splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &ripcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &ripcbinfo)); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); struct pr_usrreqs rip_usrreqs = { rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, pru_listen_notsupp, rip_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, rip_sockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 105193) +++ head/sys/netinet/tcp_input.c (revision 105194) @@ -1,2790 +1,2790 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include /* for ICMP_BANDLIM */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); static const int tcprexmtthresh = 3; tcp_cc tcp_ccgen; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr 
*); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ (tp)->t_inpcb->in6p_route.ro_rt) \ nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - delayed acks are enabled and * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. */ #define DELAY_ACK(tp) \ (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; /* * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == 0) goto present; /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); FREE(te, M_TSEGQ); /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else sbappend(&so->so_rcv, q->tqe_m); FREE(q, M_TSEGQ); q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. 
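 */

/*
 * A self-contained sketch of the leading-overlap arithmetic in
 * tcp_reass() above, assuming 32-bit sequence numbers; the helper
 * name is invented.  Given a queued segment [pseq, pseq + plen) and a
 * new segment starting at seq, it returns how many leading bytes of
 * the new segment are already held; the conversion to int handles
 * sequence-number wraparound exactly as the kernel code does.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;

static int
lead_overlap(tcp_seq pseq, int plen, tcp_seq seq)
{
	int i = (int)(pseq + (tcp_seq)plen - seq);

	return (i > 0 ? i : 0);	/* i >= *tlenp means drop whole segment */
}

/*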
*/ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao *taop; /* pointer to our TAO cache entry */ struct rmxp_tao tao_noncached; /* in case there's no cached entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef MAC int error; #endif /* Grab info from MT_TAG mbufs prepended to the chain. */ for (;m && m->m_type == MT_TAG; m = m->m_next) { - if (m->m_tag_id == PACKET_TAG_IPFORWARD) + if (m->_m_tag_id == PACKET_TAG_IPFORWARD) next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; } #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { /* * Checksum extended TCP header and data. 
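 */

/*
 * A minimal sketch of the ones-complement sum that in_cksum()
 * computes just below over the pseudo-header plus segment; a zero
 * result in th_sum means the segment verified.  This is an
 * assumption-level restatement, not the kernel routine: it uses
 * memcpy() to sidestep alignment and zero-pads the trailing odd
 * byte per RFC 1071.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint16_t
cksum16(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;
	uint16_t w;

	while (len > 1) {
		memcpy(&w, p, 2);		/* alignment-safe load */
		sum += w;
		p += 2;
		len -= 2;
	}
	if (len) {				/* zero-pad the odd byte */
		uint8_t tail[2] = { *p, 0 };
		memcpy(&w, tail, 2);
		sum += w;
	}
	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

/*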
*/ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: /* IPFIREWALL_FORWARD section */ if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */ /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, 1, m->m_pkthdr.rcvif); } } else { if (isipv6) inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 1, m->m_pkthdr.rcvif); else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } #ifdef IPSEC if (isipv6) { if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { ipsec6stat.in_polvio++; goto drop; } } else { if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { ipsecstat.in_polvio++; goto drop; } } #endif /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. 
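 */

/*
 * The blackhole policy applied just below, restated as a predicate
 * (an illustrative sketch; the helper name is invented): level 1
 * suppresses the RST for refused SYNs only, while level 2 -- and,
 * via the switch default, any higher level -- drops every segment
 * arriving at a closed port.
 */
static int
blackhole_drop(int blackhole_level, int is_syn)
{
	return (blackhole_level > 1 || (blackhole_level == 1 && is_syn));
}

/*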
*/ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if (thflags & TH_SYN) log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport)); break; case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; /* Unscale the window into a 32-bit value. */ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; else tiwin = th->th_win; so = inp->inp_socket; #ifdef MAC error = mac_check_socket_deliver(so, m); if (error) goto drop; #endif if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { struct in_conninfo inc; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* skip if this isn't a listen socket */ if ((so->so_options & SO_ACCEPTCONN) == 0) goto after_listen; #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; inc.inc6_route.ro_rt = NULL; /* XXX */ } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; inc.inc_route.ro_rt = NULL; /* XXX */ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache will free mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; /* * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled * until the _second_ ACK is received: * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. 
* rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, * move to ESTAB, set snd_wnd to tiwin. */ tp->snd_wnd = tiwin; /* unscaled */ goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { tcp_dooptions(&to, optp, optlen, 1); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ KASSERT(headlocked, ("headlocked")); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Segment passed TAO tests. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->snd_wnd = tiwin; tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * If there is a FIN, or if there is data and the * connection is local, then delay SYN,ACK(SYN) in * the hope of piggy-backing it on a response * segment. Otherwise must send ACK now in case * the other side is slow starting. 
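 */

/*
 * The DELAY_ACK() policy used just below, restated as a function
 * (illustrative only; the helper name is invented): delay the ACK
 * only when delayed ACKs are enabled, no delack timer is already
 * pending, and the last ACK sent did not advertise a zero-sized
 * window.
 */
static int
should_delay_ack(int delack_enabled, int delack_pending, int rxwin0sent)
{
	return (delack_enabled && !delack_pending && !rxwin0sent);
}

/*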
*/ if (DELAY_ACK(tp) && ((thflags & TH_FIN) || (tlen != 0 && ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr)))))) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); tp->t_flags |= TF_NEEDSYN; } else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; } goto drop; } after_listen: /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options. * XXX this is traditional behavior, may need to be cleaned up. */ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = to.to_requested_s_scale; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & (TOF_CC|TOF_CCNEW)) tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && /* * Using the CC option is compulsory if once started: * the segment is OK if no T/TCP was negotiated or * if the segment has a CC option equal to CCrecv */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && tp->t_dupacks < tcprexmtthresh) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt.
* * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); tp->snd_una = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); INP_UNLOCK(inp); return; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ if (so->so_state & SS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ sbappend(&so->so_rcv, m); } sorwakeup(so); if (DELAY_ACK(tp)) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } INP_UNLOCK(inp); return; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); } if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, * do not send a RST. This may be a retransmission * from the other side after our earlier ACK was lost. 
* Our new SYN, when it arrives, will serve as the * needed ACK. */ if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } if (thflags & TH_RST) { if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really * matches our SYN. If not, just drop it as old * duplicate, but send an RST if we're still playing * by the old rules. If no CC.ECHO option, make sure * we don't get fooled into using T/TCP. */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } } else tp->t_flags &= ~TF_RCVD_CC; tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC mac_set_socket_peer_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ if (taop->tao_ccsent == 0) taop->tao_ccsent = to.to_ccecho; tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ taop->tao_cc = to.to_cc; tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } tp->t_flags |= TF_NEEDSYN; } else tp->t_state = TCPS_SYN_RECEIVED; } else { /* CC.NEW or no option => invalidate cache */ taop->tao_cc = 0; tp->t_state = TCPS_SYN_RECEIVED; } } trimthenstep6: /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. 
* If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * if segment contains a SYN and CC [not CC.NEW] option: * if state == TIME_WAIT and connection duration > MSL, * drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN, and can implicitly * ack the FIN (and data) in retransmission queue. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: if ((thflags & TH_SYN) && (to.to_flags & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); goto findpcb; } else goto drop; } break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall through and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * If we have multiple segments in flight, the initial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337.
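 */

/*
 * A sketch of the RST acceptance test applied just below, assuming
 * 32-bit sequence numbers (helper name invented; the SEQ_* macro
 * definitions are repeated here for self-containment): per RFC 793
 * p. 37 the sequence number must fall inside the receive window,
 * measured against last_ack_sent rather than rcv_nxt to allow for
 * delayed ACKs.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;
#define	SEQ_GEQ(a, b)	((int)((a) - (b)) >= 0)
#define	SEQ_LT(a, b)	((int)((a) - (b)) < 0)

static int
rst_acceptable(tcp_seq seq, tcp_seq last_ack_sent, uint32_t rcv_wnd)
{
	return (SEQ_GEQ(seq, last_ack_sent) &&
	    SEQ_LT(seq, last_ack_sent + rcv_wnd));
}

/*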
*/ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: tp = tcp_close(tp); break; case TCPS_TIME_WAIT: break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; goto dropafterack; } } /* * T/TCP mechanism * If T/TCP was negotiated and the segment doesn't have CC, * or if its CC is wrong then drop the segment. * RST segments do not have to comply with this. */ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) goto dropafterack; /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. 
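 */

/*
 * The trailing-trim computation performed just below, as a
 * self-contained sketch (invented helper name): how many bytes of
 * [seq, seq + tlen) land beyond the right edge of the receive window
 * [rcv_nxt, rcv_nxt + rcv_wnd)?  Modular sequence arithmetic again
 * relies on the conversion to int.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;

static int
trailing_overrun(tcp_seq seq, int tlen, tcp_seq rcv_nxt, uint32_t rcv_wnd)
{
	int todrop = (int)(seq + (tcp_seq)tlen - (rcv_nxt + rcv_wnd));

	return (todrop > 0 ? todrop : 0);
}

/*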
*/ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* * Upon successful completion of 3-way handshake, * update cache.CC if it was undefined, pass any queued * data to the user, and advance state appropriately. */ if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && taop->tao_cc == 0) taop->tao_cc = tp->cc_recv; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. 
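 */

/*
 * The acceptable-ACK range from the comment above, restated as a
 * sketch (invented helper; macros repeated for self-containment):
 * an ACK advances the left edge only when
 * snd_una < th_ack <= snd_max.  At or below snd_una it is a
 * duplicate; above snd_max it acknowledges data never sent.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;
#define	SEQ_GT(a, b)	((int)((a) - (b)) > 0)
#define	SEQ_LEQ(a, b)	((int)((a) - (b)) <= 0)

static int
ack_acceptable(tcp_seq th_ack, tcp_seq snd_una, tcp_seq snd_max)
{
	return (SEQ_GT(th_ack, snd_una) && SEQ_LEQ(th_ack, snd_max));
}

/*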
*/ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (tcp_do_newreno && SEQ_LT(th->th_ack, tp->snd_recover)) { /* False retransmit, should not * cut window */ tp->snd_cwnd += tp->t_maxseg; tp->t_dupacks = 0; (void) tcp_output(tp); goto drop; } if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno) { int is_partialack = SEQ_LT(th->th_ack, tp->snd_recover); if (tp->t_dupacks >= tcprexmtthresh) { if (is_partialack) { tcp_newreno_partial_ack(tp, th); } else { /* * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } /* * Reset dupacks, except on partial acks in * fast recovery. */ if (!(tp->t_dupacks >= tcprexmtthresh && is_partialack)) tp->t_dupacks = 0; } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; } if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling?
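 */

/*
 * A sketch of the window cut made on the third duplicate ACK in the
 * fast-retransmit block above (invented helper name): ssthresh drops
 * to half the effective window, floored at two segments, and cwnd
 * collapses to one segment so only the missing packet is resent; the
 * caller then re-inflates cwnd to ssthresh + dupacks * maxseg.
 */
#include <stdint.h>

static void
enter_fast_retransmit(uint32_t snd_wnd, uint32_t snd_cwnd,
    uint32_t t_maxseg, uint32_t *ssthresh, uint32_t *cwnd)
{
	uint32_t win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) /
	    2 / t_maxseg;

	if (win < 2)
		win = 2;
	*ssthresh = win * t_maxseg;
	*cwnd = t_maxseg;		/* retransmit just one segment */
}

/*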
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } } process_ACK: acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; /* * If t_dupacks != 0 here, it indicates that we are still * in NewReno fast recovery mode, so we leave the congestion * window alone. */ if (!tcp_do_newreno || tp->t_dupacks == 0) tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } sowwakeup(so); tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever.
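 */

/*
 * The congestion-window growth applied in the block above, as a
 * sketch (invented helper name): one maxseg per ACK during slow
 * start, roughly maxseg per round trip (maxseg^2 / cwnd per ACK) in
 * congestion avoidance, clamped at the largest window the peer's
 * scale factor permits (TCP_MAXWIN << snd_scale above).
 */
#include <stdint.h>

static uint32_t
grow_cwnd(uint32_t cw, uint32_t ssthresh, uint32_t maxseg,
    uint32_t cwnd_clamp)
{
	uint32_t incr = maxseg;

	if (cw > ssthresh)
		incr = incr * incr / cw;	/* congestion avoidance */
	return (cw + incr < cwnd_clamp ? cw + incr : cwnd_clamp);
}

/*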
*/ if (so->so_state & SS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= SS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. 
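 */

/*
 * The out-of-band mark computed above, restated (invented helper
 * name): the urgent byte sits (rcv_up - rcv_nxt) bytes past what is
 * already queued in the receive buffer; a mark of zero means the
 * next byte read is at the mark, i.e. SS_RCVATMARK.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;

static uint32_t
oob_mark(uint32_t sb_cc, tcp_seq rcv_up, tcp_seq rcv_nxt)
{
	return (sb_cc + (rcv_up - rcv_nxt) - 1);
}

/*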
*/ if (th->th_urp <= (u_long)tlen #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, th, m, drop_hdrlen); /* hdr drop is delayed */ } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); if (so->so_state & SS_CANTRCVMORE) m_freem(m); else sbappend(&so->so_rcv, m); sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); /* For transaction client, force ACK now. 
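 */

/*
 * The TIME_WAIT timer selection from the FIN_WAIT_2 case above, as a
 * sketch (invented helper; callout plumbing elided): a T/TCP
 * connection (cc_recv != 0) younger than one MSL gets the truncated
 * wait of RFC 1644, anything else the classic 2 * MSL.
 */
static int
time_wait_ticks(int cc_recv, int age_ticks, int msl_ticks,
    int t_rxtcur, int twtrunc)
{
	if (cc_recv != 0 && age_ticks < msl_ticks)
		return (t_rxtcur * twtrunc);	/* shortened TIME_WAIT */
	return (2 * msl_ticks);
}

/*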
*/ tp->t_flags |= TF_ACKNOW; } else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); break; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); INP_UNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); return; dropwithreset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); m_freem(m); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Parse TCP options and place in tcpopt. 
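*/
/*
 * Illustrative sketch (editor's addition): the SYN-RECEIVED test above
 * accepts an ACK only when it falls in the window
 * snd_una <= ack <= snd_max; anything else draws a RST, which is what
 * breaks the "LAND" loop.  Standalone version:
 */
#include <stdio.h>
#include <stdint.h>

#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

static int
ack_acceptable(uint32_t snd_una, uint32_t snd_max, uint32_t ack)
{
	return (!(SEQ_GT(snd_una, ack) || SEQ_GT(ack, snd_max)));
}

int
main(void)
{
	printf("%d\n", ack_acceptable(100, 200, 150));	/* 1: in window */
	printf("%d\n", ack_acceptable(100, 200, 50));	/* 0: send RST */
	printf("%d\n", ack_acceptable(100, 200, 300));	/* 0: send RST */
	return (0);
}
/*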
*/ static void tcp_dooptions(to, cp, cnt, is_syn) struct tcpopt *to; u_char *cp; int cnt; int is_syn; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!is_syn) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!is_syn) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; to->to_flags |= TOF_CC; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCNEW; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCECHO; bcopy((char *)cp + 2, (char *)&to->to_ccecho, sizeof(to->to_ccecho)); to->to_ccecho = ntohl(to->to_ccecho); break; default: continue; } } } /* * Pull the out-of-band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be dropped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 32). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta.
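*/
/*
 * Illustrative sketch (editor's addition, simplified from the option
 * loop in tcp_dooptions() above): walking a raw TCP options buffer.
 * EOL ends the list, NOP is a one-byte pad, and every other option
 * carries a length byte that must stay within the remaining space --
 * the same bounds checks made above.
 */
#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1
#define TCPOPT_MAXSEG	2

int
main(void)
{
	/* NOP, NOP, MSS option (kind 2, len 4, value 1460) */
	uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };
	int cnt = (int)sizeof(opts), optlen;
	uint8_t *cp = opts;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		int opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;	/* malformed: stop parsing */
		}
		if (opt == TCPOPT_MAXSEG && optlen == 4)
			printf("mss = %d\n", (cp[2] << 8) | cp[3]);
	}
	return (0);
}
/*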
*/ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * NOTE that this routine is only called when we process an incoming * segment, for outgoing segments only tcp_mssopt is called. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { register struct rtentry *rt; struct ifnet *ifp; register int rtt, mss; u_long bufsize; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct rmxp_tao *taop; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else rt = tcp_rtlookup(&inp->inp_inc); if (rt == NULL) { tp->t_maxopd = tp->t_maxseg = isipv6 ? 
tcp_v6mssdflt : tcp_mssdflt; return; } ifp = rt->rt_ifp; so = inp->inp_socket; taop = rmx_taop(rt->rt_rmx); /* * Offer == -1 means that we didn't receive SYN yet, * use the cached value in that case. */ if (offer == -1) offer = taop->tao_mssopt; /* * Offer == 0 means that there was no MSS on the SYN segment, * in this case we use tcp_mssdflt. */ if (offer == 0) offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; else /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if all * the option space is used (40 bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); taop->tao_mssopt = offer; /* * While we're here, check if there's an initial rtt * or rttvar. Convert from the route-table units * to scaled multiples of the slow timeout timer. */ if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { /* * XXX the lock bit for RTT indicates that the value * is also a minimum value; this is subject to time. */ if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } /* * if there's an mtu associated with the route, use it; * otherwise, use the link mtu. */ if (rt->rt_rmx.rmx_mtu) mss = rt->rt_rmx.rmx_mtu - min_protoh; else { if (isipv6) { mss = nd_ifinfo[rt->rt_ifp->if_index].linkmtu - min_protoh; if (!in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else { mss = ifp->if_mtu - min_protoh; if (!in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * In case of T/TCP, origoffer==-1 indicates that no segments * were received yet. In this case we just guess, otherwise * we do the same as before T/TCP. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif /* * If there's a pipesize, change the socket buffer * to that size. Make the socket buffers an integral * number of mss units; if the mss is larger than * the socket buffer, decrease the mss.
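*/
/*
 * Illustrative sketch (editor's addition, with assumed example values):
 * the MSS arithmetic above in miniature.  For IPv4 over Ethernet,
 * min_protoh is the 40-byte IP+TCP header, so a 1500-byte MTU yields
 * 1460; the result is capped by the peer's offer and rounded down to a
 * multiple of MCLBYTES (2048 here) only when it exceeds one cluster.
 */
#include <stdio.h>

#define MCLBYTES 2048

static int
pick_mss(int mtu, int min_protoh, int offer)
{
	int mss = mtu - min_protoh;

	if (offer > 0 && offer < mss)
		mss = offer;
	if (mss > MCLBYTES)		/* round down to whole clusters */
		mss &= ~(MCLBYTES - 1);
	return (mss);
}

int
main(void)
{
	printf("%d\n", pick_mss(1500, 40, 0));		/* 1460 */
	printf("%d\n", pick_mss(9000, 40, 0));		/* 8192 */
	printf("%d\n", pick_mss(1500, 40, 536));	/* 536  */
	return (0);
}
/*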
*/ #ifdef RTV_SPIPE if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) #endif bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve(&so->so_snd, bufsize, so, NULL); } tp->t_maxseg = mss; #ifdef RTV_RPIPE if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) #endif bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve(&so->so_rcv, bufsize, so, NULL); } /* * Set the slow-start flight size depending on whether this * is a local network or not. */ if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; if (rt->rt_rmx.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(tp) struct tcpcb *tp; { struct rtentry *rt; #ifdef INET6 int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); else rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); if (rt == NULL) return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt); return (rt->rt_ifp->if_mtu - min_protoh); } /* * When a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to th_ack, this forces the retransmission timer * to be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on the fact that tp->snd_una * has not been updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); } Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 105193) +++ head/sys/netinet/tcp_output.c (revision 105194) @@ -1,1010 +1,1001 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #ifdef notyet extern struct mbuf *m_copypack(); #endif int path_mtu_discovery = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, &path_mtu_discovery, 1, "Enable Path MTU Discovery"); int ss_fltsz = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, &ss_fltsz, 1, "Slow start flight size"); int ss_fltsz_local = 4; SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, &ss_fltsz_local, 1, "Slow start flight size for local networks"); int tcp_do_newreno = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, 0, "Enable NewReno Algorithms"); /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, win; int off, flags, error; struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; #if 0 int maxburst = TCP_MAXBURST; #endif struct rmxp_tao *taop; struct rmxp_tao tao_noncached; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif #ifndef INET6 mtx_assert(&tp->t_inpcb->inp_mtx, MA_OWNED); #endif /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. * * Set the slow-start flight size depending on whether * this is a local network or not. 
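*/
/*
 * Illustrative sketch (editor's addition): the idle test above in
 * isolation.  If the connection has been quiet for at least one
 * retransmit timeout, the ACK clock is gone, so the congestion window
 * is pulled back to a small number of segments (ss_fltsz) before
 * sending again.  The helper name is invented for illustration.
 */
#include <stdio.h>

static long
restart_cwnd(long cwnd, long maxseg, int ss_fltsz,
    int idle, int ticks, int t_rcvtime, int t_rxtcur)
{
	if (idle && (ticks - t_rcvtime) >= t_rxtcur)
		cwnd = maxseg * ss_fltsz;	/* slow start over */
	return (cwnd);
}

int
main(void)
{
	/* a busy connection keeps its window ... */
	printf("%ld\n", restart_cwnd(32768, 1460, 1, 0, 1000, 990, 300));
	/* ... a long-idle one restarts at one segment */
	printf("%ld\n", restart_cwnd(32768, 1460, 1, 1, 1000, 500, 300));
	return (0);
}
/*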
*/ int ss = ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) ss = ss_fltsz_local; } else #endif /* INET6 */ if (in_localaddr(tp->t_inpcb->inp_faddr)) ss = ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); win = min(win, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_force) { if (win == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; win = 1; } else { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when tcp opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. */ len = (long)ulmin(so->so_snd.sb_cc, win) - off; if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT && taop->tao_ccsent == 0) return 0; } /* * Be careful not to send data and/or FIN on SYN segments * in cases when no CC option will be sent. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (win == 0) { callout_stop(tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!callout_active(tp->tt_persist)) tcp_setpersist(tp); } } /* * len will be >= 0 after this point. 
Truncate to the maximum * segment length and ensure that FIN is removed if the length * no longer contains the last data byte. */ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; win = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_force) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (win > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); if (adv >= (long) (2 * tp->t_maxseg)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * callout_active(tp->tt_persist) * is true when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * callout_active(tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. 
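*/
/*
 * Illustrative sketch (editor's addition, simplified): the sender-side
 * silly window avoidance rules listed above collected into a single
 * predicate.  "len" is what could be sent now; transmission happens
 * only when at least one of the listed conditions holds.  All names
 * here are invented for illustration.
 */
#include <stdio.h>

static int
should_send(long len, long maxseg, long off, long sb_cc,
    int idle_or_nodelay, int forced, long max_sndwnd, int retransmitting)
{
	if (len == maxseg)
		return (1);	/* full-sized segment */
	if (idle_or_nodelay && len + off >= sb_cc)
		return (1);	/* last buffer of a write, no Nagle delay */
	if (forced)
		return (1);	/* persist probe */
	if (max_sndwnd > 0 && len >= max_sndwnd / 2)
		return (1);	/* half the peer's best window */
	if (retransmitting)
		return (1);
	return (0);
}

int
main(void)
{
	printf("%d\n", should_send(1460, 1460, 0, 4096, 0, 0, 65535, 0));
	printf("%d\n", should_send(100, 1460, 0, 4096, 0, 0, 65535, 0));
	return (0);
}
/*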
*/ return (0); send: /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if ((tp->t_flags & TF_NOOPT) == 0) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (flags & TH_RST) == 0 && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. */ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (flags & TH_RST) == 0) { switch (flags & (TH_SYN|TH_ACK)) { /* * This is a normal ACK, send CC if we received CC before * from our peer. */ case TH_ACK: if (!(tp->t_flags & TF_RCVD_CC)) break; /*FALLTHROUGH*/ /* * We can only get here in T/TCP's SYN_SENT* state, when * we're a sending a non-SYN segment without waiting for * the ACK of our SYN. A check above assures that we only * do this if our peer understands T/TCP. */ case 0: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is our initial SYN, check whether we have to use * CC or CC.new. */ case TH_SYN: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? TCPOPT_CCNEW : TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is a SYN,ACK; send CC and CC.echo if we received * CC from our peer. */ case (TH_SYN|TH_ACK): if (tp->t_flags & TF_RCVD_CC) { opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CCECHO; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_recv); optlen += 4; } break; } } hdrlen += optlen; #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_hdrsiz_tcp(tp); #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. 
* Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxopd) { /* * If there is still more to send, don't close the connection. */ flags &= ~TH_FIN; len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { if (tp->t_force && len == 1) tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, (int)len, max_linkhdr + hdrlen)) == 0) { error = ENOBUFS; goto out; } /* * m_copypack left space for our hdr; use it. */ m->m_len += hdrlen; m->m_data -= hdrlen; #else MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; goto out; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); error = ENOBUFS; goto out; } } #endif m->m_data += max_linkhdr; m->m_len = hdrlen; if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(so->so_snd.sb_mb, off, (int) len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; goto out; } } #endif /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; } else { if (tp->t_flags & TF_ACKNOW) tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcpstat.tcps_sndurg++; else tcpstat.tcps_sndwinup++; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_create_mbuf_from_socket(so, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcp_fillheaders(tp, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); /* this picks up the pseudo header (w/o the length) */ tcp_fillheaders(tp, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) 
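*/
/*
 * Illustrative sketch (editor's addition, with assumed numbers): the
 * option-space adjustment made earlier in this routine.  t_maxopd
 * limits data plus TCP/IP options, so when options grow, the data must
 * shrink and another pass ("sendalot") moves the remainder.
 */
#include <stdio.h>

int
main(void)
{
	long len = 1460, t_maxopd = 1460;
	int optlen = 12, ipoptlen = 0, sendalot = 0;

	if (len + optlen + ipoptlen > t_maxopd) {
		len = t_maxopd - optlen - ipoptlen;
		sendalot = 1;	/* leftover data forces another segment */
	}
	printf("len=%ld sendalot=%d\n", len, sendalot);	/* 1448, 1 */
	return (0);
}
/*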
*/ if (len || (flags & (TH_SYN|TH_FIN)) || callout_active(tp->tt_persist)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; th->th_win = htons((u_short) (win>>tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data then can be buffered prior to transmitting on * the connection. */ if (win == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen + len); else #endif /* INET6 */ { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (len + optlen) th->th_sum = in_addword(th->th_sum, htons((u_short)(optlen + len))); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ if (!callout_active(tp->tt_rexmt) && tp->snd_nxt != tp->snd_una) { if (callout_active(tp->tt_persist)) { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. 
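*/
/*
 * Illustrative sketch (editor's addition): the advertised-window
 * calculation above.  Small windows are rounded down to zero
 * (receiver-side silly window avoidance), the window is never pulled
 * back below what was already advertised, and it is clamped to what
 * the scale factor can express.  Parameter names are invented.
 */
#include <stdio.h>

#define TCP_MAXWIN 65535L

static long
calc_window(long win, long sb_hiwat, long maxseg,
    long rcv_adv_minus_nxt, int rcv_scale)
{
	if (win < sb_hiwat / 4 && win < maxseg)
		win = 0;			/* too small to be useful */
	if (win < rcv_adv_minus_nxt)
		win = rcv_adv_minus_nxt;	/* never shrink the window */
	if (win > TCP_MAXWIN << rcv_scale)
		win = TCP_MAXWIN << rcv_scale;
	return (win);
}

int
main(void)
{
	printf("%ld\n", calc_window(500, 65536, 1460, 0, 0));	  /* 0 */
	printf("%ld\n", calc_window(500, 65536, 1460, 2000, 0)); /* 2000 */
	printf("%ld\n", calc_window(300000, 65536, 1460, 0, 1)); /* 131070 */
	return (0);
}
/*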
*/ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #endif /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, tp->t_inpcb->in6p_route.ro_rt ? tp->t_inpcb->in6p_route.ro_rt->rt_ifp : NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ -#ifdef IPSEC - if (ipsec_setsocket(m, so) != 0) { - m_freem(m); - error = ENOBUFS; - goto out; - } -#endif /*IPSEC*/ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->in6p_route, - (so->so_options & SO_DONTROUTE), NULL, NULL); + (so->so_options & SO_DONTROUTE), NULL, NULL, + tp->t_inpcb); } else #endif /* INET6 */ { struct rtentry *rt; ip->ip_len = m->m_pkthdr.len; #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, tp->t_inpcb->in6p_route.ro_rt ? tp->t_inpcb->in6p_route.ro_rt->rt_ifp : NULL); else #endif /* INET6 */ ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ /* * See if we should do MTU discovery. We do it only if the following * are true: * 1) we have a valid route to the destination * 2) the MTU is not locked (if it is, then discovery has been * disabled) */ if (path_mtu_discovery && (rt = tp->t_inpcb->inp_route.ro_rt) && rt->rt_flags & RTF_UP && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { ip->ip_off |= IP_DF; } -#ifdef IPSEC - ipsec_setsocket(m, so); -#endif /*IPSEC*/ error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, - (so->so_options & SO_DONTROUTE), 0); + (so->so_options & SO_DONTROUTE), 0, tp->t_inpcb); } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. */ if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { /* * No need to check for TH_FIN here because * the TF_SENTFIN flag handles that case. */ if ((flags & TH_SYN) == 0) tp->snd_nxt -= len; } out: if (error == ENOBUFS) { if (!callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); tcp_quench(tp->t_inpcb, 0); return (0); } if (error == EMSGSIZE) { /* * ip_output() will have already fixed the route * for us. tcp_mtudisc() will, as its last action, * initiate retransmission, so it is important to * not do so here. */ tcp_mtudisc(tp->t_inpcb, 0); return 0; } if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } return (error); } tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. 
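*/
/*
 * Illustrative sketch (editor's addition): the error dispositions
 * above as a table.  ENOBUFS is treated as local congestion (quench
 * and rely on the timer), EMSGSIZE hands the new path MTU to
 * tcp_mtudisc(), and transient unreachables merely record a soft
 * error once the connection has seen a SYN.  The helper is invented
 * for illustration.
 */
#include <stdio.h>
#include <errno.h>

static const char *
output_error_action(int error, int have_rcvd_syn)
{
	switch (error) {
	case 0:		return ("sent");
	case ENOBUFS:	return ("quench + retransmit timer");
	case EMSGSIZE:	return ("tcp_mtudisc retransmits");
	case EHOSTUNREACH:
	case ENETDOWN:
		if (have_rcvd_syn)
			return ("record soft error");
		/* FALLTHROUGH */
	default:	return ("report error to user");
	}
}

int
main(void)
{
	printf("%s\n", output_error_action(ENOBUFS, 1));
	printf("%s\n", output_error_action(EHOSTUNREACH, 1));
	printf("%s\n", output_error_action(EHOSTUNREACH, 0));
	return (0);
}
/*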
*/ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~TF_ACKNOW; if (tcp_delack_enabled) callout_stop(tp->tt_delack); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && (!tcp_do_newreno || --maxburst)) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(tp) register struct tcpcb *tp; { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; if (callout_active(tp->tt_rexmt)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } Index: head/sys/netinet/tcp_reass.c =================================================================== --- head/sys/netinet/tcp_reass.c (revision 105193) +++ head/sys/netinet/tcp_reass.c (revision 105194) @@ -1,2790 +1,2790 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include /* for ICMP_BANDLIM */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); static const int tcprexmtthresh = 3; tcp_cc tcp_ccgen; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ (tp)->t_inpcb->in6p_route.ro_rt) \ nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - delayed acks are enabled and * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. */ #define DELAY_ACK(tp) \ (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; /* * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == 0) goto present; /* Allocate a new queue entry. If we can't, just drop the pkt. 
XXX */ MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); FREE(te, M_TSEGQ); /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else sbappend(&so->so_rcv, q->tqe_m); FREE(q, M_TSEGQ); q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? 
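*/
/*
 * Illustrative sketch (editor's addition, much simplified): the heart
 * of tcp_reass() above -- find the first queued segment starting after
 * the new one, then trim any bytes the predecessor already covers.
 * Arrays stand in for the kernel's tseg_qent list, and sequence-number
 * wraparound is ignored for brevity.
 */
#include <stdio.h>
#include <stdint.h>

struct seg { uint32_t seq; int len; };

int
main(void)
{
	struct seg q[] = { { 1000, 100 }, { 1200, 100 } };
	struct seg in = { 1050, 100 };	/* overlaps q[0] by 50 bytes */
	int n = 2, i, overlap;
	struct seg *p = NULL;

	for (i = 0; i < n; i++) {	/* find successor, remember pred */
		if (q[i].seq > in.seq)
			break;
		p = &q[i];
	}
	if (p != NULL) {
		overlap = (int)(p->seq + p->len - in.seq);
		if (overlap >= in.len)
			in.len = 0;		/* complete duplicate */
		else if (overlap > 0) {
			in.seq += overlap;	/* drop covered prefix */
			in.len -= overlap;
		}
	}
	printf("keep seq=%lu len=%d\n", (unsigned long)in.seq, in.len);
	return (0);			/* prints: keep seq=1100 len=50 */
}
/*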
*/ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao *taop; /* pointer to our TAO cache entry */ struct rmxp_tao tao_noncached; /* in case there's no cached entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef MAC int error; #endif /* Grab info from MT_TAG mbufs prepended to the chain. */ for (;m && m->m_type == MT_TAG; m = m->m_next) { - if (m->m_tag_id == PACKET_TAG_IPFORWARD) + if (m->_m_tag_id == PACKET_TAG_IPFORWARD) next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; } #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. 
XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: /* IPFIREWALL_FORWARD section */ if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */ /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, 1, m->m_pkthdr.rcvif); } } else { if (isipv6) inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 1, m->m_pkthdr.rcvif); else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } #ifdef IPSEC if (isipv6) { if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { ipsec6stat.in_polvio++; goto drop; } } else { if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { ipsecstat.in_polvio++; goto drop; } } #endif /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. 
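*/
/*
 * Illustrative sketch (editor's addition): the data-offset sanity
 * check made earlier in tcp_input().  th_off counts 32-bit words, so
 * the header length is th_off << 2 and must cover at least the fixed
 * 20-byte header without running past the segment.
 */
#include <stdio.h>

static int
tcp_off_ok(unsigned th_off, int tlen)
{
	int off = th_off << 2;

	return (off >= 20 && off <= tlen);	/* sizeof(struct tcphdr) */
}

int
main(void)
{
	printf("%d\n", tcp_off_ok(5, 40));	/* 1: no options */
	printf("%d\n", tcp_off_ok(8, 40));	/* 1: 12 bytes of options */
	printf("%d\n", tcp_off_ok(4, 40));	/* 0: shorter than header */
	printf("%d\n", tcp_off_ok(15, 40));	/* 0: runs past segment */
	return (0);
}
/*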
*/ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if (thflags & TH_SYN) log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport)); break; case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; /* Unscale the window into a 32-bit value. */ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; else tiwin = th->th_win; so = inp->inp_socket; #ifdef MAC error = mac_check_socket_deliver(so, m); if (error) goto drop; #endif if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { struct in_conninfo inc; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* skip if this isn't a listen socket */ if ((so->so_options & SO_ACCEPTCONN) == 0) goto after_listen; #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; inc.inc6_route.ro_rt = NULL; /* XXX */ } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; inc.inc_route.ro_rt = NULL; /* XXX */ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache will free mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; /* * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled * until the _second_ ACK is received: * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. 
* rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, * move to ESTAB, set snd_wnd to tiwin. */ tp->snd_wnd = tiwin; /* unscaled */ goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { tcp_dooptions(&to, optp, optlen, 1); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ KASSERT(headlocked, ("headlocked")); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Segment passed TAO tests. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->snd_wnd = tiwin; tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * If there is a FIN, or if there is data and the * connection is local, then delay SYN,ACK(SYN) in * the hope of piggy-backing it on a response * segment. Otherwise must send ACK now in case * the other side is slow starting. 
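 */

/*
 * The decision made just below, condensed into a predicate: hold the
 * SYN,ACK back (delayed ACK) only when a FIN came with the data or the
 * sender looks local, hoping to piggy-back it on a response segment.
 * The parameter names are illustrative, not kernel interfaces.
 */
static int
want_delayed_synack(int delack_allowed, int has_fin, int datalen,
    int peer_is_local)
{
	if (!delack_allowed)
		return (0);		/* delayed ACKs are off */
	return (has_fin || (datalen != 0 && peer_is_local));
}

/*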
*/ if (DELAY_ACK(tp) && ((thflags & TH_FIN) || (tlen != 0 && ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr)))))) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); tp->t_flags |= TF_NEEDSYN; } else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; } goto drop; } after_listen: /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options. * XXX this is tradtitional behavior, may need to be cleaned up. */ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = to.to_requested_s_scale; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & (TOF_CC|TOF_CCNEW)) tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && /* * Using the CC option is compulsory if once started: * the segment is OK if no T/TCP was negotiated or * if the segment has a CC option equal to CCrecv */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && tp->t_dupacks < tcprexmtthresh) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. 
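 */

/*
 * The header-prediction gate above, restated as a pure function over a
 * simplified stand-in for the tcpcb.  The timestamp and T/TCP clauses
 * are omitted for brevity; only the core "pure ACK or pure in-order
 * data, window unchanged, not retransmitting" test is kept.
 */
#include <stdint.h>

struct hp_state {
	uint32_t rcv_nxt;		/* next sequence expected */
	uint32_t snd_wnd;		/* current send window */
	uint32_t snd_nxt, snd_max;	/* retransmission check */
};

static int
header_prediction_ok(const struct hp_state *tp, uint32_t seq,
    uint32_t tiwin, int only_ack_set)
{
	return (only_ack_set &&			  /* no SYN/FIN/RST/URG */
	    seq == tp->rcv_nxt &&		  /* in sequence */
	    tiwin != 0 && tiwin == tp->snd_wnd && /* window unchanged */
	    tp->snd_nxt == tp->snd_max);	  /* nothing being resent */
}

/*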
* * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); tp->snd_una = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); INP_UNLOCK(inp); return; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ if (so->so_state & SS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ sbappend(&so->so_rcv, m); } sorwakeup(so); if (DELAY_ACK(tp)) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } INP_UNLOCK(inp); return; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); } if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, * do not send a RST. This may be a retransmission * from the other side after our earlier ACK was lost. 
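 */

/*
 * A sketch of the receive-window computation shown above: take the free
 * space in the receive buffer, but never retreat below what was already
 * advertised (rcv_adv - rcv_nxt).  imax_sk() is a stand-in for the
 * kernel's imax().
 */
static int
imax_sk(int a, int b)
{
	return (a > b ? a : b);
}

static unsigned int
calc_rcv_wnd(int buf_space, unsigned int rcv_adv, unsigned int rcv_nxt)
{
	int win = buf_space;

	if (win < 0)
		win = 0;		/* buffer may be over-committed */
	return (imax_sk(win, (int)(rcv_adv - rcv_nxt)));
}

/*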
* Our new SYN, when it arrives, will serve as the * needed ACK. */ if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } if (thflags & TH_RST) { if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really * matches our SYN. If not, just drop it as old * duplicate, but send an RST if we're still playing * by the old rules. If no CC.ECHO option, make sure * we don't get fooled into using T/TCP. */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } } else tp->t_flags &= ~TF_RCVD_CC; tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC mac_set_socket_peer_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ if (taop->tao_ccsent == 0) taop->tao_ccsent = to.to_ccecho; tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ taop->tao_cc = to.to_cc; tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } tp->t_flags |= TF_NEEDSYN; } else tp->t_state = TCPS_SYN_RECEIVED; } else { /* CC.NEW or no option => invalidate cache */ taop->tao_cc = 0; tp->t_state = TCPS_SYN_RECEIVED; } } trimthenstep6: /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. 
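 */

/*
 * A simplified model of the trimming at trimthenstep6 above: bytes
 * beyond the receive window are chopped off the tail, and a FIN that
 * rode on the trimmed bytes is dropped with them.  The mbuf surgery
 * (m_adj) is reduced to length bookkeeping.
 */
static int
trim_to_window(int *tlen, unsigned int rcv_wnd, int *has_fin)
{
	int todrop = *tlen - (int)rcv_wnd;

	if (todrop <= 0)
		return (0);		/* everything fits */
	*tlen = (int)rcv_wnd;		/* keep only in-window bytes */
	*has_fin = 0;			/* FIN was past the window edge */
	return (todrop);		/* bytes the caller would m_adj() off */
}

/*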
* If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * if segment contains a SYN and CC [not CC.NEW] option: * if state == TIME_WAIT and connection duration > MSL, * drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN, and can implicitly * ack the FIN (and data) in retransmission queue. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: if ((thflags & TH_SYN) && (to.to_flags & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); goto findpcb; } else goto drop; } break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. 
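 */

/*
 * The SEQ_GEQ/SEQ_LT comparisons relied on below operate in a 32-bit
 * circular sequence space.  A self-contained rendering of the idiom,
 * applied to the RST in-window test this comment describes; like the
 * real macros it assumes two's-complement wraparound.
 */
#include <stdint.h>

#define SEQ_LT_SK(a, b)		((int32_t)((a) - (b)) < 0)
#define SEQ_GEQ_SK(a, b)	((int32_t)((a) - (b)) >= 0)

/* Is seq within [last_ack_sent, last_ack_sent + rcv_wnd)? */
static int
rst_seq_acceptable(uint32_t seq, uint32_t last_ack_sent, uint32_t rcv_wnd)
{
	return (SEQ_GEQ_SK(seq, last_ack_sent) &&
	    SEQ_LT_SK(seq, last_ack_sent + rcv_wnd));
}

/*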
*/ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: tp = tcp_close(tp); break; case TCPS_TIME_WAIT: break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; goto dropafterack; } } /* * T/TCP mechanism * If T/TCP was negotiated and the segment doesn't have CC, * or if its CC is wrong then drop the segment. * RST segments do not have to comply with this. */ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) goto dropafterack; /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. 
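 */

/*
 * The PAWS rule above in isolation: a segment whose timestamp is older
 * than ts_recent is dropped, unless ts_recent itself has gone stale
 * (idle past the ~24-day limit), in which case ts_recent is invalidated
 * and the segment kept.  The tick rate here is an assumption for
 * illustration.
 */
#include <stdint.h>

#define PAWS_IDLE_SK	(24 * 24 * 60 * 60 * 1000)	/* ~24 days at 1000 hz */

/* Returns 1 to drop the segment, 0 to keep processing it. */
static int
paws_drop(uint32_t tsval, uint32_t *ts_recent, int now, int ts_recent_age)
{
	if (*ts_recent == 0 || (int32_t)(tsval - *ts_recent) >= 0)
		return (0);		/* timestamp is current */
	if (now - ts_recent_age > PAWS_IDLE_SK) {
		*ts_recent = 0;		/* stale: invalidate, keep segment */
		return (0);
	}
	return (1);			/* genuine PAWS failure */
}

/*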
*/ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* * Upon successful completion of 3-way handshake, * update cache.CC if it was undefined, pass any queued * data to the user, and advance state appropriately. */ if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && taop->tao_cc == 0) taop->tao_cc = tp->cc_recv; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. 
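 */

/*
 * The acceptance test stated above, snd_una < th_ack <= snd_max, written
 * out with the circular-space comparisons: an ACK at or below snd_una is
 * a duplicate, one beyond snd_max acknowledges data never sent.
 */
#include <stdint.h>

enum ack_class { ACK_DUP, ACK_OK, ACK_TOOMUCH };

static enum ack_class
classify_ack(uint32_t th_ack, uint32_t snd_una, uint32_t snd_max)
{
	if ((int32_t)(th_ack - snd_una) <= 0)
		return (ACK_DUP);	/* nothing new acknowledged */
	if ((int32_t)(th_ack - snd_max) > 0)
		return (ACK_TOOMUCH);	/* beyond anything we sent */
	return (ACK_OK);
}

/*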
*/ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (tcp_do_newreno && SEQ_LT(th->th_ack, tp->snd_recover)) { /* False retransmit, should not * cut window */ tp->snd_cwnd += tp->t_maxseg; tp->t_dupacks = 0; (void) tcp_output(tp); goto drop; } if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno) { int is_partialack = SEQ_LT(th->th_ack, tp->snd_recover); if (tp->t_dupacks >= tcprexmtthresh) { if (is_partialack) { tcp_newreno_partial_ack(tp, th); } else { /* * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } /* * Reset dupacks, except on partial acks in * fast recovery. */ if (!(tp->t_dupacks >= tcprexmtthresh && is_partialack)) tp->t_dupacks = 0; } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; } if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? 
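 */

/*
 * The fast-retransmit arithmetic from the duplicate-ACK branch above,
 * pulled out on its own: ssthresh drops to half the flight size (never
 * below two segments), the lost segment is resent with cwnd pinned to a
 * single maxseg, and cwnd is then re-inflated by one maxseg per
 * duplicate ACK.  A sketch of the congestion math only; the actual
 * retransmission is elided.
 */
static void
fast_retransmit_cwnd(unsigned long snd_wnd, unsigned long *snd_cwnd,
    unsigned long *snd_ssthresh, unsigned int maxseg, int dupacks)
{
	unsigned long flight = snd_wnd < *snd_cwnd ? snd_wnd : *snd_cwnd;
	unsigned long win = flight / 2 / maxseg;	/* in segments */

	if (win < 2)
		win = 2;
	*snd_ssthresh = win * maxseg;
	/* ... retransmit one segment with cwnd == maxseg, then: */
	*snd_cwnd = *snd_ssthresh + (unsigned long)maxseg * dupacks;
}

/*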
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } } process_ACK: acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; /* * If t_dupacks != 0 here, it indicates that we are still * in NewReno fast recovery mode, so we leave the congestion * window alone. */ if (!tcp_do_newreno || tp->t_dupacks == 0) tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } sowwakeup(so); tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever.
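 */

/*
 * The congestion-window growth rule implemented above: exponential
 * growth (one maxseg per ACK) while under ssthresh, roughly linear
 * (maxseg per round trip, i.e. maxseg^2/cwnd per ACK) once past it,
 * capped by the peer's scaled maximum window.
 */
static unsigned long
open_cwnd(unsigned long cwnd, unsigned long ssthresh, unsigned int maxseg,
    unsigned long cwnd_cap)
{
	unsigned long incr = maxseg;

	if (cwnd > ssthresh)
		incr = (unsigned long)maxseg * maxseg / cwnd;
	cwnd += incr;
	return (cwnd < cwnd_cap ? cwnd : cwnd_cap);
}

/*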
*/ if (so->so_state & SS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= SS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. 
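 */

/*
 * The window-update acceptance rule from step6 above, as a predicate:
 * take the advertised window only from a segment that is newer by
 * sequence, newer by ACK, or identical on both but advertising more.
 * This ordering keeps stale, reordered segments from shrinking snd_wnd.
 */
#include <stdint.h>

static int
window_update_ok(uint32_t seq, uint32_t ack, uint32_t tiwin,
    uint32_t snd_wl1, uint32_t snd_wl2, uint32_t snd_wnd)
{
	if ((int32_t)(snd_wl1 - seq) < 0)
		return (1);		/* strictly newer sequence */
	if (snd_wl1 != seq)
		return (0);		/* older segment */
	if ((int32_t)(snd_wl2 - ack) < 0)
		return (1);		/* same seq, newer ack */
	return (snd_wl2 == ack && tiwin > snd_wnd);	/* pure window growth */
}

/*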
*/ if (th->th_urp <= (u_long)tlen #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, th, m, drop_hdrlen); /* hdr drop is delayed */ } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); if (so->so_state & SS_CANTRCVMORE) m_freem(m); else sbappend(&so->so_rcv, m); sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); /* For transaction client, force ACK now. 
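 */

/*
 * The fast/slow split for inbound data described above, reduced to its
 * test: a segment bypasses the reassembly queue only when it carries
 * exactly the next expected byte, nothing is already queued, and the
 * handshake is complete.  Out-of-order data instead forces an immediate
 * ACK so the peer's fast retransmit can trigger.
 */
#include <stdint.h>

static int
can_append_directly(uint32_t seq, uint32_t rcv_nxt, int reass_queue_empty,
    int have_established)
{
	return (seq == rcv_nxt && reass_queue_empty && have_established);
}

/*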
*/ tp->t_flags |= TF_ACKNOW; } else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); break; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); INP_UNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); return; dropwithreset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); m_freem(m); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Parse TCP options and place in tcpopt. 
*/ static void tcp_dooptions(to, cp, cnt, is_syn) struct tcpopt *to; u_char *cp; int cnt; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!is_syn) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (! is_syn) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; to->to_flags |= TOF_CC; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCNEW; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCECHO; bcopy((char *)cp + 2, (char *)&to->to_ccecho, sizeof(to->to_ccecho)); to->to_ccecho = ntohl(to->to_ccecho); break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be droped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. 
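 */

/*
 * The same smoothing without the fixed-point tricks: the classic
 * Jacobson/Karels update with alpha = 1/8 for srtt, beta = 1/4 for the
 * mean deviation, and a retransmit timeout of srtt + 4 * rttvar.  A
 * readable model of the shift arithmetic above, not a replacement
 * for it.
 */
static double srtt_sk, rttvar_sk;	/* smoothed rtt and deviation */

static double
rtt_update(double rtt)
{
	if (srtt_sk == 0.0) {			/* first measurement */
		srtt_sk = rtt;
		rttvar_sk = rtt / 2;		/* first rexmt near 3*rtt */
	} else {
		double delta = rtt - srtt_sk;

		srtt_sk += delta / 8;		/* srtt = 7/8 srtt + 1/8 rtt */
		if (delta < 0)
			delta = -delta;
		rttvar_sk += (delta - rttvar_sk) / 4;
	}
	return (srtt_sk + 4 * rttvar_sk);	/* retransmit timeout */
}

/*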
*/ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * NOTE that this routine is only called when we process an incoming * segment, for outgoing segments only tcp_mssopt is called. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { register struct rtentry *rt; struct ifnet *ifp; register int rtt, mss; u_long bufsize; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct rmxp_tao *taop; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else rt = tcp_rtlookup(&inp->inp_inc); if (rt == NULL) { tp->t_maxopd = tp->t_maxseg = isipv6 ? 
tcp_v6mssdflt : tcp_mssdflt; return; } ifp = rt->rt_ifp; so = inp->inp_socket; taop = rmx_taop(rt->rt_rmx); /* * Offer == -1 means that we didn't receive SYN yet, * use cached value in that case; */ if (offer == -1) offer = taop->tao_mssopt; /* * Offer == 0 means that there was no MSS on the SYN segment, * in this case we use tcp_mssdflt. */ if (offer == 0) offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; else /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even is the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); taop->tao_mssopt = offer; /* * While we're here, check if there's an initial rtt * or rttvar. Convert from the route-table units * to scaled multiples of the slow timeout timer. */ if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { /* * XXX the lock bit for RTT indicates that the value * is also a minimum value; this is subject to time. */ if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } /* * if there's an mtu associated with the route, use it * else, use the link mtu. */ if (rt->rt_rmx.rmx_mtu) mss = rt->rt_rmx.rmx_mtu - min_protoh; else { if (isipv6) { mss = nd_ifinfo[rt->rt_ifp->if_index].linkmtu - min_protoh; if (!in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else { mss = ifp->if_mtu - min_protoh; if (!in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * In case of T/TCP, origoffer==-1 indicates, that no segments * were received yet. In this case we just guess, otherwise * we do the same as before T/TCP. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif /* * If there's a pipesize, change the socket buffer * to that size. Make the socket buffers an integral * number of mss units; if the mss is larger than * the socket buffer, decrease the mss. 
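 */

/*
 * The core of the MSS computation above, minus the routing and T/TCP
 * plumbing: start from the path (or link) MTU less the fixed headers,
 * honor the peer's offer, and round down to a multiple of the mbuf
 * cluster size so clusters are filled exactly.  A sketch under those
 * simplifications only.
 */
static int
compute_mss(int mtu, int min_protoh, int offer, int clbytes)
{
	int mss = mtu - min_protoh;

	if (offer > 0 && offer < mss)
		mss = offer;		/* min(mss, offer) */
	if (mss > clbytes)
		mss -= mss % clbytes;	/* round down to cluster multiple */
	return (mss);
}

/*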
*/ #ifdef RTV_SPIPE if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) #endif bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve(&so->so_snd, bufsize, so, NULL); } tp->t_maxseg = mss; #ifdef RTV_RPIPE if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) #endif bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve(&so->so_rcv, bufsize, so, NULL); } /* * Set the slow-start flight size depending on whether this * is a local network or not. */ if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; if (rt->rt_rmx.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(tp) struct tcpcb *tp; { struct rtentry *rt; #ifdef INET6 int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); else rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); if (rt == NULL) return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt); return (rt->rt_ifp->if_mtu - min_protoh); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 105193) +++ head/sys/netinet/tcp_subr.c (revision 105194) @@ -1,1695 +1,1690 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif /*IPSEC*/ #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif #if 0 static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); #endif int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); int tcp_do_rfc1644 = 0; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. 
Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ static int tcp_inflight_enable = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_min = 1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * This is the actual shape of what we allocate using the zone * allocator. Doing it this way allows us to protect both structures * using the same generation count, and also eliminates the overhead * of allocating tcpcbs separately. By hiding the structure here, * we avoid changing most of the rest of the code (although it needs * to be changed, eventually, for greater efficiency). */ #define ALIGNMENT 32 #define ALIGNM1 (ALIGNMENT - 1) struct inp_tp { union { struct inpcb inp; char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; } inp_tp_u; struct tcpcb tcb; struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; struct callout inp_tp_delack; }; #undef ALIGNMENT #undef ALIGNM1 /* * Tcp initialization */ void tcp_init() { int hashsize = TCBHASHSIZE; tcp_ccgen = 1; tcp_cleartaocache(); tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_slop = TCPTV_CPU_VAR; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR syncache_init(); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
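
The tcbhashsize fetched by tcp_init() above comes from the kernel environment, so it can be tuned at boot; a loader.conf line such as net.inet.tcp.tcbhashsize="1024" (the name is taken from the TUNABLE_INT_FETCH call above) selects a larger table. The power-of-two guard can be sketched in plain C as follows; this is only an illustrative userland rendering, not the kernel's powerof2() macro:

#include <stdio.h>

static int powerof2(unsigned int x)
{
	return (x != 0 && (x & (x - 1)) == 0);
}

int main(void)
{
	unsigned int hashsize = 1024;	/* hypothetical tunable value */

	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512;		/* the same safe default the kernel uses */
	}
	printf("tcbhashsize = %u\n", hashsize);
	return (0);
}
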
*/ void tcp_fillheaders(tp, ip_ptr, tcp_ptr) struct tcpcb *tp; void *ip_ptr; void *tcp_ptr; { struct inpcb *inp = tp->t_inpcb; struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; tcp_hdr->th_sum = 0; } else #endif { struct ip *ip = (struct ip *) ip_ptr; ip->ip_vhl = IP_VHL_BORING; ip->ip_tos = 0; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); } tcp_hdr->th_sport = inp->inp_lport; tcp_hdr->th_dport = inp->inp_fport; tcp_hdr->th_seq = 0; tcp_hdr->th_ack = 0; tcp_hdr->th_x2 = 0; tcp_hdr->th_off = 5; tcp_hdr->th_flags = 0; tcp_hdr->th_win = 0; tcp_hdr->th_urp = 0; } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcp_maketemplate(tp) struct tcpcb *tp; { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
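
tcp_fillheaders() above seeds th_sum with the partial checksum of the IPv4 pseudo-header, so that in_cksum() or checksum-offloading hardware only has to add in the TCP header and payload. A rough userland analogue of that seeding step, assuming the usual ones'-complement fold (a sketch, not the kernel's in_pseudo()):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Sum the pseudo-header fields and fold the carries, ones'-complement style. */
static uint16_t pseudo_seed(uint32_t src, uint32_t dst, uint16_t len_proto)
{
	uint32_t sum;

	sum = (src >> 16) + (src & 0xffff) +
	    (dst >> 16) + (dst & 0xffff) + len_proto;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}

int main(void)
{
	uint32_t src = inet_addr("192.0.2.1");	/* example addresses */
	uint32_t dst = inet_addr("192.0.2.2");

	/* Mirrors htons(sizeof(struct tcphdr) + IPPROTO_TCP) above: 20 + 6. */
	printf("th_sum seed = 0x%04x\n", pseudo_seed(src, dst, htons(20 + 6)));
	return (0);
}
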
*/ void tcp_respond(tp, ipgen, th, m, ack, seq, flags) struct tcpcb *tp; void *ipgen; register struct tcphdr *th; register struct mbuf *m; tcp_seq ack, seq; int flags; { register int tlen; int win = 0; struct route *ro = 0; struct route sro; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct route_in6 *ro6 = 0; struct route_in6 sro6; struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp) { if (!(flags & TH_RST)) { win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } #ifdef INET6 if (isipv6) ro6 = &tp->t_inpcb->in6p_route; else #endif /* INET6 */ ro = &tp->t_inpcb->inp_route; } else { #ifdef INET6 if (isipv6) { ro6 = &sro6; bzero(ro6, sizeof *ro6); } else #endif /* INET6 */ { ro = &sro; bzero(ro, sizeof *ro); } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = 0; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; #ifdef MAC if (tp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m); } else { /* * XXXMAC: This will need to call a mac function that * modifies the mbuf label in place for TCP datagrams * not associated with a PCB. */ } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, ro6 && ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif -#ifdef IPSEC - if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { - m_freem(m); - return; - } -#endif #ifdef INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, + tp ? tp->t_inpcb : NULL); if (ro6 == &sro6 && ro6->ro_rt) { RTFREE(ro6->ro_rt); ro6->ro_rt = NULL; } } else #endif /* INET6 */ { - (void) ip_output(m, NULL, ro, ipflags, NULL); + (void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(inp) struct inpcb *inp; { struct inp_tp *it; register struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ it = (struct inp_tp *)inp; tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); callout_init(tp->tt_persist = &it->inp_tp_persist, 0); callout_init(tp->tt_keep = &it->inp_tp_keep, 0); callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); callout_init(tp->tt_delack = &it->inp_tp_delack, 0); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. 
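
The srtt/rttvar initialization in tcp_newtcpcb() above is chosen so that, before any RTT sample arrives, srtt + 4 * rttvar works out to exactly TCPTV_RTOBASE. A worked check, assuming the classic BSD scale factors (t_srtt stored << 5, t_rttvar stored << 4) and a 3-second TCPTV_RTOBASE; the constants here are assumptions for illustration:

#include <stdio.h>

#define TCP_RTT_SHIFT		5	/* assumed: t_srtt scaled by 32 */
#define TCP_RTTVAR_SHIFT	4	/* assumed: t_rttvar scaled by 16 */

int main(void)
{
	int hz = 100;
	int srttbase = 0;		/* TCPTV_SRTTBASE */
	int rtobase = 3 * hz;		/* TCPTV_RTOBASE: 3 seconds in ticks */
	int t_srtt = srttbase;
	int t_rttvar = ((rtobase - srttbase) << TCP_RTTVAR_SHIFT) / 4;

	/* Unscale and combine: srtt + 4 * rttvar, in ticks. */
	int rto = (t_srtt >> TCP_RTT_SHIFT) +
	    4 * (t_rttvar >> TCP_RTTVAR_SHIFT);

	printf("initial RTO = %d ticks (%d ms)\n", rto, rto * 1000 / hz);
	return (0);
}
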
*/ struct tcpcb * tcp_drop(tp, errno) register struct tcpcb *tp; int errno; { struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } /* * Close a TCP control block: * discard all space held by the tcp * discard internet protocol block * wake up any sleepers */ struct tcpcb * tcp_close(tp) register struct tcpcb *tp; { register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ register struct rtentry *rt; int dosavessthresh; /* * Make sure that all of our timers are stopped before we * delete the PCB. */ callout_stop(tp->tt_rexmt); callout_stop(tp->tt_persist); callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as the 16 samples. * 16 samples is enough for the srtt filter to converge * to within 5% of the correct value; fewer samples and * we could save a very bogus rtt. * * Don't update the default route's characteristics and don't * update anything that the user "locked". */ if (tp->t_rttupdated >= 16) { register u_long i = 0; #ifdef INET6 if (isipv6) { struct sockaddr_in6 *sin6; if ((rt = inp->in6p_route.ro_rt) == NULL) goto no_valid_rt; sin6 = (struct sockaddr_in6 *)rt_key(rt); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) goto no_valid_rt; } else #endif /* INET6 */ if ((rt = inp->inp_route.ro_rt) == NULL || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY) goto no_valid_rt; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); if (rt->rt_rmx.rmx_rtt && i) /* * filter this update to half the old & half * the new values, converting scale. * See route.h and tcp_var.h for a * description of the scaling constants. */ rt->rt_rmx.rmx_rtt = (rt->rt_rmx.rmx_rtt + i) / 2; else rt->rt_rmx.rmx_rtt = i; tcpstat.tcps_cachedrtt++; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); if (rt->rt_rmx.rmx_rttvar && i) rt->rt_rmx.rmx_rttvar = (rt->rt_rmx.rmx_rttvar + i) / 2; else rt->rt_rmx.rmx_rttvar = i; tcpstat.tcps_cachedrttvar++; } /* * The old comment here said: * update the pipelimit (ssthresh) if it has been updated * already or if a pipesize was specified & the threshhold * got below half the pipesize. I.e., wait for bad news * before we start updating, then update on both good * and bad news. * * But we want to save the ssthresh even if no pipesize is * specified explicitly in the route, because such * connections still have an implicit pipesize specified * by the global tcp_sendspace. In the absence of a reliable * way to calculate the pipesize, it will have to do. */ i = tp->snd_ssthresh; if (rt->rt_rmx.rmx_sendpipe != 0) dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); else dosavessthresh = (i < so->so_snd.sb_hiwat / 2); if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && i != 0 && rt->rt_rmx.rmx_ssthresh != 0) || dosavessthresh) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; i *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; else rt->rt_rmx.rmx_ssthresh = i; tcpstat.tcps_cachedssthresh++; } } no_valid_rt: /* free the reassembly queue, if any */ while((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); } inp->inp_ppcb = NULL; soisdisconnected(so); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else #endif /* INET6 */ in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); } void tcp_drain() { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb))) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); FREE(te, M_TSEGQ); } } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(inp, error) struct inpcb *inp; int error; { struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return inp; } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tcp_drop(tp, error); return (struct inpcb *)0; } else { tp->t_softerror = error; return inp; } #if 0 wakeup((caddr_t) &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
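
tcp_pcblist() here is the backend of the net.inet.tcp.pcblist sysctl that netstat-style tools consume: one call with a NULL buffer obtains the padded size estimate (the req->oldptr == 0 path above), a second call fills the buffer, and the caller compares the leading and trailing xinpgen generation counts to detect concurrent changes (see the xig_gen re-read at the end of the function below). A hedged userland sketch of that calling pattern; the exact header set may vary and error handling is kept minimal:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct xinpgen *head, *tail;
	size_t len = 0;
	char *buf;

	/* First pass: size only, mirroring the oldptr == 0 branch above. */
	if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1)
		return (1);
	if ((buf = malloc(len)) == NULL)
		return (1);
	/* Second pass: xinpgen header, xtcpcb records, trailing xinpgen. */
	if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == -1)
		return (1);
	head = (struct xinpgen *)buf;
	tail = (struct xinpgen *)(buf + len - sizeof(*tail));
	if (head->xig_gen != tail->xig_gen)
		printf("list changed underneath us; retry\n");
	else
		printf("%u PCBs reported\n", (unsigned)head->xig_count);
	free(buf);
	return (0);
}
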
*/ s = splnet(); INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; caddr_t inp_ppcb; xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb != NULL) bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); else bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xt.xt_socket); error = SYSCTL_OUT(req, &xt, sizeof xt); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, s, mapped = 0; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } s = splnet(); INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr 
*)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; tcp_seq icmp_seq; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (PRC_IS_REDIRECT(cmd)) { ip = 0; notify = in_rtchange; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); th = (struct tcphdr *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (inp->inp_socket != NULL) { icmp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_seq, tp->snd_una) && SEQ_LT(icmp_seq, tp->snd_max)) inp = (*notify)(inp, inetctlerrmap[cmd]); } if (inp) INP_UNLOCK(inp); } else { struct in_conninfo inc; inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); splx(s); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(cmd, sa, d) int cmd; struct sockaddr *sa; void *d; { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. 
*/ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcb, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; syncache_unreach(&inc, &th); } else in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used to generate sequence numbers. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * */ #define ISN_BYTES_PER_SECOND 1048576 u_char isn_secret[32]; int isn_last_reseed; MD5_CTX isn_ctx; tcp_seq tcp_new_isn(tp) struct tcpcb *tp; { u_int32_t md5_buffer[4]; tcp_seq new_isn; /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); return new_isn; } /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. */ struct inpcb * tcp_quench(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp) tp->snd_cwnd = tp->t_maxseg; return (inp); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
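
The RFC 1948 arrangement described above is straightforward to model outside the kernel: hash the connection 4-tuple together with a random secret, then add a clock-driven component so that ISNs advance at ISN_BYTES_PER_SECOND. A userland sketch using libmd's MD5 (link with -lmd on FreeBSD); the function and its arguments are illustrative, not the kernel's tcp_new_isn():

#include <md5.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t secret[32];	/* the kernel fills this from read_random() */

static uint32_t
rfc1948_isn(uint16_t fport, uint16_t lport, uint32_t faddr, uint32_t laddr,
    uint32_t ticks, uint32_t hz)
{
	MD5_CTX ctx;
	uint8_t digest[16];
	uint32_t hash;

	MD5Init(&ctx);
	MD5Update(&ctx, (const unsigned char *)&fport, sizeof(fport));
	MD5Update(&ctx, (const unsigned char *)&lport, sizeof(lport));
	MD5Update(&ctx, (const unsigned char *)&faddr, sizeof(faddr));
	MD5Update(&ctx, (const unsigned char *)&laddr, sizeof(laddr));
	MD5Update(&ctx, secret, sizeof(secret));
	MD5Final(digest, &ctx);
	memcpy(&hash, digest, sizeof(hash));

	/* Monotonic part: 1 MB/s, i.e. ISN_BYTES_PER_SECOND / hz per tick. */
	return (hash + ticks * (1048576 / hz));
}

int main(void)
{
	printf("isn = %u\n",
	    rfc1948_isn(80, 54321, 0xc0000201, 0xc0000202, 1000, 100));
	return (0);
}
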
*/ struct inpcb * tcp_drop_syn_sent(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp && tp->t_state == TCPS_SYN_SENT) { tcp_drop(tp, errno); return (struct inpcb *)0; } return inp; } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; struct rmxp_tao *taop; struct socket *so = inp->inp_socket; int offered; int mss; #ifdef INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ if (tp) { #ifdef INET6 if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else #endif /* INET6 */ rt = tcp_rtlookup(&inp->inp_inc); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return inp; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; mss = rt->rt_rmx.rmx_mtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; if (offered) mss = min(mss, offered); /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return inp; tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_output(tp); } return inp; } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated the return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. 
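
tcp_mtudisc() above boils down to simple arithmetic: the new MSS is the path MTU minus the fixed headers, clamped by what the peer offered, reduced by per-segment option overhead, and rounded down to a cluster boundary. A worked example for a 1500-byte MTU; the constants mirror typical kernel values but are assumptions here:

#include <stdio.h>

#define MCLBYTES		2048	/* assumed mbuf cluster size */
#define TCPOLEN_TSTAMP_APPA	12	/* timestamp option plus padding */

int main(void)
{
	int mtu = 1500;			/* rt_rmx.rmx_mtu from the ICMP update */
	int min_protoh = 20 + 20;	/* IPv4 header + TCP header */
	int offered = 1460;		/* peer's MSS option (taop->tao_mssopt) */
	int mss = mtu - min_protoh;	/* 1460 */

	if (offered && offered < mss)
		mss = offered;
	mss -= TCPOLEN_TSTAMP_APPA;	/* both sides negotiated timestamps */
	if (mss > MCLBYTES)		/* round down to a cluster multiple */
		mss &= ~(MCLBYTES - 1);
	printf("new t_maxseg = %d\n", mss);	/* 1448 */
	return (0);
}
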
*/ struct rtentry * tcp_rtlookup(inc) struct in_conninfo *inc; { struct route *ro; struct rtentry *rt; ro = &inc->inc_route; rt = ro->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (inc->inc_faddr.s_addr != INADDR_ANY) { ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = inc->inc_faddr; rtalloc(ro); rt = ro->ro_rt; } } return rt; } #ifdef INET6 struct rtentry * tcp_rtlookup6(inc) struct in_conninfo *inc; { struct route_in6 *ro6; struct rtentry *rt; ro6 = &inc->inc6_route; rt = ro6->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { ro6->ro_dst.sin6_family = AF_INET6; ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6->ro_dst.sin6_addr = inc->inc6_faddr; rtalloc((struct route *)ro6); rt = ro6->ro_rt; } } return rt; } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(tp) struct tcpcb *tp; { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif /* INET6 */ struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return 0; MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcp_fillheaders(tp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcp_fillheaders(tp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return hdrsiz; } #endif /*IPSEC*/ /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of * the route metrics. */ struct rmxp_tao * tcp_gettaocache(inc) struct in_conninfo *inc; { struct rtentry *rt; #ifdef INET6 if (inc->inc_isipv6) rt = tcp_rtlookup6(inc); else #endif /* INET6 */ rt = tcp_rtlookup(inc); /* Make sure this is a host route and is up. */ if (rt == NULL || (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) return NULL; return rmx_taop(rt->rt_rmx); } /* * Clear all the TAO cache entries, called from tcp_init. * * XXX * This routine is just an empty one, because we assume that the * routing tables are initialized at the same time as TCP, so there is * nothing in the cache left over. */ static void tcp_cleartaocache() { } /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar to (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only affects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches.
A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typically 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if 'ticks' rolls over, 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case.
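 *
 * As a concrete illustration (the numbers are hypothetical): at hz = 100,
 * with the bandwidth average converged to 292000 bytes/sec and a 100 ms
 * RTT (t_srtt = t_rttbest = 10 << TCP_RTT_SHIFT = 320), USERTT is 320 and
 *
 *	bwnd = 292000 * 320 / (100 << TCP_RTT_SHIFT) + 2 * 1460
 *	     = 29200 + 2920 = 32120 bytes,
 *
 * i.e. the bandwidth-delay product plus the two-segment slop.
 *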
Without the * slop we could be locking ourselves into a lower bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 105193) +++ head/sys/netinet/tcp_syncache.c (revision 105194) @@ -1,1383 +1,1377 @@ /*- * Copyright (c) 2001 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include static int tcp_syncookies = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, &tcp_syncookies, 0, "Use TCP SYN cookies if the syncache overflows"); static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); static int syncache_respond(struct syncache *, struct mbuf *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timer(void *); static u_int32_t syncookie_generate(struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct tcphdr *, struct socket *); /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 struct tcp_syncache { struct syncache_head *hashbase; uma_zone_t zone; u_int hashsize; u_int hashmask; u_int bucket_limit; u_int cache_count; u_int cache_limit; u_int rexmt_limit; u_int hash_secret; u_int next_reseed; TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1]; struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1]; }; static struct tcp_syncache tcp_syncache; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RD, &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RD, &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, &tcp_syncache.cache_count, 0, "Current number of entries in syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RD, &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SYNCACHE_TIMEOUT(sc, slot) do { \ sc->sc_rxtslot = slot; \ sc->sc_rxttime = ticks + TCPTV_RTOBASE * 
tcp_backoff[slot]; \ TAILQ_INSERT_TAIL(&tcp_syncache.timerq[slot], sc, sc_timerq); \ if (!callout_active(&tcp_syncache.tt_timerq[slot])) \ callout_reset(&tcp_syncache.tt_timerq[slot], \ TCPTV_RTOBASE * tcp_backoff[slot], \ syncache_timer, (void *)((intptr_t)slot)); \ } while (0) static void syncache_free(struct syncache *sc) { struct rtentry *rt; if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); #ifdef INET6 if (sc->sc_inc.inc_isipv6) rt = sc->sc_route6.ro_rt; else #endif rt = sc->sc_route.ro_rt; if (rt != NULL) { /* * If this is the only reference to a protocol cloned * route, remove it immediately. */ if (rt->rt_flags & RTF_WASCLONED && (sc->sc_flags & SCF_KEEPROUTE) == 0 && rt->rt_refcnt == 1) rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL); RTFREE(rt); } uma_zfree(tcp_syncache.zone, sc); } void syncache_init(void) { int i; tcp_syncache.cache_count = 0; tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; tcp_syncache.cache_limit = tcp_syncache.hashsize * tcp_syncache.bucket_limit; tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; tcp_syncache.next_reseed = 0; tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &tcp_syncache.cache_limit); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &tcp_syncache.bucket_limit); if (!powerof2(tcp_syncache.hashsize)) { printf("WARNING: syncache hash size is not a power of 2.\n"); tcp_syncache.hashsize = 512; /* safe default */ } tcp_syncache.hashmask = tcp_syncache.hashsize - 1; /* Allocate the hash table. */ MALLOC(tcp_syncache.hashbase, struct syncache_head *, tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK); /* Initialize the hash buckets. */ for (i = 0; i < tcp_syncache.hashsize; i++) { TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket); tcp_syncache.hashbase[i].sch_length = 0; } /* Initialize the timer queues. */ for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) { TAILQ_INIT(&tcp_syncache.timerq[i]); callout_init(&tcp_syncache.tt_timerq[i], 0); } /* * Allocate the syncache entries. Allow the zone to allocate one * more entry than cache limit, so a new entry can bump out an * older one. */ tcp_syncache.cache_limit -= 1; tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit); } static void syncache_insert(sc, sch) struct syncache *sc; struct syncache_head *sch; { struct syncache *sc2; int s, i; /* * Make sure that we don't overflow the per-bucket * limit or the total cache size limit. */ s = splnet(); if (sch->sch_length >= tcp_syncache.bucket_limit) { /* * The bucket is full, toss the oldest element. */ sc2 = TAILQ_FIRST(&sch->sch_bucket); sc2->sc_tp->ts_recent = ticks; syncache_drop(sc2, sch); tcpstat.tcps_sc_bucketoverflow++; } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) { /* * The cache is full. Toss the oldest entry in the * entire cache. This is the front entry in the * first non-empty timer queue with the largest * timeout value. */ for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]); if (sc2 != NULL) break; } sc2->sc_tp->ts_recent = ticks; syncache_drop(sc2, NULL); tcpstat.tcps_sc_cacheoverflow++; } /* Initialize the entry's timer. */ SYNCACHE_TIMEOUT(sc, 0); /* Put it into the bucket. 
*/ TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; tcp_syncache.cache_count++; tcpstat.tcps_sc_added++; splx(s); } static void syncache_drop(sc, sch) struct syncache *sc; struct syncache_head *sch; { int s; if (sch == NULL) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)]; } else #endif { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)]; } } s = splnet(); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; tcp_syncache.cache_count--; TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq); if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot])) callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]); splx(s); syncache_free(sc); } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. */ static void syncache_timer(xslot) void *xslot; { intptr_t slot = (intptr_t)xslot; struct syncache *sc, *nsc; struct inpcb *inp; int s; s = splnet(); if (callout_pending(&tcp_syncache.tt_timerq[slot]) || !callout_active(&tcp_syncache.tt_timerq[slot])) { splx(s); return; } callout_deactivate(&tcp_syncache.tt_timerq[slot]); nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]); INP_INFO_RLOCK(&tcbinfo); while (nsc != NULL) { if (ticks < nsc->sc_rxttime) break; sc = nsc; inp = sc->sc_tp->t_inpcb; INP_LOCK(inp); if (slot == SYNCACHE_MAXREXMTS || slot >= tcp_syncache.rexmt_limit || inp->inp_gencnt != sc->sc_inp_gencnt) { nsc = TAILQ_NEXT(sc, sc_timerq); syncache_drop(sc, NULL); tcpstat.tcps_sc_stale++; INP_UNLOCK(inp); continue; } /* * syncache_respond() may call back into the syncache to * to modify another entry, so do not obtain the next * entry on the timer chain until it has completed. */ (void) syncache_respond(sc, NULL); INP_UNLOCK(inp); nsc = TAILQ_NEXT(sc, sc_timerq); tcpstat.tcps_sc_retransmitted++; TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq); SYNCACHE_TIMEOUT(sc, slot + 1); } INP_INFO_RUNLOCK(&tcbinfo); if (nsc != NULL) callout_reset(&tcp_syncache.tt_timerq[slot], nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot)); splx(s); } /* * Find an entry in the syncache. */ struct syncache * syncache_lookup(inc, schp) struct in_conninfo *inc; struct syncache_head **schp; { struct syncache *sc; struct syncache_head *sch; int s; #ifdef INET6 if (inc->inc_isipv6) { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, tcp_syncache.hashmask)]; *schp = sch; s = splnet(); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) { splx(s); return (sc); } } splx(s); } else #endif { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH(inc, tcp_syncache.hashmask)]; *schp = sch; s = splnet(); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) { splx(s); return (sc); } } splx(s); } return (NULL); } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(inc, th) struct in_conninfo *inc; struct tcphdr *th; { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); if (sc == NULL) return; /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. 
* RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); tcpstat.tcps_sc_reset++; } } void syncache_badack(inc) struct in_conninfo *inc; { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); if (sc != NULL) { syncache_drop(sc, sch); tcpstat.tcps_sc_badack++; } } void syncache_unreach(inc, th) struct in_conninfo *inc; struct tcphdr *th; { struct syncache *sc; struct syncache_head *sch; /* we are called at splnet() here */ sc = syncache_lookup(inc, &sch); if (sc == NULL) return; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) return; /* * If we've retransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) { sc->sc_flags |= SCF_UNREACH; return; } syncache_drop(sc, sch); tcpstat.tcps_sc_unreach++; } /* * Build a new TCP socket structure from a syncache entry. */ static struct socket * syncache_socket(sc, lso, m) struct syncache *sc; struct socket *lso; struct mbuf *m; { struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, SS_ISCONNECTED); if (so == NULL) { /* * Drop the connection; we will send a RST if the peer * retransmits the ACK. */ tcpstat.tcps_listendrop++; goto abort; } #ifdef MAC mac_set_socket_peer_from_mbuf(m, so); #endif inp = sotoinpcb(so); /* * Insert new socket into hash list. */ inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif inp->inp_lport = sc->sc_inc.inc_lport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_isipv6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto abort; } #ifdef IPSEC /* copy old policy into new socket's */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_expand: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 *sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different from the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call.
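
Returning to the RST validation in syncache_chkrst() above: the RFC 793 window test relies on the kernel's modular sequence-space comparisons, which keep working across the 2^32 wrap. A standalone sketch of the predicate, with SEQ_GEQ/SEQ_LEQ written out the way tcp_seq.h defines them:

#include <stdint.h>
#include <stdio.h>

#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

/* Valid iff the RST's sequence number falls within [irs, irs + wnd]. */
static int rst_is_valid(uint32_t seq, uint32_t irs, uint32_t wnd)
{
	return (SEQ_GEQ(seq, irs) && SEQ_LEQ(seq, irs + wnd));
}

int main(void)
{
	/* An IRS near the 2^32 wrap still compares correctly. */
	printf("%d\n", rst_is_valid(0x00000010, 0xffffff00, 0x1000));	/* 1 */
	printf("%d\n", rst_is_valid(0x80000000, 0xffffff00, 0x1000));	/* 0 */
	return (0);
}
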
*/ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); inp->in6p_route = sc->sc_route6; sc->sc_route6.ro_rt = NULL; MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_NOWAIT | M_ZERO); if (sin6 == NULL) goto abort; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = sc->sc_inc.inc6_faddr; sin6->sin6_port = sc->sc_inc.inc_fport; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if (in6_pcbconnect(inp, (struct sockaddr *)sin6, &thread0)) { inp->in6p_laddr = laddr6; FREE(sin6, M_SONAME); goto abort; } FREE(sin6, M_SONAME); } else #endif { struct in_addr laddr; struct sockaddr_in *sin; inp->inp_options = ip_srcroute(); if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } inp->inp_route = sc->sc_route; sc->sc_route.ro_rt = NULL; MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT | M_ZERO); if (sin == NULL) goto abort; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = sc->sc_inc.inc_faddr; sin->sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if (in_pcbconnect(inp, (struct sockaddr *)sin, &thread0)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); goto abort; } FREE(sin, M_SONAME); } tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->requested_s_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_request_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsrecent; tp->ts_recent_age = ticks; } if (sc->sc_flags & SCF_CC) { /* * Initialization of the tcpcb for transaction; * set SND.WND = SEG.WND, * initialize CCsend and CCrecv. */ tp->t_flags |= TF_REQ_CC|TF_RCVD_CC; tp->cc_send = sc->sc_cc_send; tp->cc_recv = sc->sc_cc_recv; } tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. */ if (sc->sc_rxtslot != 0) tp->snd_cwnd = tp->t_maxseg; callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); tcpstat.tcps_accepts++; return (so); abort: if (so != NULL) (void) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(inc, th, sop, m) struct in_conninfo *inc; struct tcphdr *th; struct socket **sop; struct mbuf *m; { struct syncache *sc; struct syncache_head *sch; struct socket *so; sc = syncache_lookup(inc, &sch); if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. 
If it is, then * cobble up a fake syncache entry, and return. */ if (!tcp_syncookies) return (0); sc = syncookie_lookup(inc, th, *sop); if (sc == NULL) return (0); sch = NULL; tcpstat.tcps_sc_recvcookie++; } /* * If seg contains an ACK, but not for our SYN/ACK, send a RST. */ if (th->th_ack != sc->sc_iss + 1) return (0); so = syncache_socket(sc, *sop, m); if (so == NULL) { #if 0 resetandabort: /* XXXjlemon check this - is this correct? */ (void) tcp_respond(NULL, m, m, th, th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); #endif m_freem(m); /* XXX only needed for above */ tcpstat.tcps_sc_aborted++; } else { sc->sc_flags |= SCF_KEEPROUTE; tcpstat.tcps_sc_completed++; } if (sch == NULL) syncache_free(sc); else syncache_drop(sc, sch); *sop = so; return (1); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ int syncache_add(inc, to, th, sop, m) struct in_conninfo *inc; struct tcpopt *to; struct tcphdr *th; struct socket **sop; struct mbuf *m; { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; struct rmxp_tao *taop; int i, s, win; so = *sop; tp = sototcpcb(so); /* * Remember the IP options, if any. */ #ifdef INET6 if (!inc->inc_isipv6) #endif ipopts = ip_srcroute(); /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX * should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) */ sc = syncache_lookup(inc, &sch); if (sc != NULL) { tcpstat.tcps_sc_dupsyn++; if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if (sc->sc_flags & SCF_TIMESTAMP) sc->sc_tsrecent = to->to_tsval; /* * PCB may have changed, pick up new values. */ sc->sc_tp = tp; sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; if (syncache_respond(sc, m) == 0) { s = splnet(); TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq); SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot); splx(s); tcpstat.tcps_sndacks++; tcpstat.tcps_sndtotal++; } *sop = NULL; return (1); } sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ s = splnet(); for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { sc = TAILQ_FIRST(&tcp_syncache.timerq[i]); if (sc != NULL) break; } sc->sc_tp->ts_recent = ticks; syncache_drop(sc, NULL); splx(s); tcpstat.tcps_sc_zonefail++; sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); if (sc == NULL) { if (ipopts) (void) m_free(ipopts); return (0); } } /* * Fill in the syncache values.
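 *
 * The zone-exhaustion path above picks its victim by scanning the
 * retransmit-slot timer queues from the highest slot down, so the entry
 * that has been retransmitted (and therefore waited) the longest goes
 * first. A user-space sketch of that scan, using the same <sys/queue.h>
 * TAILQ idiom; the entry type and slot count here are illustrative.
 */

#include <stdio.h>
#include <sys/queue.h>

#define MAXREXMTS 3

struct entry {
	int id;
	TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(slotq, entry);

static struct slotq timerq[MAXREXMTS + 1];

/* The first entry on the highest occupied slot is the oldest. */
static struct entry *
find_victim(void)
{
	struct entry *e = NULL;
	int i;

	for (i = MAXREXMTS; i >= 0; i--) {
		e = TAILQ_FIRST(&timerq[i]);
		if (e != NULL)
			break;
	}
	return (e);
}

int
main(void)
{
	struct entry a = { 1 }, b = { 2 };
	int i;

	for (i = 0; i <= MAXREXMTS; i++)
		TAILQ_INIT(&timerq[i]);
	TAILQ_INSERT_TAIL(&timerq[0], &a, link);
	TAILQ_INSERT_TAIL(&timerq[2], &b, link);
	printf("victim: %d\n", find_victim()->id);	/* 2 */
	return (0);
}

/*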
*/ bzero(sc, sizeof(*sc)); sc->sc_tp = tp; sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; sc->sc_ipopts = ipopts; sc->sc_inc.inc_fport = inc->inc_fport; sc->sc_inc.inc_lport = inc->inc_lport; #ifdef INET6 sc->sc_inc.inc_isipv6 = inc->inc_isipv6; if (inc->inc_isipv6) { sc->sc_inc.inc6_faddr = inc->inc6_faddr; sc->sc_inc.inc6_laddr = inc->inc6_laddr; sc->sc_route6.ro_rt = NULL; } else #endif { sc->sc_inc.inc_faddr = inc->inc_faddr; sc->sc_inc.inc_laddr = inc->inc_laddr; sc->sc_route.ro_rt = NULL; } sc->sc_irs = th->th_seq; if (tcp_syncookies) sc->sc_iss = syncookie_generate(sc); else sc->sc_iss = arc4random(); /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */ win = sbspace(&so->so_rcv); win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; sc->sc_flags = 0; sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0; if (tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsrecent = to->to_tsval; sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* Compute proper scaling value from buffer space */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat) wscale++; sc->sc_request_r_scale = wscale; sc->sc_requested_s_scale = to->to_requested_s_scale; sc->sc_flags |= SCF_WINSCALE; } } if (tcp_do_rfc1644) { /* * A CC or CC.new option received in a SYN makes * it ok to send CC in subsequent segments. */ if (to->to_flags & (TOF_CC|TOF_CCNEW)) { sc->sc_cc_recv = to->to_cc; sc->sc_cc_send = CC_INC(tcp_ccgen); sc->sc_flags |= SCF_CC; } } if (tp->t_flags & TF_NOOPT) sc->sc_flags = SCF_NOOPT; /* * XXX * We have the option here of not doing TAO (even if the segment * qualifies) and instead fall back to a normal 3WHS via the syncache. * This allows us to apply synflood protection to TAO-qualifying SYNs * also. However, there should be a heuristic to determine when to * do this, but none is present at the moment. */ /* * Perform TAO test on incoming CC (SEG.CC) option, if any. * - compare SEG.CC against cached CC from the same host, if any. * - if SEG.CC > cached value, SYN must be new and is accepted * immediately: save new CC in the cache, mark the socket * connected, enter ESTABLISHED state, turn on flag to * send a SYN in the next segment. * A virtual advertised window is set in rcv_adv to * initialize SWS prevention. Then enter normal segment * processing: drop SYN, process data and FIN. * - otherwise do a normal 3-way handshake. */ taop = tcp_gettaocache(&sc->sc_inc); if ((to->to_flags & TOF_CC) != 0) { if (((tp->t_flags & TF_NOPUSH) != 0) && sc->sc_flags & SCF_CC && taop != NULL && taop->tao_cc != 0 && CC_GT(to->to_cc, taop->tao_cc)) { sc->sc_rxtslot = 0; so = syncache_socket(sc, *sop, m); if (so != NULL) { sc->sc_flags |= SCF_KEEPROUTE; taop->tao_cc = to->to_cc; *sop = so; } syncache_free(sc); return (so != NULL); } } else { /* * No CC option, but maybe CC.NEW: invalidate cached value. */ if (taop != NULL) taop->tao_cc = 0; } /* * TAO test failed or there was no CC option, * do a standard 3-way handshake.
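 *
 * The SCF_WINSCALE branch above derives the advertised window-scale
 * factor from the receive buffer size: the smallest shift that lets
 * TCP_MAXWIN cover sb_hiwat. The loop in isolation, with the kernel's
 * constant values:
 */

#include <stdio.h>

#define TCP_MAXWIN		65535
#define TCP_MAX_WINSHIFT	14

static int
compute_wscale(unsigned long sb_hiwat)
{
	int wscale = 0;

	while (wscale < TCP_MAX_WINSHIFT &&
	    ((unsigned long)TCP_MAXWIN << wscale) < sb_hiwat)
		wscale++;
	return (wscale);
}

int
main(void)
{
	printf("%d\n", compute_wscale(57344));		/* 0: fits unscaled */
	printf("%d\n", compute_wscale(1048576));	/* 5 */
	return (0);
}

/*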
*/ if (syncache_respond(sc, m) == 0) { syncache_insert(sc, sch); tcpstat.tcps_sndacks++; tcpstat.tcps_sndtotal++; } else { syncache_free(sc); tcpstat.tcps_sc_dropped++; } *sop = NULL; return (1); } static int syncache_respond(sc, m) struct syncache *sc; struct mbuf *m; { u_int8_t *optp; int optlen, error; u_int16_t tlen, hlen, mssopt; struct ip *ip = NULL; struct rtentry *rt; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { rt = tcp_rtlookup6(&sc->sc_inc); if (rt != NULL) mssopt = rt->rt_ifp->if_mtu - (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); else mssopt = tcp_v6mssdflt; hlen = sizeof(struct ip6_hdr); } else #endif { rt = tcp_rtlookup(&sc->sc_inc); if (rt != NULL) mssopt = rt->rt_ifp->if_mtu - (sizeof(struct ip) + sizeof(struct tcphdr)); else mssopt = tcp_mssdflt; hlen = sizeof(struct ip); } /* Compute the size of the TCP options. */ if (sc->sc_flags & SCF_NOOPT) { optlen = 0; } else { optlen = TCPOLEN_MAXSEG + ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) + ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) + ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0); } tlen = hlen + sizeof(struct tcphdr) + optlen; /* * XXX * assume that the entire packet will fit in a header mbuf */ KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small")); /* * XXX shouldn't this reuse the mbuf if possible ? * Create the IP+TCP header from scratch. */ if (m) m_freem(m); m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_create_mbuf_from_socket(sc->sc_tp->t_inpcb->inp_socket, m); #endif -#ifdef IPSEC - /* use IPsec policy on listening socket to send SYN,ACK */ - if (ipsec_setsocket(m, sc->sc_tp->t_inpcb->inp_socket) != 0) { - m_freem(m); - return (ENOBUFS); - } -#endif - #ifdef INET6 if (sc->sc_inc.inc_isipv6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ /* ip6_flow = ??? */ th = (struct tcphdr *)(ip6 + 1); } else #endif { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = tlen; ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos = sc->sc_tp->t_inpcb->inp_ip_tos; /* XXX */ /* * See if we should do MTU discovery. Route lookups are expensive, * so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) { ip->ip_off |= IP_DF; } th = (struct tcphdr *)(ip + 1); } th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; /* Tack on the TCP options. 
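 *
 * The option block appended below begins with the MSS option: a kind
 * byte, a length byte, and the 16-bit value in network order. A
 * standalone sketch of that encoding (put_mss() is a hypothetical
 * helper):
 */

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_MAXSEG	2
#define TCPOLEN_MAXSEG	4

static int
put_mss(uint8_t *optp, uint16_t mss)
{
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = TCPOLEN_MAXSEG;
	*optp++ = (mss >> 8) & 0xff;	/* high byte first: network order */
	*optp++ = mss & 0xff;
	return (TCPOLEN_MAXSEG);
}

int
main(void)
{
	uint8_t buf[TCPOLEN_MAXSEG];

	put_mss(buf, 1460);
	printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
	/* prints: 02 04 05 b4 */
	return (0);
}

/*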
*/ if (optlen == 0) goto no_options; optp = (u_int8_t *)(th + 1); *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; *optp++ = (mssopt >> 8) & 0xff; *optp++ = mssopt & 0xff; if (sc->sc_flags & SCF_WINSCALE) { *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | sc->sc_request_r_scale); optp += 4; } if (sc->sc_flags & SCF_TIMESTAMP) { u_int32_t *lp = (u_int32_t *)(optp); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(sc->sc_tsrecent); optp += TCPOLEN_TSTAMP_APPA; } /* * Send CC and CC.echo if we received CC from our peer. */ if (sc->sc_flags & SCF_CC) { u_int32_t *lp = (u_int32_t *)(optp); *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); *lp++ = htonl(sc->sc_cc_send); *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CCECHO)); *lp = htonl(sc->sc_cc_recv); optp += TCPOLEN_CC_APPA * 2; } no_options: #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct route_in6 *ro6 = &sc->sc_route6; th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); ip6->ip6_hlim = in6_selecthlim(NULL, ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL); - error = ip6_output(m, NULL, ro6, 0, NULL, NULL); + error = ip6_output(m, NULL, ro6, 0, NULL, NULL, + sc->sc_tp->t_inpcb); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen - hlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); - error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL); + error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL, + sc->sc_tp->t_inpcb); } return (error); } /* * cookie layers: * * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .| * | peer iss | * | MD5(laddr,faddr,lport,fport,secret) |. . . . . . .| * | 0 |(A)| | * (A): peer mss index */ /* * The values below are chosen to minimize the size of the tcp_secret * table, as well as providing roughly a 4 second lifetime for the cookie. */ #define SYNCOOKIE_HASHSHIFT 2 /* log2(# of 32bit words from hash) */ #define SYNCOOKIE_WNDBITS 7 /* exposed bits for window indexing */ #define SYNCOOKIE_TIMESHIFT 5 /* scale ticks to window time units */ #define SYNCOOKIE_HASHMASK ((1 << SYNCOOKIE_HASHSHIFT) - 1) #define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1) #define SYNCOOKIE_NSECRETS (1 << (SYNCOOKIE_WNDBITS - SYNCOOKIE_HASHSHIFT)) #define SYNCOOKIE_TIMEOUT \ (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT)) #define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK) static struct { u_int32_t ts_secbits; u_int ts_expire; } tcp_secret[SYNCOOKIE_NSECRETS]; static int tcp_msstab[] = { 0, 536, 1460, 8960 }; static MD5_CTX syn_ctx; #define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v)) /* * Consider the problem of a recreated (and retransmitted) cookie. If the * original SYN was accepted, the connection is established. The second * SYN is inflight, and if it arrives with an ISN that falls within the * receive window, the connection is killed. * * However, since cookies have other problems, this may not be worth * worrying about. 
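 *
 * A sketch of the window/secret indexing syncookie_generate() performs
 * below: ticks are scaled so the 7-bit window value wraps in roughly
 * four seconds, and the window selects one of the 32 rotating secrets.
 * The hz and ticks values here are purely illustrative.
 */

#include <stdio.h>

#define SYNCOOKIE_HASHSHIFT	2
#define SYNCOOKIE_WNDBITS	7
#define SYNCOOKIE_TIMESHIFT	5
#define SYNCOOKIE_WNDMASK	((1 << SYNCOOKIE_WNDBITS) - 1)

int
main(void)
{
	int hz = 100, ticks = 12345;	/* illustrative stand-ins */
	int wnd, idx;

	/* (ticks << 5) / hz advances 32 units per second; the low 7 bits
	 * give a window that repeats every 128/32 = 4 seconds. */
	wnd = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
	idx = wnd >> SYNCOOKIE_HASHSHIFT;	/* secret index, 0..31 */
	printf("wnd=%d idx=%d\n", wnd, idx);	/* wnd=110 idx=27 */
	return (0);
}

/*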
*/ static u_int32_t syncookie_generate(struct syncache *sc) { u_int32_t md5_buffer[4]; u_int32_t data; int wnd, idx; wnd = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK; idx = wnd >> SYNCOOKIE_HASHSHIFT; if (tcp_secret[idx].ts_expire < ticks) { tcp_secret[idx].ts_secbits = arc4random(); tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT; } for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--) if (tcp_msstab[data] <= sc->sc_peer_mss) break; data = (data << SYNCOOKIE_WNDBITS) | wnd; data ^= sc->sc_irs; /* peer's iss */ MD5Init(&syn_ctx); #ifdef INET6 if (sc->sc_inc.inc_isipv6) { MD5Add(sc->sc_inc.inc6_laddr); MD5Add(sc->sc_inc.inc6_faddr); } else #endif { MD5Add(sc->sc_inc.inc_laddr); MD5Add(sc->sc_inc.inc_faddr); } MD5Add(sc->sc_inc.inc_lport); MD5Add(sc->sc_inc.inc_fport); MD5Add(tcp_secret[idx].ts_secbits); MD5Final((u_char *)&md5_buffer, &syn_ctx); data ^= (md5_buffer[wnd & SYNCOOKIE_HASHMASK] & ~SYNCOOKIE_WNDMASK); return (data); } static struct syncache * syncookie_lookup(inc, th, so) struct in_conninfo *inc; struct tcphdr *th; struct socket *so; { u_int32_t md5_buffer[4]; struct syncache *sc; u_int32_t data; int wnd, idx; data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */ wnd = data & SYNCOOKIE_WNDMASK; idx = wnd >> SYNCOOKIE_HASHSHIFT; if (tcp_secret[idx].ts_expire < ticks || sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks) return (NULL); MD5Init(&syn_ctx); #ifdef INET6 if (inc->inc_isipv6) { MD5Add(inc->inc6_laddr); MD5Add(inc->inc6_faddr); } else #endif { MD5Add(inc->inc_laddr); MD5Add(inc->inc_faddr); } MD5Add(inc->inc_lport); MD5Add(inc->inc_fport); MD5Add(tcp_secret[idx].ts_secbits); MD5Final((u_char *)&md5_buffer, &syn_ctx); data ^= md5_buffer[wnd & SYNCOOKIE_HASHMASK]; if ((data & ~SYNCOOKIE_DATAMASK) != 0) return (NULL); data = data >> SYNCOOKIE_WNDBITS; sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); if (sc == NULL) return (NULL); /* * Fill in the syncache values. * XXX duplicate code from syncache_add */ sc->sc_ipopts = NULL; sc->sc_inc.inc_fport = inc->inc_fport; sc->sc_inc.inc_lport = inc->inc_lport; #ifdef INET6 sc->sc_inc.inc_isipv6 = inc->inc_isipv6; if (inc->inc_isipv6) { sc->sc_inc.inc6_faddr = inc->inc6_faddr; sc->sc_inc.inc6_laddr = inc->inc6_laddr; sc->sc_route6.ro_rt = NULL; } else #endif { sc->sc_inc.inc_faddr = inc->inc_faddr; sc->sc_inc.inc_laddr = inc->inc_laddr; sc->sc_route.ro_rt = NULL; } sc->sc_irs = th->th_seq - 1; sc->sc_iss = th->th_ack - 1; wnd = sbspace(&so->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; sc->sc_flags = 0; sc->sc_rxtslot = 0; sc->sc_peer_mss = tcp_msstab[data]; return (sc); } Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 105193) +++ head/sys/netinet/tcp_timewait.c (revision 105194) @@ -1,1695 +1,1690 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif /*IPSEC*/ #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif #if 0 static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); #endif int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); int tcp_do_rfc1644 = 0; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. 
Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ static int tcp_inflight_enable = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_min = 1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * This is the actual shape of what we allocate using the zone * allocator. Doing it this way allows us to protect both structures * using the same generation count, and also eliminates the overhead * of allocating tcpcbs separately. By hiding the structure here, * we avoid changing most of the rest of the code (although it needs * to be changed, eventually, for greater efficiency). */ #define ALIGNMENT 32 #define ALIGNM1 (ALIGNMENT - 1) struct inp_tp { union { struct inpcb inp; char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; } inp_tp_u; struct tcpcb tcb; struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; struct callout inp_tp_delack; }; #undef ALIGNMENT #undef ALIGNM1 /* * Tcp initialization */ void tcp_init() { int hashsize = TCBHASHSIZE; tcp_ccgen = 1; tcp_cleartaocache(); tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_slop = TCPTV_CPU_VAR; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR syncache_init(); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
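 *
 * A side note on struct inp_tp earlier in this file: the union with a
 * char array rounds the inpcb up to a 32-byte multiple so the tcpcb
 * that follows stays aligned. The round-up idiom in isolation:
 */

#include <stdio.h>

#define ALIGNMENT	32
#define ALIGNM1		(ALIGNMENT - 1)
#define ROUND_UP(x)	(((x) + ALIGNM1) & ~ALIGNM1)

int
main(void)
{
	printf("%d -> %d\n", 180, ROUND_UP(180));	/* 180 -> 192 */
	printf("%d -> %d\n", 192, ROUND_UP(192));	/* 192 -> 192 */
	return (0);
}

/*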
*/ void tcp_fillheaders(tp, ip_ptr, tcp_ptr) struct tcpcb *tp; void *ip_ptr; void *tcp_ptr; { struct inpcb *inp = tp->t_inpcb; struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; tcp_hdr->th_sum = 0; } else #endif { struct ip *ip = (struct ip *) ip_ptr; ip->ip_vhl = IP_VHL_BORING; ip->ip_tos = 0; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); } tcp_hdr->th_sport = inp->inp_lport; tcp_hdr->th_dport = inp->inp_fport; tcp_hdr->th_seq = 0; tcp_hdr->th_ack = 0; tcp_hdr->th_x2 = 0; tcp_hdr->th_off = 5; tcp_hdr->th_flags = 0; tcp_hdr->th_win = 0; tcp_hdr->th_urp = 0; } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcp_maketemplate(tp) struct tcpcb *tp; { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
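 *
 * tcp_respond() below reflects a received header straight back at its
 * sender by swapping the address and port pairs in place; the xchg()
 * macro it defines for that is the classic temporary-variable swap:
 */

#include <stdio.h>
#include <stdint.h>

#define xchg(a, b, type) { type t; t = (a); (a) = (b); (b) = t; }

int
main(void)
{
	uint16_t sport = 12345, dport = 80;

	xchg(sport, dport, uint16_t);
	printf("%u %u\n", sport, dport);	/* 80 12345 */
	return (0);
}

/*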
*/ void tcp_respond(tp, ipgen, th, m, ack, seq, flags) struct tcpcb *tp; void *ipgen; register struct tcphdr *th; register struct mbuf *m; tcp_seq ack, seq; int flags; { register int tlen; int win = 0; struct route *ro = 0; struct route sro; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct route_in6 *ro6 = 0; struct route_in6 sro6; struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp) { if (!(flags & TH_RST)) { win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } #ifdef INET6 if (isipv6) ro6 = &tp->t_inpcb->in6p_route; else #endif /* INET6 */ ro = &tp->t_inpcb->inp_route; } else { #ifdef INET6 if (isipv6) { ro6 = &sro6; bzero(ro6, sizeof *ro6); } else #endif /* INET6 */ { ro = &sro; bzero(ro, sizeof *ro); } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = 0; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; #ifdef MAC if (tp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m); } else { /* * XXXMAC: This will need to call a mac function that * modifies the mbuf label in place for TCP datagrams * not associated with a PCB. */ } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, ro6 && ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif -#ifdef IPSEC - if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { - m_freem(m); - return; - } -#endif #ifdef INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, + tp ? tp->t_inpcb : NULL); if (ro6 == &sro6 && ro6->ro_rt) { RTFREE(ro6->ro_rt); ro6->ro_rt = NULL; } } else #endif /* INET6 */ { - (void) ip_output(m, NULL, ro, ipflags, NULL); + (void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(inp) struct inpcb *inp; { struct inp_tp *it; register struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ it = (struct inp_tp *)inp; tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); callout_init(tp->tt_persist = &it->inp_tp_persist, 0); callout_init(tp->tt_keep = &it->inp_tp_keep, 0); callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); callout_init(tp->tt_delack = &it->inp_tp_delack, 0); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. 
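 *
 * The hunks above show the point of this change: the per-mbuf
 * ipsec_setsocket() tagging is removed and the inpcb is handed to
 * ip_output()/ip6_output() instead, so policy can be looked up from
 * the PCB at output time (NULL when no connection owns the packet,
 * as for some RSTs). A minimal sketch of the pattern; every type and
 * name below is hypothetical.
 */

#include <stdio.h>
#include <stddef.h>

struct pcb { int policy; };	/* stands in for struct inpcb */
struct pkt { int len; };	/* stands in for an mbuf chain */

/* Policy is resolved from the caller-supplied PCB at output time,
 * not from a tag attached to the packet beforehand. */
static int
output(struct pkt *p, const struct pcb *inp)
{
	int policy = (inp != NULL) ? inp->policy : 0;	/* NULL: default */

	printf("send %d bytes, policy %d\n", p->len, policy);
	return (0);
}

int
main(void)
{
	struct pcb tcb = { 1 };
	struct pkt m = { 40 };

	output(&m, &tcb);	/* segment owned by a connection */
	output(&m, NULL);	/* e.g. a RST with no backing PCB */
	return (0);
}

/*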
*/ struct tcpcb * tcp_drop(tp, errno) register struct tcpcb *tp; int errno; { struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } /* * Close a TCP control block: * discard all space held by the tcp * discard internet protocol block * wake up any sleepers */ struct tcpcb * tcp_close(tp) register struct tcpcb *tp; { register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ register struct rtentry *rt; int dosavessthresh; /* * Make sure that all of our timers are stopped before we * delete the PCB. */ callout_stop(tp->tt_rexmt); callout_stop(tp->tt_persist); callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as the 16 samples. * 16 samples is enough for the srtt filter to converge * to within 5% of the correct value; fewer samples and * we could save a very bogus rtt. * * Don't update the default route's characteristics and don't * update anything that the user "locked". */ if (tp->t_rttupdated >= 16) { register u_long i = 0; #ifdef INET6 if (isipv6) { struct sockaddr_in6 *sin6; if ((rt = inp->in6p_route.ro_rt) == NULL) goto no_valid_rt; sin6 = (struct sockaddr_in6 *)rt_key(rt); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) goto no_valid_rt; } else #endif /* INET6 */ if ((rt = inp->inp_route.ro_rt) == NULL || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY) goto no_valid_rt; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); if (rt->rt_rmx.rmx_rtt && i) /* * filter this update to half the old & half * the new values, converting scale. * See route.h and tcp_var.h for a * description of the scaling constants. */ rt->rt_rmx.rmx_rtt = (rt->rt_rmx.rmx_rtt + i) / 2; else rt->rt_rmx.rmx_rtt = i; tcpstat.tcps_cachedrtt++; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); if (rt->rt_rmx.rmx_rttvar && i) rt->rt_rmx.rmx_rttvar = (rt->rt_rmx.rmx_rttvar + i) / 2; else rt->rt_rmx.rmx_rttvar = i; tcpstat.tcps_cachedrttvar++; } /* * The old comment here said: * update the pipelimit (ssthresh) if it has been updated * already or if a pipesize was specified & the threshhold * got below half the pipesize. I.e., wait for bad news * before we start updating, then update on both good * and bad news. * * But we want to save the ssthresh even if no pipesize is * specified explicitly in the route, because such * connections still have an implicit pipesize specified * by the global tcp_sendspace. In the absence of a reliable * way to calculate the pipesize, it will have to do. */ i = tp->snd_ssthresh; if (rt->rt_rmx.rmx_sendpipe != 0) dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); else dosavessthresh = (i < so->so_snd.sb_hiwat / 2); if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && i != 0 && rt->rt_rmx.rmx_ssthresh != 0) || dosavessthresh) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; i *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; else rt->rt_rmx.rmx_ssthresh = i; tcpstat.tcps_cachedssthresh++; } } no_valid_rt: /* free the reassembly queue, if any */ while((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); } inp->inp_ppcb = NULL; soisdisconnected(so); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else #endif /* INET6 */ in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); } void tcp_drain() { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb))) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); FREE(te, M_TSEGQ); } } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(inp, error) struct inpcb *inp; int error; { struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return inp; } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tcp_drop(tp, error); return (struct inpcb *)0; } else { tp->t_softerror = error; return inp; } #if 0 wakeup((caddr_t) &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something.
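 *
 * One more note on tcp_close() above: each measurement is folded into
 * the cached route metric as half old value, half new. The filter step
 * in isolation, ignoring the RTM_RTTUNIT scale conversion:
 */

#include <stdio.h>

/* Half-old/half-new smoothing: converges quickly while damping
 * one-off outliers. */
static unsigned long
smooth(unsigned long cached, unsigned long sample)
{
	return (cached ? (cached + sample) / 2 : sample);
}

int
main(void)
{
	unsigned long samples[] = { 100, 120, 80, 400 };
	unsigned long rtt = 0;
	int i;

	for (i = 0; i < 4; i++) {
		rtt = smooth(rtt, samples[i]);
		printf("sample %lu -> cached %lu\n", samples[i], rtt);
	}
	/* cached values: 100, 110, 95, 247 */
	return (0);
}

/*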
*/ s = splnet(); INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; caddr_t inp_ppcb; xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb != NULL) bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); else bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xt.xt_socket); error = SYSCTL_OUT(req, &xt, sizeof xt); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, s, mapped = 0; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } s = splnet(); INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr 
*)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; tcp_seq icmp_seq; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (PRC_IS_REDIRECT(cmd)) { ip = 0; notify = in_rtchange; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); th = (struct tcphdr *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (inp->inp_socket != NULL) { icmp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_seq, tp->snd_una) && SEQ_LT(icmp_seq, tp->snd_max)) inp = (*notify)(inp, inetctlerrmap[cmd]); } if (inp) INP_UNLOCK(inp); } else { struct in_conninfo inc; inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); splx(s); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(cmd, sa, d) int cmd; struct sockaddr *sa; void *d; { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. 
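 *
 * tcp6_getcred() above recovers the embedded IPv4 address of a
 * v4-mapped IPv6 address by reading the last four of its sixteen
 * bytes; the same extraction in user space:
 */

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in6_addr a6;
	struct in_addr a4;

	/* ::ffff:192.0.2.1 is the v4-mapped form of 192.0.2.1. */
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &a6);
	if (IN6_IS_ADDR_V4MAPPED(&a6)) {
		memcpy(&a4, &a6.s6_addr[12], sizeof(a4));
		printf("%s\n", inet_ntoa(a4));	/* 192.0.2.1 */
	}
	return (0);
}

/*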
*/ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcb, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; syncache_unreach(&inc, &th); } else in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used to generate sequence numbers. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * */ #define ISN_BYTES_PER_SECOND 1048576 u_char isn_secret[32]; int isn_last_reseed; MD5_CTX isn_ctx; tcp_seq tcp_new_isn(tp) struct tcpcb *tp; { u_int32_t md5_buffer[4]; tcp_seq new_isn; /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); return new_isn; } /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. */ struct inpcb * tcp_quench(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp) tp->snd_cwnd = tp->t_maxseg; return (inp); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
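 *
 * tcp_new_isn() above follows RFC 1948: ISN = H(4-tuple, secret) plus
 * a clock that advances one megabyte of sequence space per second. A
 * toy sketch of the shape of the computation; toyhash() is a stand-in
 * for MD5 and every value here is illustrative.
 */

#include <stdio.h>
#include <stdint.h>

#define ISN_BYTES_PER_SECOND 1048576

static uint32_t
toyhash(uint32_t x)	/* NOT cryptographic; stands in for MD5 */
{
	x ^= x >> 16;
	x *= 0x45d9f3bU;
	x ^= x >> 16;
	return (x);
}

static uint32_t
new_isn(uint32_t laddr, uint16_t lport, uint32_t faddr, uint16_t fport,
    uint32_t secret, uint32_t seconds)
{
	uint32_t h;

	h = toyhash(laddr ^ faddr ^ ((uint32_t)lport << 16 | fport) ^ secret);
	/* Monotonic offset keeps ISNs increasing for TIME_WAIT recycling. */
	return (h + seconds * ISN_BYTES_PER_SECOND);
}

int
main(void)
{
	printf("%u\n", new_isn(0xc0000201, 1234, 0xc0000202, 80,
	    0xdeadbeef, 10));
	return (0);
}

/*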
*/ struct inpcb * tcp_drop_syn_sent(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp && tp->t_state == TCPS_SYN_SENT) { tcp_drop(tp, errno); return (struct inpcb *)0; } return inp; } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; struct rmxp_tao *taop; struct socket *so = inp->inp_socket; int offered; int mss; #ifdef INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ if (tp) { #ifdef INET6 if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else #endif /* INET6 */ rt = tcp_rtlookup(&inp->inp_inc); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return inp; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; mss = rt->rt_rmx.rmx_mtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; if (offered) mss = min(mss, offered); /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return inp; tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_output(tp); } return inp; } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU.
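 *
 * tcp_mtudisc() above also rounds the discovered MSS down to an
 * mbuf-cluster multiple so full-sized segments pack cleanly into
 * clusters. The rounding step in isolation (2048 is a typical
 * MCLBYTES):
 */

#include <stdio.h>

#define MCLBYTES 2048

static int
round_mss(int mss)
{
#if (MCLBYTES & (MCLBYTES - 1)) == 0
	if (mss > MCLBYTES)
		mss &= ~(MCLBYTES - 1);		/* power-of-two fast path */
#else
	if (mss > MCLBYTES)
		mss = mss / MCLBYTES * MCLBYTES;
#endif
	return (mss);
}

int
main(void)
{
	printf("%d\n", round_mss(8960));	/* 8192 */
	printf("%d\n", round_mss(1460));	/* 1460: <= MCLBYTES, unchanged */
	return (0);
}

/*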
*/ struct rtentry * tcp_rtlookup(inc) struct in_conninfo *inc; { struct route *ro; struct rtentry *rt; ro = &inc->inc_route; rt = ro->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (inc->inc_faddr.s_addr != INADDR_ANY) { ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = inc->inc_faddr; rtalloc(ro); rt = ro->ro_rt; } } return rt; } #ifdef INET6 struct rtentry * tcp_rtlookup6(inc) struct in_conninfo *inc; { struct route_in6 *ro6; struct rtentry *rt; ro6 = &inc->inc6_route; rt = ro6->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { ro6->ro_dst.sin6_family = AF_INET6; ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6->ro_dst.sin6_addr = inc->inc6_faddr; rtalloc((struct route *)ro6); rt = ro6->ro_rt; } } return rt; } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(tp) struct tcpcb *tp; { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif /* INET6 */ struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return 0; MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcp_fillheaders(tp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcp_fillheaders(tp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return hdrsiz; } #endif /*IPSEC*/ /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of * the route metrics. */ struct rmxp_tao * tcp_gettaocache(inc) struct in_conninfo *inc; { struct rtentry *rt; #ifdef INET6 if (inc->inc_isipv6) rt = tcp_rtlookup6(inc); else #endif /* INET6 */ rt = tcp_rtlookup(inc); /* Make sure this is a host route and is up. */ if (rt == NULL || (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) return NULL; return rmx_taop(rt->rt_rmx); } /* * Clear all the TAO cache entries, called from tcp_init. * * XXX * This routine is just an empty one, because we assume that the * routing tables are initialized at the same time as TCP, so there is * nothing in the cache left over. */ static void tcp_cleartaocache() { } /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (but not meant to be identical) to TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only affects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches.
A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typically 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rollover 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case.
Without the * slop we could be locking ourselves into a lower bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 105193) +++ head/sys/netinet/udp_usrreq.c (revision 105194) @@ -1,1077 +1,1071 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
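/*
 * A minimal stand-alone sketch of the bwnd arithmetic above, assuming
 * illustrative values for hz and the rtt/segment inputs; bdp_window()
 * is a hypothetical helper, not part of this file, and it collapses
 * USERTT down to a single shifted-srtt argument for brevity.
 */
#include <stdio.h>
#include <stdint.h>

#define HZ		100	/* assumed clock tick rate */
#define RTT_SHIFT	5	/* srtt kept << 5, as with TCP_RTT_SHIFT */

/* bwnd = bandwidth * rtt + 2 segments of slop, as computed above. */
static uint64_t
bdp_window(uint64_t bw, uint64_t srtt_shifted, uint64_t maxseg)
{
	return (bw * srtt_shifted / ((uint64_t)HZ << RTT_SHIFT) +
	    2 * maxseg);
}

int
main(void)
{
	/*
	 * 1 Mbyte/sec across a 100 ms RTT (10 ticks at hz=100): the
	 * product is 100000 bytes, plus 2 * 1460 bytes of slop.
	 */
	printf("bwnd = %llu bytes\n", (unsigned long long)
	    bdp_window(1000000, 10 << RTT_SHIFT, 1460));
	return (0);
}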
* * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 * $FreeBSD$ */ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #ifdef INET6 #include #endif #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #ifndef COMPAT_42 static int udpcksum = 1; #else static int udpcksum = 0; /* XXX */ #endif SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udpcksum, 0, ""); int log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send port unreachables for refused connects"); struct inpcbhead udb; /* from udp_var.h */ #define udb6 udb /* for KAME src sync over BSD*'s */ struct inpcbinfo udbinfo; #ifndef UDBHASHSIZE #define UDBHASHSIZE 16 #endif struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; #ifdef INET6 struct udp_in6 { struct sockaddr_in6 uin6_sin; u_char uin6_init_done : 1; } udp_in6 = { { sizeof(udp_in6.uin6_sin), AF_INET6 }, 0 }; struct udp_ip6 { struct ip6_hdr uip6_ip6; u_char uip6_init_done : 1; } udp_ip6; #endif /* INET6 */ static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off); #ifdef INET6 static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip); #endif static int udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); void udp_init() { INP_INFO_LOCK_INIT(&udbinfo, "udp"); LIST_INIT(&udb); udbinfo.listhead = &udb; udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.porthashmask); udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(udbinfo.ipi_zone, maxsockets); } void udp_input(m, off) register struct mbuf *m; int off; { int iphlen = off; register struct ip *ip; register struct udphdr *uh; register struct inpcb *inp; struct mbuf *opts = 0; int len; struct ip save_ip; struct sockaddr *append_sa; #ifdef MAC int error; #endif udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, * make available to user, and use on returned packets, * but we don't yet have a way to check the checksum * with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Make mbuf data length reflect UDP length. * If not enough data to reflect UDP length, drop. 
*/ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want to restore it * for sending an ICMP error message in response. */ if (!blackhole) save_ip = *ip; /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh->uh_sum = m->m_pkthdr.csum_data; else uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh->uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh->uh_sum) { udpstat.udps_badsum++; m_freem(m); return; } } else udpstat.udps_nosum++; INP_INFO_RLOCK(&udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { struct inpcb *last; /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match * those of the incoming datagram. This allows more than * one process to receive multi/broadcasts on the same port. * (This really ought to be done for unicast datagrams as * well, but that would cause problems with existing * applications that open both address-specific sockets and * a wildcard socket listening to the same port -- they would * end up receiving duplicates of every unicast datagram. * Those applications open the multiple sockets to overcome an * inadequacy of the UDP socket interface, but for backwards * compatibility we avoid the problem here rather than * fixing the interface. Maybe 4.5BSD will remedy this?) */ /* * Construct sockaddr format source address. */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Locate pcb(s) for datagram. * (Algorithm copied from raw_intr().) */ last = NULL; #ifdef INET6 udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0; #endif LIST_FOREACH(inp, &udb, inp_list) { INP_LOCK(inp); if (inp->inp_lport != uh->uh_dport) { docontinue: INP_UNLOCK(inp); continue; } #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) goto docontinue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY) { if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) goto docontinue; } if (inp->inp_faddr.s_addr != INADDR_ANY) { if (inp->inp_faddr.s_addr != ip->ip_src.s_addr || inp->inp_fport != uh->uh_sport) goto docontinue; } if (last != NULL) { struct mbuf *n; int policyfail; policyfail = 0; #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(m, last->inp_socket)) { ipsecstat.in_polvio++; policyfail = 1; /* do not inject data to pcb */ } #endif /*IPSEC*/ #ifdef MAC if (mac_check_socket_deliver(last->inp_socket, m) != 0) policyfail = 1; #endif if (!policyfail) { n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr)); } INP_UNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids searching * through all pcbs in the common case of a non-shared * port. It * assumes that an application will never * clear these options after setting them.
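/*
 * The SO_REUSEPORT heuristic just described is what shared-port
 * receivers rely on. A hedged user-space sketch, with error handling
 * trimmed; open_shared_udp() is illustrative, not a system interface.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

int
open_shared_udp(unsigned short port)
{
	struct sockaddr_in sin;
	int s, on = 1;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return (-1);
	/*
	 * Set before bind(); udp_input() only keeps scanning past a
	 * match when the matched pcb carries one of these options.
	 */
	setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(s);
		return (-1);
	}
	return (s);
}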
*/ if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. * (No need to send an ICMP Port Unreachable * for a broadcast or multicast datagram.) */ udpstat.udps_noportbcast++; goto badheadlocked; } #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(m, last->inp_socket)) { ipsecstat.in_polvio++; goto badheadlocked; } #endif /*IPSEC*/ INP_UNLOCK(last); INP_INFO_RUNLOCK(&udbinfo); udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); return; } /* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { udpstat.udps_noportbcast++; goto badheadlocked; } if (blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&udbinfo); return; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); #ifdef IPSEC if (ipsec4_in_reject_so(m, inp->inp_socket)) { ipsecstat.in_polvio++; goto bad; } #endif /*IPSEC*/ #ifdef MAC error = mac_check_socket_deliver(inp->inp_socket, m); if (error) goto bad; #endif /* * Construct sockaddr format source address. * Stuff source address and datagram in user buffer. */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { int savedflags; ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); savedflags = inp->inp_flags; inp->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); inp->inp_flags = savedflags; } else #endif ip_savecontrol(inp, &opts, ip, m); } m_adj(m, iphlen + sizeof(struct udphdr)); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); append_sa = (struct sockaddr *)&udp_in6; } else #endif append_sa = (struct sockaddr *)&udp_in; if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) { udpstat.udps_fullsock++; goto bad; } sorwakeup(inp->inp_socket); INP_UNLOCK(inp); return; badheadlocked: INP_INFO_RUNLOCK(&udbinfo); bad: if (inp) INP_UNLOCK(inp); badunlocked: m_freem(m); if (opts) m_freem(opts); return; } #ifdef INET6 static void ip_2_ip6_hdr(ip6, ip) struct ip6_hdr *ip6; struct ip *ip; { bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = ip->ip_len; ip6->ip6_nxt = ip->ip_p; ip6->ip6_hlim = ip->ip_ttl; ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_SMP; ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; } #endif /* * subroutine of udp_input(), mainly for source code readability. * caller must properly init udp_ip6 and udp_in6 beforehand.
*/ static void udp_append(last, ip, n, off) struct inpcb *last; struct ip *ip; struct mbuf *n; int off; { struct sockaddr *append_sa; struct mbuf *opts = 0; if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) { #ifdef INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; if (udp_ip6.uip6_init_done == 0) { ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); udp_ip6.uip6_init_done = 1; } savedflags = last->inp_flags; last->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n); last->inp_flags = savedflags; } else #endif ip_savecontrol(last, &opts, ip, n); } #ifdef INET6 if (last->inp_vflag & INP_IPV6) { if (udp_in6.uin6_init_done == 0) { in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); udp_in6.uin6_init_done = 1; } append_sa = (struct sockaddr *)&udp_in6.uin6_sin; } else #endif append_sa = (struct sockaddr *)&udp_in; m_adj(n, off); if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); udpstat.udps_fullsock++; } else sorwakeup(last->inp_socket); } /* * Notify a udp user of an asynchronous error; * just wake up so that he can collect error status. */ struct inpcb * udp_notify(inp, errno) register struct inpcb *inp; int errno; { inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return inp; } void udp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct udphdr *uh; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct in_addr faddr; struct inpcb *inp; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) { ip = 0; notify = in_rtchange; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if(inp->inp_socket != NULL) { (*notify)(inp, inetctlerrmap[cmd]); } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); splx(s); } else in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
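/*
 * udp_pcblist() implements the usual two-step sysctl protocol: the
 * probe with a NULL old pointer above yields a padded size estimate,
 * then a second call copies the xinpgen/xinpcb records out. A hedged
 * sketch of a user-space consumer; record parsing is omitted and
 * fetch_udp_pcblist() is illustrative.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdlib.h>

void *
fetch_udp_pcblist(size_t *lenp)
{
	void *buf;
	size_t len = 0;

	/* Step 1: ask how much space the list needs right now. */
	if (sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0) < 0)
		return (NULL);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	/* Step 2: fetch; may still fail if the list grew meanwhile. */
	if (sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0) < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}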
*/ s = splnet(); gencnt = udbinfo.ipi_gencnt; n = udbinfo.ipi_count; splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&udbinfo); for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&udbinfo); xig.xig_gen = udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp == NULL || inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_INFO_RUNLOCK(&udbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(inp, m, addr, control, td) register struct inpcb *inp; struct mbuf *m; struct sockaddr *addr; struct mbuf *control; struct thread *td; { register struct udpiphdr *ui; register int len = m->m_pkthdr.len; struct in_addr laddr; struct sockaddr_in *sin; int s = 0, error = 0; #ifdef MAC mac_create_mbuf_from_socket(inp->inp_socket, m); #endif if (control) m_freem(control); /* XXX */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { error = EMSGSIZE; goto release; } if (addr) { sin = (struct sockaddr_in *)addr; if (td && jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); laddr = inp->inp_laddr; if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Must block input while temporarily connected. 
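/*
 * The temporary connect performed here is the kernel-side cost of a
 * sendto() on an unconnected UDP socket: in_pcbconnect() on entry,
 * in_pcbdisconnect() on the way out, every time. A hedged user-space
 * sketch of the two calling styles; both helpers are illustrative.
 */
#include <sys/types.h>
#include <sys/socket.h>

/* One-shot: each datagram pays for the connect/disconnect dance. */
ssize_t
udp_send_once(int s, const void *msg, size_t len,
    const struct sockaddr *dst, socklen_t dstlen)
{
	return (sendto(s, msg, len, 0, dst, dstlen));
}

/* Steady traffic: connect once, then take the cheaper send() path. */
ssize_t
udp_send_connected(int s, const void *msg, size_t len,
    const struct sockaddr *dst, socklen_t dstlen)
{
	if (connect(s, dst, dstlen) < 0)
		return (-1);
	return (send(s, msg, len, 0));
}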
*/ s = splnet(); error = in_pcbconnect(inp, addr, td); if (error) { splx(s); goto release; } } else { if (inp->inp_faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf * for UDP and IP headers. */ M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; if (addr) splx(s); goto release; } /* * Fill in mbuf with extended UDP header * and addresses and length put into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = IPPROTO_UDP; ui->ui_src = inp->inp_laddr; ui->ui_dst = inp->inp_faddr; ui->ui_sport = inp->inp_lport; ui->ui_dport = inp->inp_fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); /* * Set up checksum and output datagram. */ if (udpcksum) { ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr, htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else { ui->ui_sum = 0; } ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; -#ifdef IPSEC - if (ipsec_setsocket(m, inp->inp_socket) != 0) { - error = ENOBUFS; - goto release; - } -#endif /*IPSEC*/ error = ip_output(m, inp->inp_options, &inp->inp_route, (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)), - inp->inp_moptions); + inp->inp_moptions, inp); if (addr) { in_pcbdisconnect(inp); inp->inp_laddr = laddr; /* XXX rehash? */ splx(s); } return (error); release: m_freem(m); return (error); } u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum incoming UDP datagram size"); static int udp_abort(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; /* ??? possible? panic instead? 
*/ } INP_LOCK(inp); soisdisconnected(so); s = splnet(); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); return 0; } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int s, error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp != 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } error = soreserve(so, udp_sendspace, udp_recvspace); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } s = splnet(); error = in_pcballoc(so, &udbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; INP_UNLOCK(inp); return 0; } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s, error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); s = splnet(); error = in_pcbbind(inp, nam, td); splx(s); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s, error; struct sockaddr_in *sin; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return EISCONN; } s = splnet(); sin = (struct sockaddr_in *)nam; if (td && jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td); splx(s); if (error == 0) soisconnected(so); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_detach(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); s = splnet(); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); return 0; } static int udp_disconnect(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_INFO_WUNLOCK(&udbinfo); INP_UNLOCK(inp); return ENOTCONN; } s = splnet(); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; int ret; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); m_freem(m); return EINVAL; } INP_LOCK(inp); ret = udp_output(inp, m, addr, control, td); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return ret; } int udp_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking * here because in_setsockaddr will call malloc and might block. */ static int udp_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &udbinfo)); } /* * This is the wrapper function for in_setpeeraddr. 
We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int udp_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &udbinfo)); } struct pr_usrreqs udp_usrreqs = { udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, pru_listen_notsupp, udp_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, udp_sockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 105193) +++ head/sys/netinet6/icmp6.c (revision 105194) @@ -1,2869 +1,2861 @@ /* $FreeBSD$ */ /* $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif #include #ifdef HAVE_NRL_INPCB /* inpcb members */ #define in6pcb inpcb #define in6p_laddr inp_laddr6 #define in6p_faddr inp_faddr6 #define in6p_icmp6filt inp_icmp6filt #define in6p_route inp_route #define in6p_socket inp_socket #define in6p_flags inp_flags #define in6p_moptions inp_moptions6 #define in6p_outputopts inp_outputopts6 #define in6p_ip6 inp_ipv6 #define in6p_flowinfo inp_flowinfo #define in6p_sp inp_sp #define in6p_next inp_next #define in6p_prev inp_prev /* macro names */ #define sotoin6pcb sotoinpcb /* function names */ #define in6_pcbdetach in_pcbdetach #define in6_rtchange in_rtchange /* * for KAME src sync over BSD*'s. XXX: FreeBSD (>=3) are VERY different from * others... 
*/ #define in6p_ip6_nxt inp_ipv6.ip6_nxt #endif extern struct domain inet6domain; struct icmp6stat icmp6stat; extern struct inpcbhead ripcb; extern int icmp6errppslim; static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; static void icmp6_errcount __P((struct icmp6errstat *, int, int)); static int icmp6_rip6_input __P((struct mbuf **, int)); static int icmp6_ratelimit __P((const struct in6_addr *, const int, const int)); static const char *icmp6_redirect_diag __P((struct in6_addr *, struct in6_addr *, struct in6_addr *)); #ifndef HAVE_PPSRATECHECK static int ppsratecheck __P((struct timeval *, int *, int)); #endif static struct mbuf *ni6_input __P((struct mbuf *, int)); static struct mbuf *ni6_nametodns __P((const char *, int, int)); static int ni6_dnsmatch __P((const char *, int, const char *, int)); static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, char *)); static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int)); static int icmp6_notify_error __P((struct mbuf *, int, int, int)); #ifdef COMPAT_RFC1885 static struct route_in6 icmp6_reflect_rt; #endif void icmp6_init() { mld6_init(); } static void icmp6_errcount(stat, type, code) struct icmp6errstat *stat; int type, code; { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: stat->icp6errs_dst_unreach_noroute++; return; case ICMP6_DST_UNREACH_ADMIN: stat->icp6errs_dst_unreach_admin++; return; case ICMP6_DST_UNREACH_BEYONDSCOPE: stat->icp6errs_dst_unreach_beyondscope++; return; case ICMP6_DST_UNREACH_ADDR: stat->icp6errs_dst_unreach_addr++; return; case ICMP6_DST_UNREACH_NOPORT: stat->icp6errs_dst_unreach_noport++; return; } break; case ICMP6_PACKET_TOO_BIG: stat->icp6errs_packet_too_big++; return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: stat->icp6errs_time_exceed_transit++; return; case ICMP6_TIME_EXCEED_REASSEMBLY: stat->icp6errs_time_exceed_reassembly++; return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: stat->icp6errs_paramprob_header++; return; case ICMP6_PARAMPROB_NEXTHEADER: stat->icp6errs_paramprob_nextheader++; return; case ICMP6_PARAMPROB_OPTION: stat->icp6errs_paramprob_option++; return; } break; case ND_REDIRECT: stat->icp6errs_redirect++; return; } stat->icp6errs_unknown++; } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(m, type, code, param) struct mbuf *m; int type, code, param; { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; icmp6stat.icp6s_error++; /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { icmp6stat.icp6s_canterror++; goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * Multicast destination check. For unrecognized option errors, * this check has already done in ip6_unknown_opt(), so we can * check only for other errors. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* Source address check. XXX: the case of anycast source? 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ icmp6stat.icp6s_canterror++; goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { icmp6stat.icp6s_toofreq++; goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_DONTWAIT); if (m && m->m_len < preplen) m = m_pullup(m, preplen); if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src)) oip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst)) oip6->ip6_dst.s6_addr16[1] = 0; icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). */ m->m_pkthdr.rcvif = NULL; icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP.
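/*
 * The suppression rules applied above condense to a short RFC 2463
 * checklist. A hedged restatement; the structure and function are
 * illustrative, not kernel interfaces, and the type/code constants
 * are written out numerically in the comments.
 */
#include <stdbool.h>

struct icmp6_err_ctx {
	bool dst_mcast;		/* offending packet had mcast/bcast dst */
	bool src_unusable;	/* unspecified or multicast source */
	bool answers_error;	/* would respond to an ICMPv6 error */
	bool over_rate;		/* the rate limiter said no */
	int type, code;
};

static bool
may_send_icmp6_error(const struct icmp6_err_ctx *c)
{
	/*
	 * Multicast destinations may only be answered with Packet Too
	 * Big (type 2) or the unrecognized-option Parameter Problem
	 * (type 4, code 2).
	 */
	if (c->dst_mcast && c->type != 2 &&
	    !(c->type == 4 && c->code == 2))
		return (false);
	if (c->src_unusable)	/* nowhere sensible to send the error */
		return (false);
	if (c->answers_error)	/* never answer an error or redirect */
		return (false);
	return (!c->over_rate);
}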
So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src))); icmp6stat.icp6s_checksum++; goto freeit; } if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; #ifdef COMPAT_RFC1885 case ICMP6_DST_UNREACH_NOTNEIGHBOR: code = PRC_UNREACH_SRCFAIL; break; #else case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; #endif case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); if (code != 0) goto badcode; code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: case ICMP6_TIME_EXCEED_REASSEMBLY: code += PRC_TIMXCEED_INTRANS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; const int maxlen = sizeof(*nip6) + sizeof(*nicmp6); /* * Prepare an internal mbuf. m_pullup() doesn't * always copy the length we specified. */ if (maxlen >= MCLBYTES) { /* Give up remote */ m_freem(n0); break; } MGETHDR(n, M_DONTWAIT, n0->m_type); if (n && maxlen >= MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ m_freem(n0); break; } M_COPY_PKTHDR(n, n0); /* * Copy IPv6 and ICMPv6 only.
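/*
 * The echo arm being built here is the kernel half of ping6(8). The
 * user half can be sketched with a raw ICMPv6 socket; the kernel fills
 * in the checksum for IPPROTO_ICMPV6 raw sockets, so none is computed
 * below. Error handling is trimmed and the function is illustrative.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <string.h>
#include <unistd.h>

int
send_echo_request(const struct sockaddr_in6 *dst)
{
	struct icmp6_hdr req;
	int s, error;

	if ((s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) < 0)
		return (-1);
	memset(&req, 0, sizeof(req));
	req.icmp6_type = ICMP6_ECHO_REQUEST;
	req.icmp6_code = 0;
	req.icmp6_id = htons((unsigned short)getpid());
	req.icmp6_seq = htons(1);
	error = sendto(s, &req, sizeof(req), 0,
	    (const struct sockaddr *)dst, sizeof(*dst)) < 0 ? -1 : 0;
	close(s);
	return (error);
}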
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); n->m_pkthdr.len += n0->m_pkthdr.len; n->m_next = n0; n0->m_flags &= ~M_PKTHDR; } else { nip6 = mtod(n, struct ip6_hdr *); nicmp6 = (struct icmp6_hdr *)((caddr_t)nip6 + off); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); m = NULL; goto freeit; } mld6_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? */ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; #define hostnamelen strlen(hostname) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxlen, maxhlen; if ((icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4; if (maxlen >= MCLBYTES) { /* Give up remote */ break; } MGETHDR(n, M_DONTWAIT, m->m_type); if (n && maxlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ break; } n->m_pkthdr.rcvif = NULL; n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - maxlen; if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only.
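/*
 * Taken together, the icmp6_nodeinfo tests in this file treat the
 * sysctl as a bit mask: bit 0 permits FQDN replies, bit 1 permits
 * node-address replies, bit 2 permits answering for temporary
 * addresses. A hedged restatement inferred from those checks; the
 * macro names and helper are illustrative, not from the headers, and
 * the combined (icmp6_nodeinfo & 5) != 5 gate for old-style WRU
 * queries is not modeled here.
 */
#define NI_ALLOW_FQDN		0x1
#define NI_ALLOW_NODEADDR	0x2
#define NI_ALLOW_TEMPADDR	0x4

static int
ni_policy_allows(int nodeinfo, int wants_fqdn, int via_temp_addr)
{
	if (via_temp_addr && (nodeinfo & NI_ALLOW_TEMPADDR) == 0)
		return (0);
	if (wants_fqdn)
		return ((nodeinfo & NI_ALLOW_FQDN) != 0);
	return ((nodeinfo & NI_ALLOW_NODEADDR) != 0);
}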
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); bcopy(hostname, p + 4, maxhlen); /* meaningless TTL */ noff = sizeof(struct ip6_hdr); M_COPY_PKTHDR(n, m); /* just for rcvif */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } #undef hostnamelen if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(m, off, icmp6len, code)) { /* In this case, m should've been freed. 
*/ return(IPPROTO_DONE); } break; badcode: icmp6stat.icp6s_badcode++; break; badlen: icmp6stat.icp6s_badlen++; break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(m, off, icmp6len, code) struct mbuf *m; int off, icmp6len; { struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc) __P((int, struct sockaddr *, void *)); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. 
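/*
 * The header walk above depends on two length encodings: AH counts
 * ip6e_len in 32-bit words beyond the first two, every other extension
 * header counts 64-bit words beyond the first one, and the fragment
 * header is fixed size. A hedged helper restating just that rule;
 * ext_hdr_bytes() is illustrative, not a kernel interface.
 */
#include <netinet/in.h>

static int
ext_hdr_bytes(int nxt, int ip6e_len)
{
	if (nxt == IPPROTO_AH)
		return ((ip6e_len + 2) << 2);	/* RFC 2402 encoding */
	if (nxt == IPPROTO_FRAGMENT)
		return (8);			/* always 8 bytes */
	return ((ip6e_len + 1) << 3);		/* RFC 2460 encoding */
}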
*/ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; icmp6dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &icmp6dst.sin6_addr); #ifndef SCOPEDROUTING if (in6_embedscope(&icmp6dst.sin6_addr, &icmp6dst, NULL, NULL)) { /* should be impossible */ nd6log((LOG_DEBUG, "icmp6_notify_error: in6_embedscope failed\n")); goto freeit; } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. */ bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; icmp6src.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &icmp6src.sin6_addr); #ifndef SCOPEDROUTING if (in6_embedscope(&icmp6src.sin6_addr, &icmp6src, NULL, NULL)) { /* should be impossible */ nd6log((LOG_DEBUG, "icmp6_notify_error: in6_embedscope failed\n")); goto freeit; } #endif icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)&notifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*) __P((int, struct sockaddr *, void *))) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } return(0); freeit: m_freem(m); return(-1); } void icmp6_mtudisc_update(ip6cp, validated) struct ip6ctlparam *ip6cp; int validated; { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct rtentry *rt = NULL; struct sockaddr_in6 sin6; if (!validated) return; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = PF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst; /* XXX normally, this won't happen */ if (IN6_IS_ADDR_LINKLOCAL(dst)) { sin6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); } /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */ rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING | RTF_PRCLONING); if (rt && (rt->rt_flags & RTF_HOST) && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { if (mtu < IPV6_MMTU) { /* xxx */ rt->rt_rmx.rmx_locks |= RTV_MTU; } else if (mtu < rt->rt_ifp->if_mtu && rt->rt_rmx.rmx_mtu > mtu) { icmp6stat.icp6s_pmtuchg++; rt->rt_rmx.rmx_mtu = mtu; } } if (rt) { /* XXX: need braces to avoid conflict with else in RTFREE.
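/*
 * The acceptance policy in icmp6_mtudisc_update() can be restated
 * compactly: an advertised MTU below the IPv6 minimum locks the route
 * instead of shrinking it, and only decreases below both the interface
 * MTU and the current estimate are recorded. A hedged sketch;
 * pmtu_accept() and its arguments are illustrative.
 */
#define IPV6_MINIMUM_MTU	1280	/* IPV6_MMTU */

/* Returns the MTU to record, or 0 when the advertisement is ignored. */
static unsigned int
pmtu_accept(unsigned int advertised, unsigned int if_mtu,
    unsigned int rt_mtu)
{
	if (advertised < IPV6_MINIMUM_MTU)
		return (0);	/* the kernel locks rmx_mtu instead */
	if (advertised < if_mtu && rt_mtu > advertised)
		return (advertised);
	return (0);
}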
*/ RTFREE(rt); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ #define hostnamelen strlen(hostname) static struct mbuf * ni6_input(m, off) struct mbuf *m; int off; { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct sockaddr_in6 sin6; /* double meaning; ip6_dst and subjectaddr */ struct sockaddr_in6 sin6_d; /* XXX: we should retrieve this from m_aux */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return NULL; } #endif /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [icmp-name-lookups-07, Section 4.] */ bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); /* XXX scopeid */ if ((ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)&sin6)) != NULL) { /* unicast/anycast, fine */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } else if (IN6_IS_ADDR_MC_LINKLOCAL(&sin6.sin6_addr)) ; /* link-local multicast, fine */ else goto bad; /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(sin6.sin6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? 
*/ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&sin6.sin6_addr); sin6.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &sin6.sin6_addr); #ifndef SCOPEDROUTING in6_embedscope(&sin6.sin6_addr, &sin6, NULL, NULL); #endif bzero(&sin6_d, sizeof(sin6_d)); sin6_d.sin6_family = AF_INET6; /* not used, actually */ sin6_d.sin6_len = sizeof(sin6_d); /* ditto */ sin6_d.sin6_addr = ip6->ip6_dst; sin6_d.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); #ifndef SCOPEDROUTING in6_embedscope(&sin6_d.sin6_addr, &sin6_d, NULL, NULL); #endif subj = (char *)&sin6; if (SA6_ARE_ADDR_EQUAL(&sin6, &sin6_d)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ n = ni6_nametodns(hostname, hostnamelen, 0); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((icmp6_nodeinfo & 1) == 0) goto bad; break; case NI_QTYPE_NODEADDR: if ((icmp6_nodeinfo & 2) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. */ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { m_freem(m); return(NULL); } M_COPY_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... 
*/ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? */ n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return(n); bad: m_freem(m); if (n) m_freem(n); return(NULL); } #undef hostnamelen /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. */ static struct mbuf * ni6_nametodns(name, namelen, old) const char *name; int namelen; int old; /* return pascal string if non-zero */ { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". 
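 *
 * A worked example of the encoding produced here
 * (illustrative): "foo.example.com" has two dots, so
 * nterm == 1 and the output is
 *
 *	\3 f o o \7 e x a m p l e \3 c o m \0
 *
 * while a shortened name such as "foo" gets nterm == 2,
 *
 *	\3 f o o \0 \0
 *
 * which ni6_dnsmatch() below recognizes as truncated.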
*/ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(a, alen, b, blen) const char *a; int alen; const char *b; int blen; { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(ni6, m, ifpp, subj) struct icmp6_nodeinfo *ni6; struct mbuf *m; struct ifnet **ifpp; char *subj; { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct sockaddr_in6 *subj_ip6 = NULL; /* XXX pedant */ int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return(0); subj_ip6 = (struct sockaddr_in6 *)subj; break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return(0); } } for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { addrsofif = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(&subj_ip6->sin6_addr, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. 
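 *
 * (That is, an anycast address is reported only when the
 * querier explicitly set NI_NODEADDR_FLAG_ANYCAST; the
 * same test is repeated in ni6_store_addrs() below.)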
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; return(addrsofif); } addrs += addrsofif; } return(addrs); } static int ni6_store_addrs(ni6, nni6, ifp0, resid) struct icmp6_nodeinfo *ni6, *nni6; struct ifnet *ifp0; int resid; { struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&ifnet); struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return(0); /* needless to copy */ again: for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; return(copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. 
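 *
 * Worked example (figures illustrative): with
 * ia6t_expire == 1000 and time_second == 400, the reply
 * advertises htonl(600) as the remaining lifetime, while
 * ia6t_expire == 0 is reported as ND6_INFINITE_LIFETIME.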
*/ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_second) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second); else ltime = 0; } bcopy(&ltime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); /* XXX: KAME link-local hack; remove ifindex */ if (IN6_IS_ADDR_LINKLOCAL(&ifa6->ia_addr.sin6_addr)) ((struct in6_addr *)cp)->s6_addr16[1] = 0; cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } return(copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(mp, off) struct mbuf **mp; int off; { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct in6pcb *in6p; struct in6pcb *last = NULL; struct sockaddr_in6 rip6src; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return IPPROTO_DONE; } #endif bzero(&rip6src, sizeof(rip6src)); rip6src.sin6_len = sizeof(struct sockaddr_in6); rip6src.sin6_family = AF_INET6; /* KAME hack: recover scopeid */ (void)in6_recoverscope(&rip6src, &ip6->ip6_src, m->m_pkthdr.rcvif); LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; #ifdef HAVE_NRL_INPCB if (!(in6p->in6p_flags & INP_IPV6)) continue; #endif if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (in6p->in6p_icmp6filt && ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) continue; if (last) { struct mbuf *n; if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, &opts, ip6, n); /* strip intermediate headers */ m_adj(n, off); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } } else sorwakeup(last->in6p_socket); opts = NULL; } } last = in6p; } if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, &opts, ip6, m); /* strip intermediate headers */ m_adj(m, off); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); } else sorwakeup(last->in6p_socket); } else { m_freem(m); ip6stat.ip6s_delivered--; } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. 
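 *
 * A sketch of the assumed mbuf layout (illustrative):
 *
 *	<------------- off ------------->
 *	| ip6_hdr | ext. headers (if any) | icmp6_hdr | ...
 *
 * so off == sizeof(struct ip6_hdr) when no extension
 * headers precede the ICMPv6 header; any smaller off
 * fails the sanity check below.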
*/ void icmp6_reflect(m, off) struct mbuf *m; size_t off; { struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; struct in6_addr t, *src = 0; int plen; int type, code; struct ifnet *outif = NULL; struct sockaddr_in6 sa6_src, sa6_dst; #ifdef COMPAT_RFC1885 int mtu = IPV6_MMTU; struct sockaddr_in6 *sin6 = &icmp6_reflect_rt.ro_dst; #endif /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ t = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * XXX: make sure to embed scope zone information, using * already embedded IDs or the received interface (if any). * Note that rcvif may be NULL. * TODO: scoped routing case (XXX). */ bzero(&sa6_src, sizeof(sa6_src)); sa6_src.sin6_family = AF_INET6; sa6_src.sin6_len = sizeof(sa6_src); sa6_src.sin6_addr = ip6->ip6_dst; in6_recoverscope(&sa6_src, &ip6->ip6_dst, m->m_pkthdr.rcvif); in6_embedscope(&ip6->ip6_dst, &sa6_src, NULL, NULL); bzero(&sa6_dst, sizeof(sa6_dst)); sa6_dst.sin6_family = AF_INET6; sa6_dst.sin6_len = sizeof(sa6_dst); sa6_dst.sin6_addr = t; in6_recoverscope(&sa6_dst, &t, m->m_pkthdr.rcvif); in6_embedscope(&t, &sa6_dst, NULL, NULL); #ifdef COMPAT_RFC1885 /* * xxx guess MTU * RFC 1885 requires that echo reply should be truncated if it * does not fit in with (return) path MTU, but the description was * removed in the new spec. */ if (icmp6_reflect_rt.ro_rt == 0 || ! (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_dst))) { if (icmp6_reflect_rt.ro_rt) { RTFREE(icmp6_reflect_rt.ro_rt); icmp6_reflect_rt.ro_rt = 0; } bzero(sin6, sizeof(*sin6)); sin6->sin6_family = PF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_addr = ip6->ip6_dst; rtalloc_ign((struct route *)&icmp6_reflect_rt.ro_rt, RTF_PRCLONING); } if (icmp6_reflect_rt.ro_rt == 0) goto bad; if ((icmp6_reflect_rt.ro_rt->rt_flags & RTF_HOST) && mtu < icmp6_reflect_rt.ro_rt->rt_ifp->if_mtu) mtu = icmp6_reflect_rt.ro_rt->rt_rmx.rmx_mtu; if (mtu < m->m_pkthdr.len) { plen -= (m->m_pkthdr.len - mtu); m_adj(m, mtu - m->m_pkthdr.len); } #endif /* * If the incoming packet was addressed directly to us(i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case would be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. 
*/ for (ia = in6_ifaddr; ia; ia = ia->ia_next) if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) && (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { src = &t; break; } if (ia == NULL && IN6_IS_ADDR_LINKLOCAL(&t) && (m->m_flags & M_LOOP)) { /* * This is the case if the dst is our link-local address * and the sender is also ourselves. */ src = &t; } if (src == 0) { int e; struct route_in6 ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &e); if (ro.ro_rt) RTFREE(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(&sa6_src.sin6_addr), e)); goto bad; } } ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; if (m->m_pkthdr.rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = nd_ifinfo[m->m_pkthdr.rcvif->if_index].chlim; } else ip6->ip6_hlim = ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif /*IPSEC*/ #ifdef COMPAT_RFC1885 - ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif); + ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif, NULL); #else - ip6_output(m, NULL, NULL, 0, NULL, &outif); + ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); #endif if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo() { mld6_fasttimeo(); } static const char * icmp6_redirect_diag(src6, dst6, tgt6) struct in6_addr *src6; struct in6_addr *dst6; struct in6_addr *tgt6; { static char buf[1024]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(src6), ip6_sprintf(dst6), ip6_sprintf(tgt6)); return buf; } void icmp6_redirect_input(m, off) struct mbuf *m; int off; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; if (!m || !ifp) return; /* XXX if we are router, we don't update route by icmp6 redirect */ if (ip6_forwarding) goto freeit; if (!icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) redtgt6.s6_addr16[1] = htons(ifp->if_index); if (IN6_IS_ADDR_LINKLOCAL(&reddst6)) reddst6.s6_addr16[1] = htons(ifp->if_index); /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(&src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(&src6), ip6->ip6_hlim)); goto bad; } { /* 
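 * (Illustrative scenario for the check below: if the
 * current route to the redirect destination points at
 * gateway fe80::1%ne0, only a Redirect whose outer IPv6
 * source is exactly that gateway is honored; one arriving
 * from any other address is rejected.)
 *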
ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(gw6), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE(rt); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } RTFREE(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (ndopts.nd_opts_rh) { redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len; redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */ } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(&redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); if (!is_onlink) { /* better router case. perform rtredirect. 
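 *
 * (rtredirect() takes (dst, gateway, netmask, flags, src,
 * rtp); below, sdst is the redirect destination, sgw the
 * new first hop (the redirect target), and ssrc the
 * router that sent the redirect, with a NULL netmask
 * since RTF_HOST makes it a host route.)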
*/ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; bzero(&sdst, sizeof(sdst)); bzero(&sgw, sizeof(sgw)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&ssrc, (struct rtentry **)NULL); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif } freeit: m_freem(m); return; bad: icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(m0, rt) struct mbuf *m0; struct rtentry *rt; { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!ip6_forwarding || ip6_accept_rtadv) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; /* we don't currently use sin6_scope_id, but eventually use it */ src_sa.sin6_scope_id = in6_addr2scopeid(ifp, &sip6->ip6_src); if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) MCLGET(m, M_DONTWAIT); if (!m) goto fail; m->m_pkthdr.rcvif = NULL; m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ struct rtentry *rt_router = NULL; int len; struct sockaddr_dl *sdl; struct nd_opt_hdr *nd_opt; char *lladdr; rt_router = nd6_lookup(router_ll6, 0, ifp); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); p += len; } } nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by original * ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
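 *
 * Worked example (illustrative): a 60-byte original
 * packet gives extra == 4; if at least 4 bytes of
 * trailing space remain it is padded to 64 bytes,
 * otherwise truncated to 56, keeping the option body a
 * multiple of 8 bytes.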
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d\n", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; } noredhdropt:; if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_src)) sip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_dst)) sip6->ip6_dst.s6_addr16[1] = 0; #if 0 if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = 0; #endif if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_target)) nd_rd->nd_rd_target.s6_addr16[1] = 0; if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_dst)) nd_rd->nd_rd_dst.s6_addr16[1] = 0; ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif /*IPSEC*/ - ip6_output(m, NULL, NULL, 0, NULL, &outif); + ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } #ifdef HAVE_NRL_INPCB #define sotoin6pcb sotoinpcb #define in6pcb inpcb #define in6p_icmp6filt inp_icmp6filt #endif /* * ICMPv6 socket option processing. 
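 *
 * A hypothetical userland sketch of the ICMP6_FILTER
 * option handled below (RFC 2292 macros, illustrative):
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	struct icmp6_filter filt;
 *
 *	ICMP6_FILTER_SETBLOCKALL(&filt);
 *	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
 *	if (setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER,
 *	    &filt, sizeof(filt)) < 0)
 *		err(1, "setsockopt");
 *
 * which reaches this function as a PRCO_SETOPT and is
 * copied into in6p_icmp6filt via sooptcopyin().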
*/ int icmp6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter *p; if (optlen != sizeof(*p)) { error = EMSGSIZE; break; } if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, optlen); break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyout(sopt, inp->in6p_icmp6filt, sizeof(struct icmp6_filter)); break; } default: error = ENOPROTOOPT; break; } break; } return(error); } #ifdef HAVE_NRL_INPCB #undef sotoin6pcb #undef in6pcb #undef in6p_icmp6filt #endif #ifndef HAVE_PPSRATECHECK #ifndef timersub #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) #endif /* * ppsratecheck(): packets (or events) per second limitation. */ static int ppsratecheck(lasttime, curpps, maxpps) struct timeval *lasttime; int *curpps; int maxpps; /* maximum pps allowed */ { struct timeval tv, delta; int s, rv; s = splclock(); microtime(&tv); splx(s); timersub(&tv, lasttime, &delta); /* * Check for 0,0 so that the message will be seen at least once. * If more than one second has passed since the last update of * lasttime, reset the counter. * * We do increment *curpps even in *curpps < maxpps case, as some may * try to use *curpps for stat purposes as well. */ if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) || delta.tv_sec >= 1) { *lasttime = tv; *curpps = 0; rv = 1; } else if (maxpps < 0) rv = 1; else if (*curpps < maxpps) rv = 1; else rv = 0; #if 1 /* DIAGNOSTIC? */ /* be careful about wrap-around */ if (*curpps + 1 > *curpps) *curpps = *curpps + 1; #else /* * assume that there's not too many calls to this function. * not sure if the assumption holds, as it depends on *caller's* * behavior, not the behavior of this function. * IMHO it is wrong to make assumption on the caller's behavior, * so the above #if is #if 1, not #ifdef DIAGNOSTIC. */ *curpps = *curpps + 1; #endif return (rv); } #endif /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? */ static int icmp6_ratelimit(dst, type, code) const struct in6_addr *dst; /* not used at this moment */ const int type; /* not used at this moment */ const int code; /* not used at this moment */ { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6_gif.c =================================================================== --- head/sys/netinet6/in6_gif.c (revision 105193) +++ head/sys/netinet6/in6_gif.c (revision 105194) @@ -1,343 +1,343 @@ /* $FreeBSD$ */ /* $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #ifdef INET6 #include #include #include #include #endif #include #ifdef INET6 #include #endif #include #include int in6_gif_output(ifp, family, m, rt) struct ifnet *ifp; int family; /* family of the packet to be encapsulate. 
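 *
 * (Resulting packet, sketched: an outer IPv6 header is
 * prepended, e.g. for an inner IPv4 packet
 *
 *	| outer ip6, ip6_nxt = IPPROTO_IPV4 | inner ip | ... |
 *
 * with the outer addresses taken from gif_psrc/gif_pdst.)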
*/ struct mbuf *m; struct rtentry *rt; { struct gif_softc *sc = (struct gif_softc*)ifp; struct sockaddr_in6 *dst = (struct sockaddr_in6 *)&sc->gif_ro6.ro_dst; struct sockaddr_in6 *sin6_src = (struct sockaddr_in6 *)sc->gif_psrc; struct sockaddr_in6 *sin6_dst = (struct sockaddr_in6 *)sc->gif_pdst; struct ip6_hdr *ip6; int proto; u_int8_t itos, otos; if (sin6_src == NULL || sin6_dst == NULL || sin6_src->sin6_family != AF_INET6 || sin6_dst->sin6_family != AF_INET6) { m_freem(m); return EAFNOSUPPORT; } switch (family) { #ifdef INET case AF_INET: { struct ip *ip; proto = IPPROTO_IPV4; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return ENOBUFS; } ip = mtod(m, struct ip *); itos = ip->ip_tos; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; proto = IPPROTO_IPV6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; break; } #endif default: #ifdef DEBUG printf("in6_gif_output: warning: unknown family %d passed\n", family); #endif m_freem(m); return EAFNOSUPPORT; } /* prepend new IP header */ M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { printf("ENOBUFS in in6_gif_output %d\n", __LINE__); return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)m->m_pkthdr.len); ip6->ip6_nxt = proto; ip6->ip6_hlim = ip6_gif_hlim; ip6->ip6_src = sin6_src->sin6_addr; /* bidirectional configured tunnel mode */ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6_dst->sin6_addr)) ip6->ip6_dst = sin6_dst->sin6_addr; else { m_freem(m); return ENETUNREACH; } if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); else ip_ecn_ingress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~ntohl(0xff00000); ip6->ip6_flow |= htonl((u_int32_t)otos << 20); if (dst->sin6_family != sin6_dst->sin6_family || !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &sin6_dst->sin6_addr)) { /* cache route doesn't match */ bzero(dst, sizeof(*dst)); dst->sin6_family = sin6_dst->sin6_family; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = sin6_dst->sin6_addr; if (sc->gif_ro6.ro_rt) { RTFREE(sc->gif_ro6.ro_rt); sc->gif_ro6.ro_rt = NULL; } #if 0 sc->gif_if.if_mtu = GIF_MTU; #endif } if (sc->gif_ro6.ro_rt == NULL) { rtalloc((struct route *)&sc->gif_ro6); if (sc->gif_ro6.ro_rt == NULL) { m_freem(m); return ENETUNREACH; } /* if it constitutes infinite encapsulation, punt. */ if (sc->gif_ro.ro_rt->rt_ifp == ifp) { m_freem(m); return ENETUNREACH; /*XXX*/ } #if 0 ifp->if_mtu = sc->gif_ro6.ro_rt->rt_ifp->if_mtu - sizeof(struct ip6_hdr); #endif } #ifdef IPV6_MINMTU /* * force fragmentation to minimum MTU, to avoid path MTU discovery. * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. 
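 *
 * (IPV6_MINMTU in the flags argument asks ip6_output()
 * to use the 1280-byte minimum IPv6 MTU for this packet,
 * so fragmentation happens here instead of relying on
 * packet-too-big feedback for the inner flow.)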
*/ - return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL)); + return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, NULL)); #else - return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL)); + return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, NULL)); #endif } int in6_gif_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp; struct ifnet *gifp = NULL; struct ip6_hdr *ip6; int af = 0; u_int32_t otos; ip6 = mtod(m, struct ip6_hdr *); gifp = (struct ifnet *)encap_getarg(m); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); ip6stat.ip6s_nogif++; return IPPROTO_DONE; } otos = ip6->ip6_flow; m_adj(m, *offp); switch (proto) { #ifdef INET case IPPROTO_IPV4: { struct ip *ip; u_int8_t otos8; af = AF_INET; otos8 = (ntohl(otos) >> 20) & 0xff; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return IPPROTO_DONE; } ip = mtod(m, struct ip *); if (gifp->if_flags & IFF_LINK1) ip_ecn_egress(ECN_ALLOWED, &otos8, &ip->ip_tos); else ip_ecn_egress(ECN_NOCARE, &otos8, &ip->ip_tos); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_IPV6: { struct ip6_hdr *ip6; af = AF_INET6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return IPPROTO_DONE; } ip6 = mtod(m, struct ip6_hdr *); if (gifp->if_flags & IFF_LINK1) ip6_ecn_egress(ECN_ALLOWED, &otos, &ip6->ip6_flow); else ip6_ecn_egress(ECN_NOCARE, &otos, &ip6->ip6_flow); break; } #endif default: ip6stat.ip6s_nogif++; m_freem(m); return IPPROTO_DONE; } gif_input(m, af, gifp); return IPPROTO_DONE; } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int gif_encapcheck6(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip6_hdr ip6; struct gif_softc *sc; struct sockaddr_in6 *src, *dst; int addrmatch; /* sanity check done in caller */ sc = (struct gif_softc *)arg; src = (struct sockaddr_in6 *)sc->gif_psrc; dst = (struct sockaddr_in6 *)sc->gif_pdst; m_copydata(m, 0, sizeof(ip6), (caddr_t)&ip6); /* check for address match */ addrmatch = 0; if (IN6_ARE_ADDR_EQUAL(&src->sin6_addr, &ip6.ip6_dst)) addrmatch |= 1; if (IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6.ip6_src)) addrmatch |= 2; if (addrmatch != 3) return 0; /* martian filters on outer source - done in ip6_input */ /* ingress filters on outer source */ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { struct sockaddr_in6 sin6; struct rtentry *rt; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = ip6.ip6_src; /* XXX scopeid */ rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { #if 0 log(LOG_WARNING, "%s: packet from %s dropped " "due to ingress filter\n", if_name(&sc->gif_if), ip6_sprintf(&sin6.sin6_addr)); #endif if (rt) rtfree(rt); return 0; } rtfree(rt); } return 128 * 2; } Index: head/sys/netinet6/ip6_input.c =================================================================== --- head/sys/netinet6/ip6_input.c (revision 105193) +++ head/sys/netinet6/ip6_input.c (revision 105194) @@ -1,1672 +1,1657 @@ /* $FreeBSD$ */ /* $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pfil_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PFIL_HOOKS #include #endif #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif #include #include #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; static int ip6qmaxlen = IFQ_MAXLEN; struct in6_ifaddr *in6_ifaddr; extern struct callout in6_tmpaddrtimer_ch; int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ int ip6_sourcecheck_interval; /* XXX */ int ip6_ours_check_algorithm; /* firewall hooks */ ip6_fw_chk_t *ip6_fw_chk_ptr; ip6_fw_ctl_t *ip6_fw_ctl_ptr; int ip6_fw_enable = 1; struct ip6stat ip6stat; static void ip6_init2 __P((void *)); -static struct mbuf *ip6_setdstifaddr __P((struct mbuf *, struct in6_ifaddr *)); +static struct ip6aux *ip6_setdstifaddr __P((struct mbuf *, struct in6_ifaddr *)); static int ip6_hopopts_input __P((u_int32_t *, u_int32_t *, struct mbuf **, int *)); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr __P((struct mbuf *, size_t, int)); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init() { struct ip6protosw *pr; int i; struct timeval tv; #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); #endif pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip6_init"); for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; for (pr = (struct ip6protosw *)inet6domain.dom_protosw; pr < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) ip6_protox[pr->pr_protocol] = pr - inet6sw; ip6intrq.ifq_maxlen = ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); ip6intrq_present = 1; register_netisr(NETISR_IPV6, ip6intr); nd6_init(); frag6_init(); /* * in many cases, random() here does NOT return random number * as initialization during bootstrap time occur in fixed order. */ microtime(&tv); ip6_flow_seq = random() ^ tv.tv_usec; microtime(&tv); ip6_desync_factor = (random() ^ tv.tv_usec) % MAX_TEMP_DESYNC_FACTOR; } static void ip6_init2(dummy) void *dummy; { /* * to route local address of p2p link to loopback, * assign loopback address first. */ in6_ifattach(&loif[0], NULL); /* nd6_timer_init */ callout_init(&nd6_timer_ch, 0); callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); /* router renumbering prefix list maintenance */ callout_init(&in6_rr_timer_ch, 0); callout_reset(&in6_rr_timer_ch, hz, in6_rr_timer, NULL); /* timer for regeneranation of temporary addresses randomize ID */ callout_reset(&in6_tmpaddrtimer_ch, (ip6_temp_preferred_lifetime - ip6_desync_factor - ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); /* * IP6 input interrupt handling. Just pass the packet to ip6_input. 
*/ void ip6intr() { int s; struct mbuf *m; for (;;) { s = splimp(); IF_DEQUEUE(&ip6intrq, m); splx(s); if (m == 0) return; ip6_input(m); } } extern struct route_in6 ip6_forward_rt; void ip6_input(m) struct mbuf *m; { struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; u_int32_t plen; u_int32_t rtalert = ~0; int nxt, ours = 0; struct ifnet *deliverifp = NULL; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m0; int rv; #endif /* PFIL_HOOKS */ #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). */ if (m) { m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; } #endif /* * make sure we don't have onion peering information into m_aux. */ ip6_delaux(m); /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) ip6stat.ip6s_mext2m++; else ip6stat.ip6s_mext1++; } else { #define M2MMAX (sizeof(ip6stat.ip6s_m2m)/sizeof(ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { ip6stat.ip6s_m2m[loif[0].if_index]++; /* XXX */ } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else ip6stat.ip6s_m2m[0]++; } else ip6stat.ip6s_m1++; #undef M2MMAX } in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); ip6stat.ip6s_total++; #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n) M_COPY_PKTHDR(n, m); if (n && m->m_pkthdr.len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (n == NULL) { m_freem(m); return; /*ENOBUFS*/ } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = m->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /*nothing*/); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == 0) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { ip6stat.ip6s_badvers++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } #ifdef PFIL_HOOKS /* * Run through list of hooks for input packets. If there are any * filters which require that additional packets in the flow are * not fast-forwarded, they must clear the M_CANFASTFWD flag. * Note that filters must _never_ set this flag, as another filter * in the list may have previously cleared it. */ m0 = m; pfh = pfil_hook_get(PFIL_IN, &inet6sw[ip6_protox[IPPROTO_IPV6]].pr_pfh); for (; pfh; pfh = pfh->pfil_link.tqe_next) if (pfh->pfil_func) { rv = pfh->pfil_func(ip6, sizeof(*ip6), m->m_pkthdr.rcvif, 0, &m0); if (rv) return; m = m0; if (m == NULL) return; ip6 = mtod(m, struct ip6_hdr *); } #endif /* PFIL_HOOKS */ ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; /* If ipfw says divert, we have to just drop packet */ /* use port as a dummy argument */ if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) { m_freem(m); m = NULL; } if (!m) return; } /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. 
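 *
 * (Examples rejected here, for illustration: a source of
 * ff02::1, since a multicast source is never valid, or a
 * destination of ::, the unspecified address.)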
*/ ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if ((IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) || IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst)) && (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partical support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* drop packets if interface ID portion is already filled */ if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src) && ip6->ip6_src.s6_addr16[1]) { ip6stat.ip6s_badscope++; goto bad; } if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst) && ip6->ip6_dst.s6_addr16[1]) { ip6stat.ip6s_badscope++; goto bad; } } if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); #if 0 /* this case seems to be unnecessary. (jinmei, 20010401) */ /* * We use rt->rt_ifp to determine if the address is ours or not. * If rt_ifp is lo0, the address is ours. * The problem here is, rt->rt_ifp for fe80::%lo0/64 is set to lo0, * so any address under fe80::%lo0/64 will be mistakenly considered * local. The special case is supplied to handle the case properly * by actually looking at interface addresses * (using in6ifa_ifpwithaddr). */ if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) != 0 && IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { if (!in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst)) { icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); /* m is already freed */ return; } ours = 1; deliverifp = m->m_pkthdr.rcvif; goto hbhcheck; } #endif /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *in6m = 0; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); /* * See if we belong to the destination multicast group on the * arrival interface. */ IN6_LOOKUP_MULTI(ip6->ip6_dst, m->m_pkthdr.rcvif, in6m); if (in6m) ours = 1; else if (!ip6_mrouter) { ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } deliverifp = m->m_pkthdr.rcvif; goto hbhcheck; } /* * Unicast check */ switch (ip6_ours_check_algorithm) { default: /* * XXX: I intentionally broke our indentation rule here, * since this switch-case is just for measurement and * therefore should soon be removed. 
*/ if (ip6_forward_rt.ro_rt != NULL && (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &((struct sockaddr_in6 *)(&ip6_forward_rt.ro_dst))->sin6_addr)) ip6stat.ip6s_forward_cachehit++; else { struct sockaddr_in6 *dst6; if (ip6_forward_rt.ro_rt) { /* route is down or destination is different */ ip6stat.ip6s_forward_cachemiss++; RTFREE(ip6_forward_rt.ro_rt); ip6_forward_rt.ro_rt = 0; } bzero(&ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; #ifdef SCOPEDROUTING ip6_forward_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); #endif rtalloc_ign((struct route *)&ip6_forward_rt, RTF_PRCLONING); } #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) /* * Accept the packet if the forwarding interface to the destination * according to the routing table is the loopback interface, * unless the associated route has a gateway. * Note that this approach causes to accept a packet if there is a * route to the loopback interface for the destination of the packet. * But we think it's even useful in some situations, e.g. when using * a special daemon which wants to intercept the packet. * * XXX: some OSes automatically make a cloned route for the destination * of an outgoing packet. If the outgoing interface of the packet * is a loopback one, the kernel would consider the packet to be * accepted, even if we have no such address assinged on the interface. * We check the cloned flag of the route entry to reject such cases, * assuming that route entries for our own addresses are not made by * cloning (it should be true because in6_addloop explicitly installs * the host route). However, we might have to do an explicit check * while it would be less efficient. Or, should we rather install a * reject route for such a case? */ if (ip6_forward_rt.ro_rt && (ip6_forward_rt.ro_rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST && #ifdef RTF_WASCLONED !(ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) && #endif #ifdef RTF_CLONED !(ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) && #endif #if 0 /* * The check below is redundant since the comparison of * the destination and the key of the rtentry has * already done through looking up the routing table. */ IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt6_key(ip6_forward_rt.ro_rt)->sin6_addr) #endif ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_aux. */ (void)ip6_setdstifaddr(m, ia6); /* * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. */ if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ /* Count the packet in the ip address stats */ ia6->ia_ifa.if_ipackets++; ia6->ia_ifa.if_ibytes += m->m_pkthdr.len; goto hbhcheck; } else { /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst))); goto bad; } } } /* XXX indentation (see above) */ /* * FAITH(Firewall Aided Internet Translator) */ if (ip6_keepfaith) { if (ip6_forward_rt.ro_rt && ip6_forward_rt.ro_rt->rt_ifp && ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? 
*/ ours = 1; deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /* faith */ goto hbhcheck; } } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!ip6_forwarding) { ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * record address information into m_aux, if we don't have one yet. * note that we are unable to record it, if the address is not listed * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). */ if (deliverifp && !ip6_getdstifaddr(m)) { struct in6_ifaddr *ia6; ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) { if (!ip6_setdstifaddr(m, ia6)) { /* * XXX maybe we should drop the packet here, * as we could not provide enough information * to the upper layers. */ } } } /* * Process the Hop-by-Hop options header if it is present. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif return; /* m has already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); return; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return; } #endif nxt = hbh->ip6h_nxt; /* * accept the packet if a router alert option is included * and we act as an IPv6 router. */ if (rtalert != ~0 && ip6_forwarding) ours = 1; } else nxt = ip6->ip6_nxt; /* * Check that the amount of data in the buffers * is at least as much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below.
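The trim/drop logic above (after the hbhcheck label) reconciles the data actually received with the payload length the header claims, possibly replaced by a Jumbo Payload value: too little data means the packet is dropped, extra data is trimmed. A sketch of the same arithmetic on a flat buffer (reconcile_len is an illustrative name):

#include <stddef.h>

#define IP6_HDRLEN 40	/* sizeof(struct ip6_hdr) */

/* returns the usable packet length, or -1 to drop */
static long
reconcile_len(size_t rcvd, unsigned long plen)
{
	if (rcvd < IP6_HDRLEN || rcvd - IP6_HDRLEN < plen)
		return -1;			/* too short: drop (ip6s_tooshort) */
	return (long)(IP6_HDRLEN + plen);	/* trim anything beyond this point */
}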
*/ if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; m_freem(m); return; } if (!ours) { m_freem(m); return; } } else if (!ours) { ip6_forward(m, 0); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { ip6stat.ip6s_toomanyhdr++; goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #if 0 /* * do we need to do it for every header? yeah, other * functions can play with it (like re-allocate and copy). */ mhist = ip6_addaux(m); if (mhist && M_TRAILINGSPACE(mhist) >= sizeof(nxt)) { hist = mtod(mhist, caddr_t) + mhist->m_len; bcopy(&nxt, hist, sizeof(nxt)); mhist->m_len += sizeof(nxt); } else { ip6stat.ip6s_toomanyhdr++; goto bad; } #endif #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 && ipsec6_in_reject(m, NULL)) { ipsec6stat.in_polvio++; goto bad; } #endif nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * set/grab in6_ifaddr correspond to IPv6 destination address. * XXX backward compatibility wrapper */ -static struct mbuf * +static struct ip6aux * ip6_setdstifaddr(m, ia6) struct mbuf *m; struct in6_ifaddr *ia6; { - struct mbuf *n; + struct ip6aux *n; n = ip6_addaux(m); if (n) - mtod(n, struct ip6aux *)->ip6a_dstia6 = ia6; + n->ip6a_dstia6 = ia6; return n; /* NULL if failed to set */ } struct in6_ifaddr * ip6_getdstifaddr(m) struct mbuf *m; { - struct mbuf *n; + struct ip6aux *n; n = ip6_findaux(m); if (n) - return mtod(n, struct ip6aux *)->ip6a_dstia6; + return n->ip6a_dstia6; else return NULL; } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. 
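The ip6_setdstifaddr()/ip6_getdstifaddr() hunks above are part of this commit's conversion from the old m_aux mbuf chain to the m_tag(9) packet-tag API: find the tag, or allocate one and prepend it, then hand back the payload that follows the tag header (the kernel's tag + 1). A hedged userland analog of that pattern; struct pkt_tag, struct packet, and the value of TAG_IPV6_INPUT are illustrative:

#include <stdlib.h>
#include <string.h>

struct pkt_tag {
	struct pkt_tag	*next;
	int		 type;
	size_t		 len;
	/* payload follows the header, like (tag + 1) in the kernel */
};

struct packet { struct pkt_tag *tags; };

#define TAG_IPV6_INPUT 1

static struct pkt_tag *
tag_find(struct packet *p, int type)		/* cf. m_tag_find() */
{
	struct pkt_tag *t;

	for (t = p->tags; t != NULL; t = t->next)
		if (t->type == type)
			return t;
	return NULL;
}

static void *
pkt_addaux(struct packet *p, size_t len)	/* cf. ip6_addaux() */
{
	struct pkt_tag *t = tag_find(p, TAG_IPV6_INPUT);

	if (t == NULL) {
		t = malloc(sizeof(*t) + len);	/* cf. m_tag_get() */
		if (t == NULL)
			return NULL;		/* caller copes, as in the kernel */
		t->type = TAG_IPV6_INPUT;
		t->len = len;
		t->next = p->tags;		/* cf. m_tag_prepend() */
		p->tags = t;
	}
	memset(t + 1, 0, len);			/* zeroed on every call, as above */
	return t + 1;				/* payload right after the header */
}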
*/ static int ip6_hopopts_input(plenp, rtalertp, mp, offp) u_int32_t *plenp; u_int32_t *rtalertp; /* XXX: should be stored in a smarter way */ struct mbuf **mp; int *offp; { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; u_int8_t *opt; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return(-1); *offp = off; *mp = m; return(0); } /* * Search the header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself processes its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that the hbh header is located right after the IPv6 * header (RFC2460 p7), that opthead is a pointer into the data content of m, * and that the region from opthead to opthead + hbhlen is in contiguous * memory. */ int ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) struct mbuf *m; u_int8_t *opthead; int hbhlen; u_int32_t *rtalertp; u_int32_t *plenp; { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { ip6stat.ip6s_toosmall++; goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_RTALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return(-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return(-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have a non-zero payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return(-1); } /* * We may see jumbolen at an unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected.
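ip6_process_hopopts() above walks the options area as a TLV sequence: Pad1 is a lone type octet, everything else is type, length, then that many value bytes, with bounds checks before the length octet is read. A compact userland sketch of the same walk (walk_options is an illustrative name):

#include <stdint.h>

#define OPT_PAD1 0x00

static int
walk_options(const uint8_t *opt, int len)
{
	int optlen;

	while (len > 0) {
		if (*opt == OPT_PAD1) {
			optlen = 1;		/* single-byte padding */
		} else {
			if (len < 2)
				return -1;	/* truncated TLV */
			optlen = opt[1] + 2;	/* type + length octets + value */
			if (optlen > len)
				return -1;	/* value runs past the header */
		}
		/* a real parser dispatches on *opt here (RTALERT, JUMBO, ...) */
		opt += optlen;
		len -= optlen;
	}
	return 0;
}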
* the behavior may need some debate in ipngwg - * multiple options do not make sense; however, * there's no explicit mention in the specification. */ if (*plenp != 0) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return(-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return(-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { ip6stat.ip6s_toosmall++; goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return(-1); optlen += 2; break; } } return(0); bad: m_freem(m); return(-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header and the option header are not * contiguous, in order to return an ICMPv6 error. */ int ip6_unknown_opt(optp, m, off) u_int8_t *optp; struct mbuf *m; int off; { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return(-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return(-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ ip6stat.ip6s_badoptions++; ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return(-1); } m_freem(m); /* XXX: NOTREACHED */ return(-1); } /* * Create the "control" list for this pcb. * The function will not modify the mbuf chain at all. * * with KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) has already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. */ void ip6_savecontrol(in6p, mp, ip6, m) struct inpcb *in6p; struct mbuf **mp; struct ip6_hdr *ip6; struct mbuf *m; { #if __FreeBSD_version >= 500000 struct thread *td = curthread; /* XXX */ #else struct proc *td = curproc; /* XXX */ #endif int privileged = 0; int rthdr_exist = 0; if (td && !suser(td)) privileged++; #ifdef SO_TIMESTAMP if ((in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) { mp = &(*mp)->m_next; } } #endif /* RFC 2292 sec. 5 */ if ((in6p->in6p_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); if (IN6_IS_SCOPE_LINKLOCAL(&pi6.ipi6_addr)) pi6.ipi6_addr.s6_addr16[1] = 0; pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IPV6_PKTINFO, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((in6p->in6p_flags & IN6P_HOPLIMIT) != 0) { int hlim = ip6->ip6_hlim & 0xff; *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IPV6_HOPLIMIT, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } /* * IPV6_HOPOPTS socket option.
We require super-user privilege * for the option, but it might be too strict, since there might * be some hop-by-hop options which can be returned to a normal user. * See RFC 2292 section 6. */ if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0 && privileged) { /* * Check if a hop-by-hop options header is contained in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through * the IPv6 input processing. */ struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; return; } #endif /* * XXX: We copy the whole header even if a jumbo * payload option is included, though RFC 2292 says * the option is to be removed before returning. * Note: this constraint is removed in 2292bis. */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IPV6_HOPOPTS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } /* IPV6_DSTOPTS and IPV6_RTHDR socket options */ if ((in6p->in6p_flags & (IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { int proto, off, nxt; /* * go through the header chain to see if a routing header is * contained in the packet. We need this information to store * destination options headers (if any) properly. * XXX: performance issue. We should record this info when * processing extension headers in the incoming routine. * (todo) use m_aux? */ proto = IPPROTO_IPV6; off = 0; nxt = -1; while (1) { int newoff; newoff = ip6_nexthdr(m, off, proto, &nxt); if (newoff < 0) break; if (newoff < off) /* invalid, check for safety */ break; if ((proto = nxt) == IPPROTO_ROUTING) { rthdr_exist = 1; break; } off = newoff; } } if ((in6p->in6p_flags & (IN6P_RTHDR | IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and store each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible?
*/ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if ((in6p->in6p_flags & IN6P_DSTOPTS) == 0) break; /* * We also require super-user privilege for * the option. * See the comments on IN6P_HOPOPTS. */ if (!privileged) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IPV6_DSTOPTS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if ((in6p->in6p_flags & IN6P_RTHDR) == 0) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IPV6_RTHDR, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } } #ifdef PULLDOWN_TEST /* * pull a single extension header from the mbuf chain. returns a single * mbuf that contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(m, off, nxt) struct mbuf *m; size_t off; int nxt; { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; MGET(n, M_DONTWAIT, MT_DATA); if (n && elen >= MLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) return NULL; n->m_len = 0; if (elen >= M_TRAILINGSPACE(n)) { m_free(n); return NULL; } m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get a pointer to the previous header followed by the header * currently processed. * XXX: This function supposes that * M includes all headers, * the next header field and the header length field of each header * are valid, and * the sum of each header length equals OFF. * Because of these assumptions, this function must be called very * carefully. Moreover, it will not be used in the near future when * we develop a `neater' mechanism to process extension headers.
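The repeated length computations above, and in ip6_nexthdr()/ip6_get_prevhdr() below, encode the extension-header rules: the length octet counts 8-byte units beyond the first eight bytes, except for AH, where it counts 4-byte units beyond the first two words (RFC 2402). Isolated as a sketch:

#include <stdint.h>
#include <netinet/in.h>

static unsigned int
exthdr_len(int proto, uint8_t len_octet)
{
	if (proto == IPPROTO_AH)			/* RFC 2402: 4-byte units */
		return ((unsigned int)len_octet + 2) << 2;
	return ((unsigned int)len_octet + 1) << 3;	/* RFC 2460: 8-byte units */
}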
*/ char * ip6_get_prevhdr(m, off) struct mbuf *m; int off; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return(&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return(&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(m, off, proto, nxtp) struct mbuf *m; int off; int proto; int *nxtp; { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); if ((ntohs(fh.ip6f_offlg) & IP6F_OFF_MASK) != 0) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } return -1; } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(m, off, proto, nxtp) struct mbuf *m; int off; int proto; int *nxtp; { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } -struct mbuf * +struct ip6aux * ip6_addaux(m) struct mbuf *m; { - struct mbuf *n; - -#ifdef DIAGNOSTIC - if (sizeof(struct ip6aux) > MHLEN) - panic("assumption failed on sizeof(ip6aux)"); -#endif - n = m_aux_find(m, AF_INET6, -1); - if (n) { - if (n->m_len < sizeof(struct ip6aux)) { - printf("conflicting use of ip6aux"); - return NULL; - } - } else { - n = m_aux_add(m, AF_INET6, -1); - n->m_len = sizeof(struct ip6aux); - bzero(mtod(n, caddr_t), n->m_len); + struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); + if (!tag) { + tag = m_tag_get(PACKET_TAG_IPV6_INPUT, + sizeof (struct ip6aux), + M_NOWAIT); + if (tag) + m_tag_prepend(m, tag); } - return n; + if (tag) + bzero(tag+1, sizeof (struct ip6aux)); + return tag ? 
(struct ip6aux*)(tag+1) : NULL; } -struct mbuf * +struct ip6aux * ip6_findaux(m) struct mbuf *m; { - struct mbuf *n; - - n = m_aux_find(m, AF_INET6, -1); - if (n && n->m_len < sizeof(struct ip6aux)) { - printf("conflicting use of ip6aux"); - n = NULL; - } - return n; + struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); + return tag ? (struct ip6aux*)(tag+1) : NULL; } void ip6_delaux(m) struct mbuf *m; { - struct mbuf *n; - - n = m_aux_find(m, AF_INET6, -1); - if (n) - m_aux_delete(m, n); + struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); + if (tag) + m_tag_delete(m, tag); } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; Index: head/sys/netinet6/ip6_mroute.c =================================================================== --- head/sys/netinet6/ip6_mroute.c (revision 105193) +++ head/sys/netinet6/ip6_mroute.c (revision 105194) @@ -1,1814 +1,1814 @@ /* $FreeBSD$ */ /* $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenenr, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MRTABLE, "mf6c", "multicast forwarding cache entry"); #define M_HASCL(m) ((m)->m_flags & M_EXT) static int ip6_mdq __P((struct mbuf *, struct ifnet *, struct mf6c *)); static void phyint_send __P((struct ip6_hdr *, struct mif6 *, struct mbuf *)); static int set_pim6 __P((int *)); static int socket_send __P((struct socket *, struct mbuf *, struct sockaddr_in6 *)); static int register_send __P((struct ip6_hdr *, struct mif6 *, struct mbuf *)); /* * Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static, * except for netstat or debugging purposes. */ struct socket *ip6_mrouter = NULL; int ip6_mrouter_ver = 0; int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */ struct mrt6stat mrt6stat; #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 struct mf6c *mf6ctable[MF6CTBLSIZ]; u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; #ifdef MRT6DEBUG u_int mrt6debug = 0; /* debug level */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #endif static void expire_upcalls __P((void *)); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ #ifdef INET #ifdef MROUTING extern struct socket *ip_mrouter; #endif #endif /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. */ struct ifnet multicast_register_if; #define ENCAP_HOPS 64 /* * Private variables. */ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; static int pim6; /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. * Quality of service parameter to be added in the future!!! 
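MF6CHASH above, used by the MF6CFIND lookup that follows, folds the origin and group addresses by XORing their eight 32-bit words and reducing the result to a table index. An illustrative userland rendering (memcpy replaces the non-portable s6_addr32 accessor, and the reduction assumes MF6CHASHMOD is a plain reduction by MF6CTBLSIZ):

#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

#define MF6CTBLSIZ 256	/* table size taken from the kernel header */

static unsigned int
mf6c_hash(const struct in6_addr *o, const struct in6_addr *g)
{
	uint32_t w[8], h = 0;
	int i;

	memcpy(&w[0], o, sizeof(*o));	/* four words of the origin */
	memcpy(&w[4], g, sizeof(*g));	/* four words of the group */
	for (i = 0; i < 8; i++)
		h ^= w[i];
	return h % MF6CTBLSIZ;
}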
*/ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ mrt6stat.mrt6s_mfc_lookups++; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ mrt6stat.mrt6s_mfc_misses++; \ } \ } while (0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* fall through */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (0) #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int get_sg_cnt __P((struct sioc_sg_req6 *)); static int get_mif6_cnt __P((struct sioc_mif_req6 *)); static int ip6_mrouter_init __P((struct socket *, struct mbuf *, int)); static int add_m6if __P((struct mif6ctl *)); static int del_m6if __P((mifi_t *)); static int add_m6fc __P((struct mf6cctl *)); static int del_m6fc __P((struct mf6cctl *)); static struct callout expire_upcalls_ch; /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ int ip6_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; struct mbuf *m; if (so != ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EACCES); if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ return (error); if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ return (error); switch (sopt->sopt_name) { case MRT6_INIT: #ifdef MRT6_OINIT case MRT6_OINIT: #endif error = ip6_mrouter_init(so, m, sopt->sopt_name); break; case MRT6_DONE: error = ip6_mrouter_done(); break; case MRT6_ADD_MIF: error = add_m6if(mtod(m, struct mif6ctl *)); break; case MRT6_DEL_MIF: error = del_m6if(mtod(m, mifi_t *)); break; case MRT6_ADD_MFC: error = add_m6fc(mtod(m, struct mf6cctl *)); break; case MRT6_DEL_MFC: error = del_m6fc(mtod(m, struct mf6cctl *)); break; case MRT6_PIM: error = set_pim6(mtod(m, int *)); break; default: error = EOPNOTSUPP; break; } (void)m_freem(m); return(error); } /* * Handle MRT getsockopt commands */ int ip6_mrouter_get(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; if (so != ip6_mrouter) return EACCES; switch (sopt->sopt_name) { case MRT6_PIM: error = sooptcopyout(sopt, &pim6, sizeof(pim6)); break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int mrt6_ioctl(cmd, data) int cmd; caddr_t data; { int error = 0; switch (cmd) { case SIOCGETSGCNT_IN6: return(get_sg_cnt((struct sioc_sg_req6 *)data)); break; /* for safety */ case SIOCGETMIFCNT_IN6: return(get_mif6_cnt((struct sioc_mif_req6 *)data)); break; /* for safety */ default: return (EINVAL); break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(req) struct sioc_sg_req6 *req; { struct mf6c *rt; int s; s = splnet(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); splx(s); if (rt != NULL) { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; req->wrong_if = rt->mf6c_wrong_if; } else return(ESRCH); #if 0 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 
#endif return 0; } /* * returns the input and output packet and byte counts on the mif provided */ static int get_mif6_cnt(req) struct sioc_mif_req6 *req; { mifi_t mifi = req->mifi; if (mifi >= nummifs) return EINVAL; req->icount = mif6table[mifi].m6_pkt_in; req->ocount = mif6table[mifi].m6_pkt_out; req->ibytes = mif6table[mifi].m6_bytes_in; req->obytes = mif6table[mifi].m6_bytes_out; return 0; } static int set_pim6(i) int *i; { if ((*i != 1) && (*i != 0)) return EINVAL; pim6 = *i; return 0; } /* * Enable multicast routing */ static int ip6_mrouter_init(so, m, cmd) struct socket *so; struct mbuf *m; int cmd; { int *v; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); #endif if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_ICMPV6) return EOPNOTSUPP; if (!m || (m->m_len != sizeof(int *))) return ENOPROTOOPT; v = mtod(m, int *); if (*v != 1) return ENOPROTOOPT; if (ip6_mrouter != NULL) return EADDRINUSE; ip6_mrouter = so; ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); bzero((caddr_t)n6expire, sizeof(n6expire)); pim6 = 0; /* used for stubbing out/in pim stuff */ callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init\n"); #endif return 0; } /* * Disable multicast routing */ int ip6_mrouter_done() { mifi_t mifi; int i; struct ifnet *ifp; struct in6_ifreq ifr; struct mf6c *rt; struct rtdetq *rte; int s; s = splnet(); /* * For each phyint in use, disable promiscuous reception of all IPv6 * multicasts. */ #ifdef INET #ifdef MROUTING /* * If an IPv4 multicast routing daemon is still running, * we keep the interfaces receiving all multicast packets. * XXX: there may be an interface in which the IPv4 multicast * daemon is not interested... */ if (!ip_mrouter) #endif #endif { for (mifi = 0; mifi < nummifs; mifi++) { if (mif6table[mifi].m6_ifp && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { ifr.ifr_addr.sin6_family = AF_INET6; ifr.ifr_addr.sin6_addr = in6addr_any; ifp = mif6table[mifi].m6_ifp; (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); } } } #ifdef notyet bzero((caddr_t)qtable, sizeof(qtable)); bzero((caddr_t)tbftable, sizeof(tbftable)); #endif bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); /* * Free all multicast forwarding cache entries. */ for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; while (rt) { struct mf6c *frt; for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_free(rte->m); free(rte, M_MRTABLE); rte = n; } frt = rt; rt = rt->mf6c_next; free(frt, M_MRTABLE); } } bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); /* * Reset de-encapsulation cache */ reg_mif_num = -1; ip6_mrouter = NULL; ip6_mrouter_ver = 0; splx(s); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_done\n"); #endif return 0; } static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; /* * Add a mif to the mif table */ static int add_m6if(mifcp) struct mif6ctl *mifcp; { struct mif6 *mifp; struct ifnet *ifp; int error, s; #ifdef notyet struct tbf *m_tbf = tbftable + mifcp->mif6c_mifi; #endif if (mifcp->mif6c_mifi >= MAXMIFS) return EINVAL; mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp) return EADDRINUSE; /* XXX: is it appropriate?
*/ if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) return ENXIO; ifp = ifnet_byindex(mifcp->mif6c_pifi); if (mifcp->mif6c_flags & MIFF_REGISTER) { if (reg_mif_num == (mifi_t)-1) { multicast_register_if.if_name = "register_mif"; multicast_register_if.if_flags |= IFF_LOOPBACK; multicast_register_if.if_index = mifcp->mif6c_mifi; reg_mif_num = mifcp->mif6c_mifi; } ifp = &multicast_register_if; } /* if REGISTER */ else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return EOPNOTSUPP; s = splnet(); error = if_allmulti(ifp, 1); splx(s); if (error) return error; } s = splnet(); mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; #ifdef notyet /* scaling up here allows division by 1024 in critical code */ mifp->m6_rate_limit = mifcp->mif6c_rate_limit * 1024 / 1000; #endif /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; splx(s); /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "add_mif #%d, phyint %s%d\n", mifcp->mif6c_mifi, ifp->if_name, ifp->if_unit); #endif return 0; } /* * Delete a mif from the mif table */ static int del_m6if(mifip) mifi_t *mifip; { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; int s; if (*mifip >= nummifs) return EINVAL; if (mifp->m6_ifp == NULL) return EINVAL; s = splnet(); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* * XXX: what if there is yet IPv4 multicast daemon * using the interface? */ ifp = mifp->m6_ifp; if_allmulti(ifp, 0); } #ifdef notyet bzero((caddr_t)qtable[*mifip], sizeof(qtable[*mifip])); bzero((caddr_t)mifp->m6_tbf, sizeof(*(mifp->m6_tbf))); #endif bzero((caddr_t)mifp, sizeof (*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; splx(s); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs); #endif return 0; } /* * Add an mfc entry */ static int add_m6fc(mfccp) struct mf6cctl *mfccp; { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; int s; MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc no upcall h %d o %s g %s p %x\n", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif s = splnet(); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; splx(s); return 0; } /* * Find the entry for which the upcall was made and update */ s = splnet(); hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc o %s g %s p %x dbg %x\n", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, 
rt->mf6c_stall); #endif rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip6_mdq(rte->m, rte->ifp, rt); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc no upcall h %d o %s g %s p %x\n", hash, ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } splx(s); return 0; } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(t) struct timeval *t; { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(mfccp) struct mf6cctl *mfccp; { struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; int s; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n", ip6_sprintf(&origin.sin6_addr), ip6_sprintf(&mcastgrp.sin6_addr)); #endif s = splnet(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { splx(s); return EADDRNOTAVAIL; } *nptr = rt->mf6c_next; free(rt, M_MRTABLE); splx(s); return 0; } static int socket_send(s, mm, src) struct socket *s; struct mbuf *mm; struct sockaddr_in6 *src; { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return 0; 
} } m_freem(mm); return -1; } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ int ip6_mforward(ip6, ifp, m) struct ip6_hdr *ip6; struct ifnet *ifp; struct mbuf *m; { struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; int s; mifi_t mifi; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ifp->if_index); #endif /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return 0; ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. * This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { ip6stat.ip6s_cantforward++; if (ip6_log_time + ip6_log_interval < time_second) { ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } return 0; } /* * Determine forwarding mifs from the forwarding cache table */ s = splnet(); MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); /* Entry exists, so forward if necessary */ if (rt) { splx(s); return (ip6_mdq(m, ifp, rt)); } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & * send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; /* int i, npkts;*/ #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif /* UPCALL_TIMING */ mrt6stat.mrt6s_no_route++; #ifdef MRT6DEBUG if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst)); #endif /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. */ rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { splx(s); return ENOBUFS; } mb0 = m_copy(m, 0, M_COPYALL); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && (M_HASCL(mb0) || mb0->m_len < sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE); splx(s); return ENOBUFS; } /* is there an upcall waiting for this packet? 
*/ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE); m_freem(mb0); splx(s); return ENOBUFS; } /* * Make a copy of the header to send to the user * level process */ mm = m_copy(mb0, 0, sizeof(struct ip6_hdr)); if (mm == NULL) { free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im = NULL; #ifdef MRT6_OINIT oim = NULL; #endif switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return EINVAL; } #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "getting the iif info in the kernel\n"); #endif for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; break; #endif case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); mrt6stat.mrt6s_upq_sockfull++; free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } mrt6stat.mrt6s_upcalls++; /* insert new entry at head of hash chain */ bzero(rt, sizeof(*rt)); rt->mf6c_origin.sin6_family = AF_INET6; rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_origin.sin6_addr = ip6->ip6_src; rt->mf6c_mcastgrp.sin6_family = AF_INET6; rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) if (++npkts > MAX_UPQ6) { mrt6stat.mrt6s_upq_ovflw++; free(rte, M_MRTABLE); m_freem(mb0); splx(s); return 0; } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif /* UPCALL_TIMING */ splx(s); return 0; } } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. 
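The expiry sweep that follows decrements a countdown on every unresolved entry and uses the per-bucket n6expire counters to skip buckets with nothing pending. A schematic sketch of that bookkeeping, with illustrative types (the real code also frees the queued packets and the entry itself):

struct entry {
	struct entry	*next;
	int		 pending;	/* has queued packets awaiting an upcall */
	int		 expire;	/* countdown in sweep ticks */
};

static void
sweep(struct entry **bucket, int *npending)
{
	struct entry **np, *e;

	if (*npending == 0)
		return;				/* nothing to expire in this bucket */
	for (np = bucket; (e = *np) != NULL; ) {
		if (e->pending && e->expire != 0 && --e->expire == 0) {
			*np = e->next;		/* unlink the dead entry */
			(*npending)--;
			/* free queued packets and the entry here */
		} else
			np = &e->next;
	}
}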
*/ static void expire_upcalls(unused) void *unused; { struct rtdetq *rte; struct mf6c *mfc, **nptr; int i; int s; s = splnet(); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_EXPIRE) log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n", ip6_sprintf(&mfc->mf6c_origin.sin6_addr), ip6_sprintf(&mfc->mf6c_mcastgrp.sin6_addr)); #endif /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } while (rte != NULL); mrt6stat.mrt6s_cache_cleanups++; n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mf6c_next; } } } splx(s); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(m, ifp, rt) struct mbuf *m; struct ifnet *ifp; struct mf6c *rt; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (0) /* * Don't forward if it didn't arrive from the parent mif * for its origin. */ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifid %d mifi %d mififid %x\n", ifp->if_index, mifi, mif6table[mifi].m6_ifp->if_index); #endif mrt6stat.mrt6s_wrong_if++; rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) if (pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... 
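The wrong-interface branch above is an RPF-style check: the cache entry records the expected upstream (parent) mif, and a packet arriving anywhere else is only counted, and optionally reported to the PIM daemon as a WRONGMIF upcall, never forwarded. A hedged sketch:

struct mif  { int ifindex; };
struct mfc  { unsigned int parent; unsigned long wrong_if; };

static int
rpf_ok(struct mfc *rt, const struct mif *miftable, unsigned int nmifs,
    int rcv_ifindex)
{
	if (rt->parent >= nmifs ||
	    miftable[rt->parent].ifindex != rcv_ifindex) {
		rt->wrong_if++;		/* cf. mrt6s_wrong_if */
		return 0;		/* caller may generate a WRONGMIF upcall */
	}
	return 1;			/* arrived on the parent mif: forward */
}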
*/ static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mbuf *mm; struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif mm = m_copy(m, 0, sizeof(struct ip6_hdr)); if (mm && (M_HASCL(mm) || mm->m_len < sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return ENOBUFS; #ifdef MRT6_OINIT oim = NULL; #endif im = NULL; switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return EINVAL; } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; #endif case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; } /* if socket Q full */ } /* if PIM */ return 0; } /* if wrong iif */ /* If I sourced this packet, it counts as output, else it was input. */ if (m->m_pkthdr.rcvif == NULL) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break * a scope boundary. * XXX For packets through the PIM register tunnel * interface, we trust the routing daemon. */ if ((mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) == 0 && (mif6table[mifi].m6_flags & MIFF_REGISTER) == 0 && (in6_addr2scopeid(ifp, &ip6->ip6_dst) != in6_addr2scopeid(mif6table[mifi].m6_ifp, &ip6->ip6_dst) || in6_addr2scopeid(ifp, &ip6->ip6_src) != in6_addr2scopeid(mif6table[mifi].m6_ifp, &ip6->ip6_src))) { ip6stat.ip6s_badscope++; continue; } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } return 0; } static void phyint_send(ip6, mifp, m) struct ip6_hdr *ip6; struct mif6 *mifp; struct mbuf *m; { struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; int s = splnet(); /* needs to protect static "ro" below. */ static struct route_in6 ro; struct in6_multi *in6m; struct sockaddr_in6 *dst6; /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. */ mb_copy = m_copy(m, 0, M_COPYALL); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { splx(s); return; } /* set the MCAST flag on the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may divide * the packet into fragments when the packet is too big for the * outgoing interface. * Otherwise, we can simply send the packet to the interface * sending queue.
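The dispatch choice described above, in sketch form: locally sourced packets go through ip6_output(), which may fragment them, while forwarded multicast is queued directly on the interface only when it fits, since an IPv6 router must not fragment in flight. The kernel condition below also queues when if_mtu < IPV6_MMTU; the sketch omits that quirk:

#include <stdbool.h>
#include <stddef.h>

enum verdict { USE_OUTPUT_PATH, QUEUE_ON_IFACE, TOO_BIG };

static enum verdict
mcast_xmit(bool locally_sourced, size_t pktlen, size_t link_mtu)
{
	if (locally_sourced)
		return USE_OUTPUT_PATH;	/* ip6_output() may fragment */
	if (pktlen <= link_mtu)
		return QUEUE_ON_IFACE;	/* (*ifp->if_output)() directly */
	return TOO_BIG;			/* ICMPv6 too-big, or just discard */
}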
*/ if (m->m_pkthdr.rcvif == NULL) { struct ip6_moptions im6o; im6o.im6o_multicast_ifp = ifp; /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, &ro, - IPV6_FORWARDING, &im6o, NULL); + IPV6_FORWARDING, &im6o, NULL, NULL); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif splx(s); return; } /* * If we belong to the destination multicast group * on the outgoing interface, loop back a copy. */ dst6 = (struct sockaddr_in6 *)&ro.ro_dst; IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); if (in6m != NULL) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; ip6_mloopback(ifp, m, (struct sockaddr_in6 *)&ro.ro_dst); } /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ if (mb_copy->m_pkthdr.len < ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; /* * We just call if_output instead of nd6_output here, since * we need no ND for a multicast forwarded packet...right? */ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, NULL); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif } else { #ifdef MULTICAST_PMTUD icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); #else #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send: packet too big on %s o %s g %s" " size %d(discarded)\n", if_name(ifp), ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), mb_copy->m_pkthdr.len); #endif /* MRT6DEBUG */ m_freem(mb_copy); /* simply discard the packet */ #endif } splx(s); } static int register_send(ip6, mif, m) struct ip6_hdr *ip6; struct mif6 *mif; struct mbuf *m; { struct mbuf *mm; int i, len = m->m_pkthdr.len; static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mrt6msg *im6; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst)); #endif ++pim6stat.pim6s_snd_registers; /* Make a copy of the packet to send to the user level process */ MGETHDR(mm, M_DONTWAIT, MT_HEADER); if (mm == NULL) return ENOBUFS; mm->m_pkthdr.rcvif = NULL; mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); if ((mm->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { m_freem(mm); return ENOBUFS; } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return ENOBUFS; /* TODO: check it! */ mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. encap.n */ mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_WARNING, "register_send: ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; } return 0; } /* * PIM sparse mode hook * Receives the pim control messages, and passes them up to the listening * socket, using rip6_input. * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. 
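pim6_input() below verifies the PIM checksum with in6_cksum(), which also folds in the IPv6 pseudo-header; for REGISTER messages only the 8-byte PIM header is covered, because the encapsulated data packet is checksummed end-to-end by its final recipients. A sketch of the coverage rule plus a plain Internet checksum, without the pseudo-header (the value used for PIM_REGISTER here is illustrative):

#include <stdint.h>
#include <stddef.h>

#define PIM_MINLEN	8
#define PIM_REGISTER	1	/* message type, illustrative value */

static uint16_t
ones_complement_sum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {		/* sum 16-bit words, network order */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static size_t
pim_cksum_len(int pim_type, size_t pimlen)
{
	/* REGISTER: exclude the encapsulated data packet */
	return pim_type == PIM_REGISTER ? PIM_MINLEN : pimlen;
}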
*/ int pim6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; struct mbuf *m = *mp; int minlen; int off = *offp; ++pim6stat.pim6s_rcv_total; ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG,"pim6_input: PIM packet too short\n"); #endif m_freem(m); return(IPPROTO_DONE); } /* * if the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers are in contiguous memory, * and possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { pim6stat.pim6s_rcv_tooshort++; return IPPROTO_DONE; } #endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { ++pim6stat.pim6s_rcv_badsum; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: invalid checksum\n"); #endif m_freem(m); return(IPPROTO_DONE); } } #endif /* PIM6_CHECKSUM */ /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { ++pim6stat.pim6s_rcv_badversion; #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: incorrect version %d, expecting %d\n", pim->pim_ver, PIM_VERSION); #endif m_freem(m); return(IPPROTO_DONE); } if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon.
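 *
 * That copy (mcp below, off + PIM6_REG_MINLEN bytes) covers, with
 * "off" being the offset of the PIM header:
 *
 *	 0            off       off+4    off+8           off+48
 *	 | outer ip6  | pim hdr | reghdr | encap ip6 hdr |
 *
 * while m_adj(m, off + PIM_MINLEN) leaves m pointing at the
 * encapsulated (inner) IPv6 packet for forwarding.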
*/ static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; int rc; ++pim6stat.pim6s_rcv_registers; if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: register mif not set: %d\n", reg_mif_num); #endif m_freem(m); return(IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: register packet size too " "small %d from %s\n", pimlen, ip6_sprintf(&ip6->ip6_src)); #endif m_freem(m); return(IPPROTO_DONE); } eip6 = (struct ip6_hdr *) (reghdr + 1); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input[register], eip6: %s -> %s, " "eip6 plen %d\n", ip6_sprintf(&eip6->ip6_src), ip6_sprintf(&eip6->ip6_dst), ntohs(eip6->ip6_plen)); #endif /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG log(LOG_DEBUG, "pim6_input: invalid IP version (%d) " "of the inner packet\n", (eip6->ip6_vfc & IPV6_VERSION)); #endif m_freem(m); return(IPPROTO_NONE); } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: inner packet of register " "is not multicast %s\n", ip6_sprintf(&eip6->ip6_dst)); #endif m_freem(m); return(IPPROTO_DONE); } /* * make a copy of the whole header to pass to the daemon later. */ mcp = m_copy(m, 0, off + PIM6_REG_MINLEN); if (mcp == NULL) { #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: pim register: " "could not copy register head\n"); #endif m_freem(m); return(IPPROTO_DONE); } /* * forward the inner ip6 packet; point m_data at the inner ip6. */ m_adj(m, off + PIM_MINLEN); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) { log(LOG_DEBUG, "pim6_input: forwarding decapsulated register: " "src %s, dst %s, mif %d\n", ip6_sprintf(&eip6->ip6_src), ip6_sprintf(&eip6->ip6_dst), reg_mif_num); } #endif rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m, dst.sin6_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. */ pim6_input_to_daemon: rip6_input(&m, offp, proto); return(IPPROTO_DONE); } Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 105193) +++ head/sys/netinet6/ip6_output.c (revision 105194) @@ -1,2558 +1,2556 @@ /* $FreeBSD$ */ /* $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pfil_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PFIL_HOOKS #include #endif #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /* IPSEC */ #include #include #include static MALLOC_DEFINE(M_IPMOPTS, "ip6_moptions", "internet multicast options"); struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt)); static int ip6_setmoptions __P((int, struct ip6_moptions **, struct mbuf *)); static int ip6_getmoptions __P((int, struct ip6_moptions *, struct mbuf **)); static int ip6_copyexthdr __P((struct mbuf **, caddr_t, int)); static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int, struct ip6_frag **)); static int ip6_insert_jumboopt __P((struct ip6_exthdrs *, u_int32_t)); static int ip6_splithdr __P((struct mbuf *, struct ip6_exthdrs *)); /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * * type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_rmx.rmx_mtu. */ int -ip6_output(m0, opt, ro, flags, im6o, ifpp) +ip6_output(m0, opt, ro, flags, im6o, ifpp, inp) struct mbuf *m0; struct ip6_pktopts *opt; struct route_in6 *ro; int flags; struct ip6_moptions *im6o; struct ifnet **ifpp; /* XXX: just for statistics */ + struct inpcb *inp; { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; int hlen, tlen, len, off; struct route_in6 ip6route; struct sockaddr_in6 *dst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m1; int rv; #endif /* PFIL_HOOKS */ #ifdef IPSEC int needipsectun = 0; - struct socket *so; struct secpolicy *sp = NULL; + struct socket *so = inp ? inp->inp_socket : NULL; - /* for AH processing. stupid to have "socket" variable in IP layer... 
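 * (With this revision the pcb instead arrives as the explicit "inp"
 *  argument. A hypothetical caller that previously tagged the mbuf,
 *
 *	ipsec_setsocket(m, so);
 *	error = ip6_output(m, opt, &ro, flags, im6o, &ifp);
 *
 *  would now pass the pcb directly:
 *
 *	error = ip6_output(m, opt, &ro, flags, im6o, &ifp,
 *	    so ? sotoinpcb(so) : NULL);
 *  )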
*/ - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); ip6 = mtod(m, struct ip6_hdr *); #endif /* IPSEC */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (0) bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* get a security policy for this packet */ if (so == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 0, &error); else sp = ipsec6_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { ipsec6stat.out_inval++; goto freehdrs; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsec6stat.out_polvio++; goto freehdrs; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ needipsec = 0; break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto freehdrs; } needipsec = 1; break; case IPSEC_POLICY_ENTRUST: default: printf("ip6_output: Invalid policy found. %d\n", sp->policy); } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here. do that later. */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If we need IPsec, or there is at least one extension header, * separate IP6 header from the payload. */ if ((needipsec || optlen) && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ { u_char *nexthdrp = &ip6->ip6_nxt; struct mbuf *mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. 
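 *
 * To illustrate, once the MAKE_CHAIN() calls below finish, a packet
 * carrying every option is chained as
 *
 *	IPv6 -> hbh -> dest1 -> rthdr -> dest2 -> payload
 *	 (m)                    (mprev)
 *
 * so a transport-mode AH/ESP header inserted after *mprev lands
 * between rthdr and dest2, as intended.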
*/ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (0) /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); #ifdef IPSEC if (!needipsec) goto skip_ipsec2; /* * pointers after IPsec headers are not valid any more. * other pointers need a great care too. * (IPsec routines should not mangle mbufs prior to AH/ESP) */ exthdrs.ip6e_dest2 = NULL; { struct ip6_rthdr *rh = NULL; int segleft_org = 0; struct ipsec_output_state state; if (exthdrs.ip6e_rthdr) { rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *); segleft_org = rh->ip6r_segleft; rh->ip6r_segleft = 0; } bzero(&state, sizeof(state)); state.m = m; error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags, &needipsectun); m = state.m; if (error) { /* mbuf is already reclaimed in ipsec6_output_trans. */ m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* fall through */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } if (exthdrs.ip6e_rthdr) { /* ah6_output doesn't modify mbuf chain */ rh->ip6r_segleft = segleft_org; } } skip_ipsec2:; #endif } /* * If there is a routing header, replace destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { struct ip6_rthdr *rh = (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *)); struct ip6_rthdr0 *rh0; finaldst = ip6->ip6_dst; switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; ip6->ip6_dst = rh0->ip6r0_addr[0]; bcopy((caddr_t)&rh0->ip6r0_addr[1], (caddr_t)&rh0->ip6r0_addr[0], sizeof(struct in6_addr)*(rh0->ip6r0_segleft - 1) ); rh0->ip6r0_addr[rh0->ip6r0_segleft - 1] = finaldst; break; default: /* is it possible? */ error = EINVAL; goto bad; } } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_DADOUTPUT) == 0) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; } ip6stat.ip6s_localout++; /* * Route packet. */ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. 
*/ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6->ip6_dst))) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if (ro->ro_rt == 0) { bzero(dst, sizeof(*dst)); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = ip6->ip6_dst; #ifdef SCOPEDROUTING /* XXX: sin6_scope_id should already be fixed at this point */ if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) dst->sin6_scope_id = ntohs(dst->sin6_addr.s6_addr16[1]); #endif } #ifdef IPSEC if (needipsec && needipsectun) { struct ipsec_output_state state; /* * All the extension headers will become inaccessible * (since they can be encrypted). * Don't panic, we need no more updates to extension headers * on inner IPv6 packet (since they are now encapsulated). * * IPv6 [ESP|AH] IPv6 [extension headers] payload */ bzero(&exthdrs, sizeof(exthdrs)); exthdrs.ip6e_ip6 = m; bzero(&state, sizeof(state)); state.m = m; state.ro = (struct route *)ro; state.dst = (struct sockaddr *)dst; error = ipsec6_output_tunnel(&state, sp, flags); m = state.m; ro = (struct route_in6 *)state.ro; dst = (struct sockaddr_in6 *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ m0 = m = NULL; m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* fall through */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } exthdrs.ip6e_ip6 = m; } #endif /* IPSEC */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* Unicast */ #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) /* xxx * interface selection comes here * if an interface is specified from an upper layer, * ifp must point it. */ if (ro->ro_rt == 0) { /* * non-bsdi always clone routes, if parent is * PRF_CLONING. */ rtalloc((struct route *)ro); } if (ro->ro_rt == 0) { ip6stat.ip6s_noroute++; error = EHOSTUNREACH; /* XXX in6_ifstat_inc(ifp, ifs6_out_discard); */ goto bad; } ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway; m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ in6_ifstat_inc(ifp, ifs6_out_request); /* * Check if the outgoing interface conflicts with * the interface specified by ifi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (opt && opt->ip6po_pktinfo && opt->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opt->ip6po_pktinfo->ipi6_ifindex) { ip6stat.ip6s_noroute++; in6_ifstat_inc(ifp, ifs6_out_discard); error = EHOSTUNREACH; goto bad; } } if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; } else { /* Multicast */ struct in6_multi *in6m; m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; /* * See if the caller provided any multicast options */ ifp = NULL; if (im6o != NULL) { ip6->ip6_hlim = im6o->im6o_multicast_hlim; if (im6o->im6o_multicast_ifp != NULL) ifp = im6o->im6o_multicast_ifp; } else ip6->ip6_hlim = ip6_defmcasthlim; /* * See if the caller provided the outgoing interface * as an ancillary data. * Boundary check for ifindex is assumed to be already done. 
*/ if (opt && opt->ip6po_pktinfo && opt->ip6po_pktinfo->ipi6_ifindex) ifp = ifnet_byindex(opt->ip6po_pktinfo->ipi6_ifindex); /* * If the destination is a node-local scope multicast, * the packet should be loop-backed only. */ if (IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst)) { /* * If the outgoing interface is already specified, * it should be a loopback interface. */ if (ifp && (ifp->if_flags & IFF_LOOPBACK) == 0) { ip6stat.ip6s_badscope++; error = ENETUNREACH; /* XXX: better error? */ /* XXX correct ifp? */ in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } else { ifp = &loif[0]; } } if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; /* * If the caller did not provide an interface, look up a * default in the routing table. This is either a * default for the specified group (i.e. a host * route), or a multicast default (a route for the * ``net'' ff00::/8). */ if (ifp == NULL) { if (ro->ro_rt == 0) { ro->ro_rt = rtalloc1((struct sockaddr *) &ro->ro_dst, 0, 0UL); } if (ro->ro_rt == 0) { ip6stat.ip6s_noroute++; error = EHOSTUNREACH; /* XXX in6_ifstat_inc(ifp, ifs6_out_discard) */ goto bad; } ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; } if ((flags & IPV6_FORWARDING) == 0) in6_ifstat_inc(ifp, ifs6_out_request); in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { ip6stat.ip6s_noroute++; in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); if (in6m != NULL && (im6o == NULL || im6o->im6o_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip6_mloopback(ifp, m, dst); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK)) { m_freem(m); goto done; } } /* * Fill the outgoing interface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* * Determine path MTU. */ if (ro_pmtu != ro) { /* The first hop and the final destination may differ.
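 * For example, with a type 0 routing header the packet is routed to
 * its first intermediate hop (ro, pointed at opt->ip6po_route above),
 * while path MTU discovery must track the final destination
 * (ro_pmtu, keyed on finaldst). Without a routing header the two
 * routes are one and the same.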
*/ struct sockaddr_in6 *sin6_fin = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (ro_pmtu->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&sin6_fin->sin6_addr, &finaldst))) { RTFREE(ro_pmtu->ro_rt); ro_pmtu->ro_rt = (struct rtentry *)0; } if (ro_pmtu->ro_rt == 0) { bzero(sin6_fin, sizeof(*sin6_fin)); sin6_fin->sin6_family = AF_INET6; sin6_fin->sin6_len = sizeof(struct sockaddr_in6); sin6_fin->sin6_addr = finaldst; rtalloc((struct route *)ro_pmtu); } } if (ro_pmtu->ro_rt != NULL) { u_int32_t ifmtu = nd_ifinfo[ifp->if_index].linkmtu; mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; if (mtu > ifmtu || mtu == 0) { /* * The MTU on the route is larger than the MTU on * the interface! This shouldn't happen, unless the * MTU of the interface has been changed after the * interface was brought up. Change the MTU in the * route to match the interface MTU (as long as the * field isn't locked). * * if MTU on the route is 0, we need to fix the MTU. * this case happens with path MTU discovery timeouts. */ mtu = ifmtu; if ((ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0) ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ } } else { mtu = nd_ifinfo[ifp->if_index].linkmtu; } /* * advanced API (IPV6_USE_MIN_MTU) overrides mtu setting */ if ((flags & IPV6_MINMTU) != 0 && mtu > IPV6_MMTU) mtu = IPV6_MMTU; /* Fake scoped addresses */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* * If source or destination address is a scoped address, and * the packet is going to be sent to a loopback interface, * we should keep the original interface. */ /* * XXX: this is a very experimental and temporary solution. * We eventually have sockaddr_in6 and use the sin6_scope_id * field of the structure here. * We rely on the consistency between two scope zone ids * of source and destination, which should already be assured. * Larger scopes than link will be supported in the future. */ origifp = NULL; if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) origifp = ifnet_byindex(ntohs(ip6->ip6_src.s6_addr16[1])); else if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) origifp = ifnet_byindex(ntohs(ip6->ip6_dst.s6_addr16[1])); /* * XXX: origifp can be NULL even in those two cases above. * For example, if we remove the (only) link-local address * from the loopback interface, and try to send a link-local * address without link-id information. Then the source * address is ::1, and the destination address is the * link-local address with its s6_addr16[1] being zero. * What is worse, if the packet goes to the loopback interface * by a default rejected route, the null pointer would be * passed to looutput, and the kernel would hang. * The following last resort would prevent such disaster. */ if (origifp == NULL) origifp = ifp; } else origifp = ifp; #ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); #endif /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; m->m_pkthdr.rcvif = NULL; /* XXX */ /* If ipfw says divert, we have to just drop packet */ if ((*ip6_fw_chk_ptr)(&ip6, ifp, &port, &m)) { m_freem(m); goto done; } if (!m) { error = EACCES; goto done; } } /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) 
*/ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy1; /* XXX unused */ u_int32_t dummy2; /* XXX unused */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not continuous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * continuous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy1, &dummy2) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } #ifdef PFIL_HOOKS /* * Run through list of hooks for output packets. */ m1 = m; pfh = pfil_hook_get(PFIL_OUT, &inet6sw[ip6_protox[IPPROTO_IPV6]].pr_pfh); for (; pfh; pfh = pfh->pfil_link.tqe_next) if (pfh->pfil_func) { rv = pfh->pfil_func(ip6, sizeof(*ip6), ifp, 1, &m1); if (rv) { error = EHOSTUNREACH; goto done; } m = m1; if (m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); } #endif /* PFIL_HOOKS */ /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. */ tlen = m->m_pkthdr.len; if (tlen <= mtu #ifdef notyet /* * On any link that cannot convey a 1280-octet packet in one piece, * link-specific fragmentation and reassembly must be provided at * a layer below IPv6. [RFC 2460, sec.5] * Thus if the interface has ability of link-level fragmentation, * we can just send the packet even if the packet size is * larger than the link's MTU. * XXX: IFF_FRAGMENTABLE (or such) flag has not been defined yet... */ || ifp->if_flags & IFF_FRAGMENTABLE #endif ) { /* Record statistics for this interface address. */ if (ia && !(flags & IPV6_FORWARDING)) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; } else if (mtu < IPV6_MMTU) { /* * note that path MTU is never less than IPV6_MMTU * (see icmp6_input). */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { struct mbuf **mnext, *m_frgpart; struct ip6_frag *ip6f; u_int32_t id = htonl(ip6_id++); u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } mnext = &m->m_nextpkt; /* * Change the next header field of the last header in the * unfragmentable part. 
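 *
 * Each fragment built in the loop below is laid out as
 *
 *	[ unfragmentable part (hlen bytes) | frag hdr | data (len bytes) ]
 *
 * with the per-fragment payload trimmed to a multiple of eight, as
 * computed above:
 *
 *	len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;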
*/ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; for (off = hlen; off < tlen; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); if (!m) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto sendorfree; } m->m_pkthdr.rcvif = NULL; m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { ip6stat.ip6s_odropped++; goto sendorfree; } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + len >= tlen) len = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(len + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, len)) == 0) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto sendorfree; } m_cat(m, m_frgpart); m->m_pkthdr.len = len + hlen + sizeof(*ip6f); m->m_pkthdr.rcvif = (struct ifnet *)0; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; ip6stat.ip6s_ofragments++; in6_ifstat_inc(ifp, ifs6_out_fragcreat); } in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbage. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ip6stat.ip6s_fragmented++; done: if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */ RTFREE(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { RTFREE(ro_pmtu->ro_rt); } #ifdef IPSEC if (sp != NULL) key_freesp(sp); #endif /* IPSEC */ return(error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* fall through */ bad: m_freem(m); goto done; } static int ip6_copyexthdr(mp, hdr, hlen) struct mbuf **mp; caddr_t hdr; int hlen; { struct mbuf *m; if (hlen > MCLBYTES) return(ENOBUFS); /* XXX */ MGET(m, M_DONTWAIT, MT_DATA); if (!m) return(ENOBUFS); if (hlen > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return(ENOBUFS); } } m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return(0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(exthdrs, plen) struct ip6_exthdrs *exthdrs; u_int32_t plen; { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate a new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == 0) return(ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return(ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ MGET(n, M_DONTWAIT, MT_DATA); if (n) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return(ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return(0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(m0, m, hlen, frghdrp) struct mbuf *m0, *m; int hlen; struct ip6_frag **frghdrp; { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == 0) return(ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if ((mlast->m_flags & M_EXT) == 0 && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == 0) return(ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return(0); } /* * IP6 socket option processing. */ int ip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int privileged; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; } else { panic("ip6_ctloutput: arg soopt is NULL"); } error = optval = 0; privileged = (td == 0 || suser(td)) ? 
0 : 1; if (level == IPPROTO_IPV6) { switch (op) { case SOPT_SET: switch (optname) { case IPV6_PKTOPTIONS: { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: case IPV6_FAITH: case IPV6_V6ONLY: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->in6p_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ if (optval) \ in6p->in6p_flags |= (bit); \ else \ in6p->in6p_flags &= ~(bit); \ } while (0) #define OPTBIT(bit) (in6p->in6p_flags & (bit) ? 1 : 0) case IPV6_CHECKSUM: in6p->in6p_cksum = optval; break; case IPV6_FAITH: OPTSET(IN6P_FAITH); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->in6p_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->in6p_vflag &= ~INP_IPV4; else in6p->in6p_vflag |= INP_IPV4; break; } break; case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_PKTINFO: OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: OPTSET(IN6P_HOPLIMIT); break; case IPV6_HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (!privileged) return(EPERM); OPTSET(IN6P_HOPOPTS); break; case IPV6_DSTOPTS: if (!privileged) return(EPERM); OPTSET(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_RTHDR: OPTSET(IN6P_RTHDR); break; } break; #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } /* XXX */ MGET(m, sopt->sopt_td ? 
M_TRYWAIT : M_DONTWAIT, MT_HEADER); if (m == 0) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); error = ip6_setmoptions(sopt->sopt_name, &in6p->in6p_moptions, m); (void)m_free(m); } break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->in6p_flags &= ~(IN6P_LOWPORT); in6p->in6p_flags &= ~(IN6P_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->in6p_flags &= ~(IN6P_LOWPORT); in6p->in6p_flags |= IN6P_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->in6p_flags &= ~(IN6P_HIGHPORT); in6p->in6p_flags |= IN6P_LOWPORT; break; default: error = EINVAL; break; } break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec6_set_policy(in6p, optname, req, len, privileged); m_freem(m); } break; #endif /* KAME IPSEC */ case IPV6_FW_ADD: case IPV6_FW_DEL: case IPV6_FW_FLUSH: case IPV6_FW_ZERO: { struct mbuf *m; struct mbuf **mp = &m; if (ip6_fw_ctl_ptr == NULL) return EINVAL; /* XXX */ if ((error = soopt_getm(sopt, &m)) != 0) break; /* XXX */ if ((error = soopt_mcopyin(sopt, m)) != 0) break; error = (*ip6_fw_ctl_ptr)(optname, mp); m = *mp; } break; default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_PKTOPTIONS: if (in6p->in6p_options) { struct mbuf *m; m = m_copym(in6p->in6p_options, 0, M_COPYALL, M_TRYWAIT); error = soopt_mcopyout(sopt, m); if (error == 0) m_freem(m); } else sopt->sopt_valsize = 0; break; case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_PORTRANGE: switch (optname) { case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_CHECKSUM: optval = in6p->in6p_cksum; break; case IPV6_FAITH: optval = OPTBIT(IN6P_FAITH); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->in6p_flags; if (flags & IN6P_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & IN6P_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: if (optname == IPV6_HOPOPTS || optname == IPV6_DSTOPTS || !privileged) return(EPERM); switch (optname) { case IPV6_PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_HOPOPTS: if (!privileged) return(EPERM); optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_DSTOPTS: if (!privileged) return(EPERM); optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: { struct mbuf *m; error = ip6_getmoptions(sopt->sopt_name, in6p->in6p_moptions, &m); if (error == 0) error = sooptcopyout(sopt, mtod(m, char *), m->m_len); m_freem(m); } break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; if (m) { req = mtod(m, 
caddr_t); len = m->m_len; } error = ipsec6_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /*XXX*/ if (error == 0 && m) m_freem(m); break; } #endif /* KAME IPSEC */ case IPV6_FW_GET: { struct mbuf *m; struct mbuf **mp = &m; if (ip6_fw_ctl_ptr == NULL) { return EINVAL; } error = (*ip6_fw_ctl_ptr)(optname, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); } break; default: error = ENOPROTOOPT; break; } break; } } else { error = EINVAL; } return(error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(pktopt, m, so, sopt) struct ip6_pktopts **pktopt; struct mbuf *m; struct socket *so; struct sockopt *sopt; { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; int priv = 0; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, 1, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options. */ if (opt) free(opt, M_IP6OPT); return(0); } /* set options specified by user. */ if (td && !suser(td)) priv = 1; if ((error = ip6_setpktoptions(m, opt, priv, 1)) != 0) { ip6_clearpktopts(opt, 1, -1); /* XXX: discard all options */ return(error); } *pktopt = opt; return(0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void init_ip6pktopts(opt) struct ip6_pktopts *opt; { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ } void ip6_clearpktopts(pktopt, needfree, optname) struct ip6_pktopts *pktopt; int needfree, optname; { if (pktopt == NULL) return; if (optname == -1) { if (needfree && pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1) pktopt->ip6po_hlim = -1; if (optname == -1) { if (needfree && pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1) { if (needfree && pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1) { if (needfree && pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1) { if (needfree && pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1) { if (needfree && pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen =\ (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (0) struct ip6_pktopts * ip6_copypktopts(src, canwait) struct ip6_pktopts *src; int canwait; { struct ip6_pktopts *dst; if (src == NULL) { printf("ip6_copypktopts: invalid argument\n"); return(NULL); } dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL && canwait == M_NOWAIT) goto bad; bzero(dst, sizeof(*dst)); dst->ip6po_hlim = src->ip6po_hlim; if
(src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL && canwait == M_NOWAIT) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL && canwait == M_NOWAIT) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* don't copy the cached route */ return(dst); bad: printf("ip6_copypktopts: copy failed\n"); if (dst->ip6po_pktinfo) free(dst->ip6po_pktinfo, M_IP6OPT); if (dst->ip6po_nexthop) free(dst->ip6po_nexthop, M_IP6OPT); if (dst->ip6po_hbh) free(dst->ip6po_hbh, M_IP6OPT); if (dst->ip6po_dest1) free(dst->ip6po_dest1, M_IP6OPT); if (dst->ip6po_dest2) free(dst->ip6po_dest2, M_IP6OPT); if (dst->ip6po_rthdr) free(dst->ip6po_rthdr, M_IP6OPT); return(NULL); } #undef PKTOPT_EXTHDRCPY void ip6_freepcbopts(pktopt) struct ip6_pktopts *pktopt; { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, 1, -1); free(pktopt, M_IP6OPT); } /* * Set the IP6 multicast options in response to user setsockopt(). */ static int ip6_setmoptions(optname, im6op, m) int optname; struct ip6_moptions **im6op; struct mbuf *m; { int error = 0; u_int loop, ifindex; struct ipv6_mreq *mreq; struct ifnet *ifp; struct ip6_moptions *im6o = *im6op; struct route_in6 ro; struct sockaddr_in6 *dst; struct in6_multi_mship *imm; struct thread *td = curthread; /* XXX */ if (im6o == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ im6o = (struct ip6_moptions *) malloc(sizeof(*im6o), M_IPMOPTS, M_WAITOK); if (im6o == NULL) return(ENOBUFS); *im6op = im6o; im6o->im6o_multicast_ifp = NULL; im6o->im6o_multicast_hlim = ip6_defmcasthlim; im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } switch (optname) { case IPV6_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex)); if (ifindex < 0 || if_index < ifindex) { error = ENXIO; /* XXX EINVAL? */ break; } ifp = ifnet_byindex(ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; break; } im6o->im6o_multicast_ifp = ifp; break; case IPV6_MULTICAST_HOPS: { /* * Set the IP6 hoplimit for outgoing multicast packets. */ int optval; if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &optval, sizeof(optval)); if (optval < -1 || optval >= 256) error = EINVAL; else if (optval == -1) im6o->im6o_multicast_hlim = ip6_defmcasthlim; else im6o->im6o_multicast_hlim = optval; break; } case IPV6_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &loop, sizeof(loop)); if (loop > 1) { error = EINVAL; break; } im6o->im6o_multicast_loop = loop; break; case IPV6_JOIN_GROUP: /* * Add a multicast group membership. * Group must be a valid IP6 multicast address.
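 *
 * A hypothetical userland counterpart of this option (illustrative
 * only; the group, interface name, and socket descriptor "s" are
 * placeholders):
 *
 *	struct ipv6_mreq mreq;
 *	inet_pton(AF_INET6, "ff02::1:2", &mreq.ipv6mr_multiaddr);
 *	mreq.ipv6mr_interface = if_nametoindex("lo0");
 *	setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq, sizeof(mreq));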
*/ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { /* * We use the unspecified address to specify to accept * all multicast addresses. Only super user is allowed * to do this. */ if (suser(td)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * If the interface is specified, validate it. */ if (mreq->ipv6mr_interface < 0 || if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } /* * If no interface was explicitly specified, choose an * appropriate one according to the given multicast address. */ if (mreq->ipv6mr_interface == 0) { /* * If the multicast address is in node-local scope, * the interface should be a loopback interface. * Otherwise, look up the routing table for the * address, and choose the outgoing interface. * XXX: is it a good approach? */ if (IN6_IS_ADDR_MC_NODELOCAL(&mreq->ipv6mr_multiaddr)) { ifp = &loif[0]; } else { ro.ro_rt = NULL; dst = (struct sockaddr_in6 *)&ro.ro_dst; bzero(dst, sizeof(*dst)); dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_family = AF_INET6; dst->sin6_addr = mreq->ipv6mr_multiaddr; rtalloc((struct route *)&ro); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; break; } ifp = ro.ro_rt->rt_ifp; rtfree(ro.ro_rt); } } else ifp = ifnet_byindex(mreq->ipv6mr_interface); /* * See if we found an interface, and confirm that it * supports multicast */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; break; } /* * Put interface index into the multicast address, * if the address has link-local scope. */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { mreq->ipv6mr_multiaddr.s6_addr16[1] = htons(mreq->ipv6mr_interface); } /* * See if the membership already exists. */ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) if (imm->i6mm_maddr->in6m_ifp == ifp && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; if (imm != NULL) { error = EADDRINUSE; break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ imm = malloc(sizeof(*imm), M_IPMADDR, M_WAITOK); if (imm == NULL) { error = ENOBUFS; break; } if ((imm->i6mm_maddr = in6_addmulti(&mreq->ipv6mr_multiaddr, ifp, &error)) == NULL) { free(imm, M_IPMADDR); break; } LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); break; case IPV6_LEAVE_GROUP: /* * Drop a multicast group membership. * Group must be a valid IP6 multicast address. */ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { if (suser(td)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq->ipv6mr_interface < 0 || if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } ifp = ifnet_byindex(mreq->ipv6mr_interface); /* * Put interface index into the multicast address, * if the address has link-local scope. */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { mreq->ipv6mr_multiaddr.s6_addr16[1] = htons(mreq->ipv6mr_interface); } /* * Find the membership in the membership list. 
*/ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) { if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; } if (imm == NULL) { /* Unable to resolve interface */ error = EADDRNOTAVAIL; break; } /* * Give up the multicast address record to which the * membership points. */ LIST_REMOVE(imm, i6mm_chain); in6_delmulti(imm->i6mm_maddr); free(imm, M_IPMADDR); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. */ if (im6o->im6o_multicast_ifp == NULL && im6o->im6o_multicast_hlim == ip6_defmcasthlim && im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && im6o->im6o_memberships.lh_first == NULL) { free(*im6op, M_IPMOPTS); *im6op = NULL; } return(error); } /* * Return the IP6 multicast options in response to user getsockopt(). */ static int ip6_getmoptions(optname, im6o, mp) int optname; struct ip6_moptions *im6o; struct mbuf **mp; { u_int *hlim, *loop, *ifindex; *mp = m_get(M_TRYWAIT, MT_HEADER); /* XXX */ switch (optname) { case IPV6_MULTICAST_IF: ifindex = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) *ifindex = 0; else *ifindex = im6o->im6o_multicast_ifp->if_index; return(0); case IPV6_MULTICAST_HOPS: hlim = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) *hlim = ip6_defmcasthlim; else *hlim = im6o->im6o_multicast_hlim; return(0); case IPV6_MULTICAST_LOOP: loop = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) *loop = IPV6_DEFAULT_MULTICAST_LOOP; else *loop = im6o->im6o_multicast_loop; return(0); default: return(EOPNOTSUPP); } } /* * Discard the IP6 multicast options. */ void ip6_freemoptions(im6o) struct ip6_moptions *im6o; { struct in6_multi_mship *imm; if (im6o == NULL) return; while ((imm = im6o->im6o_memberships.lh_first) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr) in6_delmulti(imm->i6mm_maddr); free(imm, M_IPMADDR); } free(im6o, M_IPMOPTS); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktoptions(control, opt, priv, needcopy) struct mbuf *control; struct ip6_pktopts *opt; int priv, needcopy; { struct cmsghdr *cm = 0; if (control == 0 || opt == 0) return(EINVAL); init_ip6pktopts(opt); /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return(EINVAL); for (; control->m_len; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return(EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; /* * XXX should check if RFC2292 API is mixed with 2292bis API */ switch (cm->cmsg_type) { case IPV6_PKTINFO: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) return(EINVAL); if (needcopy) { /* XXX: Is it really WAITOK?
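 * (For reference, the cmsg parsed here is built by a sender roughly
 *  as follows -- illustrative only, "src" and "ifindex" being
 *  placeholders:
 *
 *	struct in6_pktinfo *pi;
 *	cm->cmsg_level = IPPROTO_IPV6;
 *	cm->cmsg_type = IPV6_PKTINFO;
 *	cm->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
 *	pi = (struct in6_pktinfo *)CMSG_DATA(cm);
 *	pi->ipi6_addr = src;
 *	pi->ipi6_ifindex = ifindex;
 *  )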
*/ opt->ip6po_pktinfo = malloc(sizeof(struct in6_pktinfo), M_IP6OPT, M_WAITOK); bcopy(CMSG_DATA(cm), opt->ip6po_pktinfo, sizeof(struct in6_pktinfo)); } else opt->ip6po_pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm); if (opt->ip6po_pktinfo->ipi6_ifindex && IN6_IS_ADDR_LINKLOCAL(&opt->ip6po_pktinfo->ipi6_addr)) opt->ip6po_pktinfo->ipi6_addr.s6_addr16[1] = htons(opt->ip6po_pktinfo->ipi6_ifindex); if (opt->ip6po_pktinfo->ipi6_ifindex > if_index || opt->ip6po_pktinfo->ipi6_ifindex < 0) { return(ENXIO); } /* * Check if the requested source address is indeed a * unicast address assigned to the node, and can be * used as the packet's source address. */ if (!IN6_IS_ADDR_UNSPECIFIED(&opt->ip6po_pktinfo->ipi6_addr)) { struct in6_ifaddr *ia6; struct sockaddr_in6 sin6; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = opt->ip6po_pktinfo->ipi6_addr; ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(sin6tosa(&sin6)); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) != 0) return(EADDRNOTAVAIL); } break; case IPV6_HOPLIMIT: if (cm->cmsg_len != CMSG_LEN(sizeof(int))) return(EINVAL); opt->ip6po_hlim = *(int *)CMSG_DATA(cm); if (opt->ip6po_hlim < -1 || opt->ip6po_hlim > 255) return(EINVAL); break; case IPV6_NEXTHOP: if (!priv) return(EPERM); if (cm->cmsg_len < sizeof(u_char) || /* check if cmsg_len is large enough for sa_len */ cm->cmsg_len < CMSG_LEN(*CMSG_DATA(cm))) return(EINVAL); if (needcopy) { opt->ip6po_nexthop = malloc(*CMSG_DATA(cm), M_IP6OPT, M_WAITOK); bcopy(CMSG_DATA(cm), opt->ip6po_nexthop, *CMSG_DATA(cm)); } else opt->ip6po_nexthop = (struct sockaddr *)CMSG_DATA(cm); break; case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_hbh))) return(EINVAL); hbh = (struct ip6_hbh *)CMSG_DATA(cm); hbhlen = (hbh->ip6h_len + 1) << 3; if (cm->cmsg_len != CMSG_LEN(hbhlen)) return(EINVAL); if (needcopy) { opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_WAITOK); bcopy(hbh, opt->ip6po_hbh, hbhlen); } else opt->ip6po_hbh = hbh; break; } case IPV6_DSTOPTS: { struct ip6_dest *dest, **newdest; int destlen; if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_dest))) return(EINVAL); dest = (struct ip6_dest *)CMSG_DATA(cm); destlen = (dest->ip6d_len + 1) << 3; if (cm->cmsg_len != CMSG_LEN(destlen)) return(EINVAL); /* * The old advacned API is ambiguous on this * point. Our approach is to determine the * position based according to the existence * of a routing header. Note, however, that * this depends on the order of the extension * headers in the ancillary data; the 1st part * of the destination options header must * appear before the routing header in the * ancillary data, too. * RFC2292bis solved the ambiguity by * introducing separate cmsg types. 
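 *
 * (Illustration, added for clarity: under the heuristic below, a
 * sender wanting destination options both before and after a routing
 * header must order its ancillary data as dest1, rthdr, dest2.
 * Hypothetical userland sketch, assuming a prepared struct msghdr
 * msg:
 *
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_type = IPV6_DSTOPTS;	(first part -> ip6po_dest1)
 *	cm = CMSG_NXTHDR(&msg, cm);
 *	cm->cmsg_type = IPV6_RTHDR;	(-> ip6po_rthdr)
 *	cm = CMSG_NXTHDR(&msg, cm);
 *	cm->cmsg_type = IPV6_DSTOPTS;	(second part -> ip6po_dest2)
 *
 * with cmsg_level set to IPPROTO_IPV6 and cmsg_len filled in via
 * CMSG_LEN() for each entry.)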
*/ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; if (needcopy) { *newdest = malloc(destlen, M_IP6OPT, M_WAITOK); bcopy(dest, *newdest, destlen); } else *newdest = dest; break; } case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_rthdr))) return(EINVAL); rth = (struct ip6_rthdr *)CMSG_DATA(cm); rthlen = (rth->ip6r_len + 1) << 3; if (cm->cmsg_len != CMSG_LEN(rthlen)) return(EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: /* must contain one addr */ if (rth->ip6r_len == 0) return(EINVAL); /* length must be even */ if (rth->ip6r_len % 2) return(EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return(EINVAL); break; default: return(EINVAL); /* not supported */ } if (needcopy) { opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_WAITOK); bcopy(rth, opt->ip6po_rthdr, rthlen); } else opt->ip6po_rthdr = rth; break; } default: return(ENOPROTOOPT); } } return(0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(ifp, m, dst) struct ifnet *ifp; struct mbuf *m; struct sockaddr_in6 *dst; { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if ((copym->m_flags & M_EXT) != 0 || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); #ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); #endif (void)if_simloop(ifp, copym, dst->sin6_family, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(m, exthdrs) struct mbuf *m; struct ip6_exthdrs *exthdrs; { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == 0) { m_freem(m); return ENOBUFS; } M_COPY_PKTHDR(mh, m); MH_ALIGN(mh, sizeof(*ip6)); m->m_flags &= ~M_PKTHDR; m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(in6p) struct in6pcb *in6p; { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? 
(((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netinet6/ip6_var.h =================================================================== --- head/sys/netinet6/ip6_var.h (revision 105193) +++ head/sys/netinet6/ip6_var.h (revision 105194) @@ -1,353 +1,354 @@ /* $FreeBSD$ */ /* $KAME: ip6_var.h,v 1.62 2001/05/03 14:51:48 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET6_IP6_VAR_H_ #define _NETINET6_IP6_VAR_H_ /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { u_int32_t ip6q_head; u_int16_t ip6q_len; u_int8_t ip6q_nxt; /* ip6f_nxt in first fragment */ u_int8_t ip6q_hlim; struct ip6asfrag *ip6q_down; struct ip6asfrag *ip6q_up; u_int32_t ip6q_ident; u_int8_t ip6q_arrive; u_int8_t ip6q_ttl; struct in6_addr ip6q_src, ip6q_dst; struct ip6q *ip6q_next; struct ip6q *ip6q_prev; int ip6q_unfrglen; /* len of unfragmentable part */ #ifdef notyet u_char *ip6q_nxtp; #endif }; struct ip6asfrag { u_int32_t ip6af_head; u_int16_t ip6af_len; u_int8_t ip6af_nxt; u_int8_t ip6af_hlim; /* must not override the above members during reassembling */ struct ip6asfrag *ip6af_down; struct ip6asfrag *ip6af_up; struct mbuf *ip6af_m; int ip6af_offset; /* offset in ip6af_m to next header */ int ip6af_frglen; /* fragmentable part length */ int ip6af_off; /* fragment offset */ u_int16_t ip6af_mff; /* more fragment bit in frag off */ }; #define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) struct ip6_moptions { struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ LIST_HEAD(, in6_multi_mship) im6o_memberships; }; /* * Control options for outgoing packets */ /* Routing header related info */ struct ip6po_rhinfo { struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */ struct route_in6 ip6po_rhi_route; /* Route to the 1st hop */ }; #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route struct ip6_pktopts { struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ int ip6po_hlim; /* Hoplimit for outgoing packets */ /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; struct sockaddr *ip6po_nexthop; /* Next-hop address */ struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ struct ip6_dest *ip6po_dest1; /* Routing header related info. 
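 *
 * (Note added for clarity: the ip6po_rthdr and ip6po_route
 * convenience macros above resolve into this member, pairing the
 * routing header with a cached route to its first hop.)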
*/ struct ip6po_rhinfo ip6po_rhinfo; /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; }; /* * Control options for incoming packets */ struct ip6stat { u_quad_t ip6s_total; /* total packets received */ u_quad_t ip6s_tooshort; /* packet too short */ u_quad_t ip6s_toosmall; /* not enough data */ u_quad_t ip6s_fragments; /* fragments received */ u_quad_t ip6s_fragdropped; /* frags dropped(dups, out of space) */ u_quad_t ip6s_fragtimeout; /* fragments timed out */ u_quad_t ip6s_fragoverflow; /* fragments that exceeded limit */ u_quad_t ip6s_forward; /* packets forwarded */ u_quad_t ip6s_cantforward; /* packets rcvd for unreachable dest */ u_quad_t ip6s_redirectsent; /* packets forwarded on same net */ u_quad_t ip6s_delivered; /* datagrams delivered to upper level*/ u_quad_t ip6s_localout; /* total ip packets generated here */ u_quad_t ip6s_odropped; /* lost packets due to nobufs, etc. */ u_quad_t ip6s_reassembled; /* total packets reassembled ok */ u_quad_t ip6s_fragmented; /* datagrams sucessfully fragmented */ u_quad_t ip6s_ofragments; /* output fragments created */ u_quad_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ u_quad_t ip6s_badoptions; /* error in option processing */ u_quad_t ip6s_noroute; /* packets discarded due to no route */ u_quad_t ip6s_badvers; /* ip6 version != 6 */ u_quad_t ip6s_rawout; /* total raw ip packets generated */ u_quad_t ip6s_badscope; /* scope error */ u_quad_t ip6s_notmember; /* don't join this multicast group */ u_quad_t ip6s_nxthist[256]; /* next header history */ u_quad_t ip6s_m1; /* one mbuf */ u_quad_t ip6s_m2m[32]; /* two or more mbuf */ u_quad_t ip6s_mext1; /* one ext mbuf */ u_quad_t ip6s_mext2m; /* two or more ext mbuf */ u_quad_t ip6s_exthdrtoolong; /* ext hdr are not continuous */ u_quad_t ip6s_nogif; /* no match gif found */ u_quad_t ip6s_toomanyhdr; /* discarded due to too many headers */ /* * statistics for improvement of the source address selection * algorithm: * XXX: hardcoded 16 = # of ip6 multicast scope types + 1 */ /* number of times that address selection fails */ u_quad_t ip6s_sources_none; /* number of times that an address on the outgoing I/F is chosen */ u_quad_t ip6s_sources_sameif[16]; /* number of times that an address on a non-outgoing I/F is chosen */ u_quad_t ip6s_sources_otherif[16]; /* * number of times that an address that has the same scope * from the destination is chosen. */ u_quad_t ip6s_sources_samescope[16]; /* * number of times that an address that has a different scope * from the destination is chosen. */ u_quad_t ip6s_sources_otherscope[16]; /* number of times that an deprecated address is chosen */ u_quad_t ip6s_sources_deprecated[16]; u_quad_t ip6s_forward_cachehit; u_quad_t ip6s_forward_cachemiss; }; #ifdef _KERNEL /* * IPv6 onion peeling state. * it will be initialized when we come into ip6_input(). * XXX do not make it a kitchen sink! */ struct ip6aux { u_int32_t ip6a_flags; #define IP6A_SWAP 0x01 /* swapped home/care-of on packet */ #define IP6A_HASEEN 0x02 /* HA was present */ #define IP6A_BRUID 0x04 /* BR Unique Identifier was present */ #define IP6A_RTALERTSEEN 0x08 /* rtalert present */ /* ip6.ip6_src */ struct in6_addr ip6a_careof; /* care-of address of the peer */ struct in6_addr ip6a_home; /* home address of the peer */ u_int16_t ip6a_bruid; /* BR unique identifier */ /* ip6.ip6_dst */ struct in6_ifaddr *ip6a_dstia6; /* my ifaddr that matches ip6_dst */ /* rtalert */ u_int16_t ip6a_rtalert; /* rtalert option value */ /* * decapsulation history will be here. 
* with IPsec it may not be accurate. */ }; #endif #ifdef _KERNEL /* flags passed to ip6_output as last parameter */ #define IPV6_DADOUTPUT 0x01 /* DAD */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ extern struct ip6stat ip6stat; /* statistics */ extern u_int32_t ip6_id; /* fragment identifier */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ extern int ip6_forwarding; /* act as router? */ extern int ip6_forward_srcrt; /* forward src-routed? */ extern int ip6_gif_hlim; /* Hop limit for gif encap packet */ extern int ip6_use_deprecated; /* allow deprecated addr as source */ extern int ip6_rr_prune; /* router renumbering prefix * walk list every 5 sec. */ extern int ip6_v6only; extern struct socket *ip6_mrouter; /* multicast routing daemon */ extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_sourcecheck; /* Verify source interface */ extern int ip6_sourcecheck_interval; /* Interval between log messages */ extern int ip6_accept_rtadv; /* Acts as a host not a router */ extern int ip6_keepfaith; /* Firewall Aided Internet Translator */ extern int ip6_log_interval; extern time_t ip6_log_time; extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ extern u_int32_t ip6_flow_seq; extern int ip6_auto_flowlabel; extern int ip6_auto_linklocal; extern int ip6_anonportmin; /* minimum ephemeral port */ extern int ip6_anonportmax; /* maximum ephemeral port */ extern int ip6_lowportmin; /* minimum reserved port */ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_use_tempaddr; /* whether to use temporary addresses. 
*/ extern struct pr_usrreqs rip6_usrreqs; struct sockopt; struct inpcb; int icmp6_ctloutput __P((struct socket *, struct sockopt *sopt)); struct in6_ifaddr; void ip6_init __P((void)); void ip6intr __P((void)); void ip6_input __P((struct mbuf *)); struct in6_ifaddr *ip6_getdstifaddr __P((struct mbuf *)); void ip6_freepcbopts __P((struct ip6_pktopts *)); void ip6_freemoptions __P((struct ip6_moptions *)); int ip6_unknown_opt __P((u_int8_t *, struct mbuf *, int)); char * ip6_get_prevhdr __P((struct mbuf *, int)); int ip6_nexthdr __P((struct mbuf *, int, int, int *)); int ip6_lasthdr __P((struct mbuf *, int, int, int *)); -struct mbuf *ip6_addaux __P((struct mbuf *)); -struct mbuf *ip6_findaux __P((struct mbuf *)); +struct ip6aux *ip6_addaux __P((struct mbuf *)); +struct ip6aux *ip6_findaux __P((struct mbuf *)); void ip6_delaux __P((struct mbuf *)); int ip6_mforward __P((struct ip6_hdr *, struct ifnet *, struct mbuf *)); int ip6_process_hopopts __P((struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *)); void ip6_savecontrol __P((struct inpcb *, struct mbuf **, struct ip6_hdr *, struct mbuf *)); void ip6_notify_pmtu __P((struct inpcb *, struct sockaddr_in6 *, u_int32_t *)); int ip6_sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); void ip6_forward __P((struct mbuf *, int)); void ip6_mloopback __P((struct ifnet *, struct mbuf *, struct sockaddr_in6 *)); int ip6_output __P((struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int, - struct ip6_moptions *, struct ifnet **)); + struct ip6_moptions *, struct ifnet **, + struct inpcb *)); int ip6_ctloutput __P((struct socket *, struct sockopt *sopt)); void init_ip6pktopts __P((struct ip6_pktopts *)); int ip6_setpktoptions __P((struct mbuf *, struct ip6_pktopts *, int, int)); void ip6_clearpktopts __P((struct ip6_pktopts *, int, int)); struct ip6_pktopts *ip6_copypktopts __P((struct ip6_pktopts *, int)); int ip6_optlen __P((struct inpcb *)); int route6_input __P((struct mbuf **, int *, int)); void frag6_init __P((void)); int frag6_input __P((struct mbuf **, int *, int)); void frag6_slowtimo __P((void)); void frag6_drain __P((void)); void rip6_init __P((void)); int rip6_input __P((struct mbuf **mp, int *offp, int proto)); void rip6_ctlinput __P((int, struct sockaddr *, void *)); int rip6_ctloutput __P((struct socket *so, struct sockopt *sopt)); int rip6_output __P((struct mbuf *, ...)); int rip6_usrreq __P((struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *)); int dest6_input __P((struct mbuf **, int *, int)); int none_input __P((struct mbuf **, int *, int)); #endif /* _KERNEL */ #endif /* !_NETINET6_IP6_VAR_H_ */ Index: head/sys/netinet6/ipsec.c =================================================================== --- head/sys/netinet6/ipsec.c (revision 105193) +++ head/sys/netinet6/ipsec.c (revision 105194) @@ -1,3602 +1,3502 @@ /* $FreeBSD$ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #ifdef IPSEC_ESP #include #ifdef INET6 #include #endif #endif #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef IPSEC_DEBUG int ipsec_debug = 1; #else int ipsec_debug = 0; #endif struct ipsecstat ipsecstat; int ip4_ah_cleartos = 1; int ip4_ah_offsetmask = 0; /* maybe IP_DF? */ int ip4_ipsec_dfbit = 0; /* DF bit on encap. 
0: clear 1: set 2: copy */ int ip4_esp_trans_deflev = IPSEC_LEVEL_USE; int ip4_esp_net_deflev = IPSEC_LEVEL_USE; int ip4_ah_trans_deflev = IPSEC_LEVEL_USE; int ip4_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip4_def_policy; int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ int ip4_esp_randpad = -1; #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_ipsec); #ifdef INET6 SYSCTL_DECL(_net_inet6_ipsec6); #endif #endif /* net.inet.ipsec */ SYSCTL_STRUCT(_net_inet_ipsec, IPSECCTL_STATS, stats, CTLFLAG_RD, &ipsecstat, ipsecstat, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, &ip4_def_policy.policy, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, &ip4_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, &ip4_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, &ip4_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, &ip4_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, CTLFLAG_RW, &ip4_ah_cleartos, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, CTLFLAG_RW, &ip4_ah_offsetmask, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit, CTLFLAG_RW, &ip4_ipsec_dfbit, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn, CTLFLAG_RW, &ip4_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug, CTLFLAG_RW, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ESP_RANDPAD, esp_randpad, CTLFLAG_RW, &ip4_esp_randpad, 0, ""); #ifdef INET6 struct ipsecstat ipsec6stat; int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; int ip6_esp_net_deflev = IPSEC_LEVEL_USE; int ip6_ah_trans_deflev = IPSEC_LEVEL_USE; int ip6_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip6_def_policy; int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ int ip6_esp_randpad = -1; /* net.inet6.ipsec6 */ SYSCTL_STRUCT(_net_inet6_ipsec6, IPSECCTL_STATS, stats, CTLFLAG_RD, &ipsec6stat, ipsecstat, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, &ip6_def_policy.policy, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, &ip6_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, &ip6_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, &ip6_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, &ip6_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, ecn, CTLFLAG_RW, &ip6_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_RW, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, esp_randpad, CTLFLAG_RW, &ip6_esp_randpad, 0, ""); #endif /* INET6 */ static int ipsec_setspidx_mbuf __P((struct secpolicyindex *, u_int, u_int, struct mbuf *, int)); static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); #ifdef INET6 static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); #endif static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #ifdef INET6 static void ipsec6_get_ulp __P((struct mbuf *m, struct 
secpolicyindex *, int)); static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #endif static struct inpcbpolicy *ipsec_newpcbpolicy __P((void)); static void ipsec_delpcbpolicy __P((struct inpcbpolicy *)); static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src)); static int ipsec_set_policy __P((struct secpolicy **pcb_sp, int optname, caddr_t request, size_t len, int priv)); static int ipsec_get_policy __P((struct secpolicy *pcb_sp, struct mbuf **mp)); static void vshiftl __P((unsigned char *, int, int)); static int ipsec_in_reject __P((struct secpolicy *, struct mbuf *)); static size_t ipsec_hdrsiz __P((struct secpolicy *)); #ifdef INET static struct mbuf *ipsec4_splithdr __P((struct mbuf *)); #endif #ifdef INET6 static struct mbuf *ipsec6_splithdr __P((struct mbuf *)); #endif #ifdef INET static int ipsec4_encapsulate __P((struct mbuf *, struct secasvar *)); #endif #ifdef INET6 static int ipsec6_encapsulate __P((struct mbuf *, struct secasvar *)); #endif -static struct mbuf *ipsec_addaux __P((struct mbuf *)); -static struct mbuf *ipsec_findaux __P((struct mbuf *)); -static void ipsec_optaux __P((struct mbuf *, struct mbuf *)); /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. */ struct secpolicy * ipsec4_getpolicybysock(m, dir, so, error) struct mbuf *m; u_int dir; struct socket *so; int *error; { struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *kernsp = NULL; /* policy on kernel */ /* sanity check */ if (m == NULL || so == NULL || error == NULL) panic("ipsec4_getpolicybysock: NULL pointer was passed.\n"); switch (so->so_proto->pr_domain->dom_family) { case AF_INET: /* set spidx in pcb */ *error = ipsec4_setspidx_inpcb(m, sotoinpcb(so)); break; #ifdef INET6 case AF_INET6: /* set spidx in pcb */ *error = ipsec6_setspidx_in6pcb(m, sotoin6pcb(so)); break; #endif default: panic("ipsec4_getpolicybysock: unsupported address family\n"); } if (*error) return NULL; switch (so->so_proto->pr_domain->dom_family) { case AF_INET: pcbsp = sotoinpcb(so)->inp_sp; break; #ifdef INET6 case AF_INET6: pcbsp = sotoin6pcb(so)->in6p_sp; break; #endif } /* sanity check */ if (pcbsp == NULL) panic("ipsec4_getpolicybysock: pcbsp is NULL.\n"); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; default: panic("ipsec4_getpolicybysock: illegal direction.\n"); } /* sanity check */ if (currsp == NULL) panic("ipsec4_getpolicybysock: currsp is NULL.\n"); /* when privilieged socket */ if (pcbsp->priv) { switch (currsp->policy) { case IPSEC_POLICY_BYPASS: currsp->refcnt++; *error = 0; return currsp; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ if (ip4_def_policy.policy != IPSEC_POLICY_DISCARD && ip4_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip4_def_policy.policy, IPSEC_POLICY_NONE)); ip4_def_policy.policy = IPSEC_POLICY_NONE; } 
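	/*
	 * (Note added for clarity: only DISCARD and NONE are meaningful
	 * as the system-wide default policy; the block above repairs any
	 * other value to NONE before the default is handed out below
	 * with an extra reference.)
	 */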
ip4_def_policy.refcnt++; *error = 0; return &ip4_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* when non-privilieged socket */ /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Illegal policy for non-priviliged defined %d\n", currsp->policy)); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: if (ip4_def_policy.policy != IPSEC_POLICY_DISCARD && ip4_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip4_def_policy.policy, IPSEC_POLICY_NONE)); ip4_def_policy.policy = IPSEC_POLICY_NONE; } ip4_def_policy.refcnt++; *error = 0; return &ip4_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. */ struct secpolicy * ipsec4_getpolicybyaddr(m, dir, flag, error) struct mbuf *m; u_int dir; int flag; int *error; { struct secpolicy *sp = NULL; /* sanity check */ if (m == NULL || error == NULL) panic("ipsec4_getpolicybyaddr: NULL pointer was passed.\n"); { struct secpolicyindex spidx; bzero(&spidx, sizeof(spidx)); /* make a index to look for a policy */ *error = ipsec_setspidx_mbuf(&spidx, dir, AF_INET, m, (flag & IP_FORWARDING) ? 0 : 1); if (*error != 0) return NULL; sp = key_allocsp(&spidx, dir); } /* SP found */ if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_getpolicybyaddr called " "to allocate SP:%p\n", sp)); *error = 0; return sp; } /* no SP found */ if (ip4_def_policy.policy != IPSEC_POLICY_DISCARD && ip4_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy:%d->%d\n", ip4_def_policy.policy, IPSEC_POLICY_NONE)); ip4_def_policy.policy = IPSEC_POLICY_NONE; } ip4_def_policy.refcnt++; *error = 0; return &ip4_def_policy; } #ifdef INET6 /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. 
* others: a pointer to SP */ struct secpolicy * ipsec6_getpolicybysock(m, dir, so, error) struct mbuf *m; u_int dir; struct socket *so; int *error; { struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *kernsp = NULL; /* policy on kernel */ /* sanity check */ if (m == NULL || so == NULL || error == NULL) panic("ipsec6_getpolicybysock: NULL pointer was passed.\n"); #ifdef DIAGNOSTIC if (so->so_proto->pr_domain->dom_family != AF_INET6) panic("ipsec6_getpolicybysock: socket domain != inet6\n"); #endif /* set spidx in pcb */ ipsec6_setspidx_in6pcb(m, sotoin6pcb(so)); pcbsp = sotoin6pcb(so)->in6p_sp; /* sanity check */ if (pcbsp == NULL) panic("ipsec6_getpolicybysock: pcbsp is NULL.\n"); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; default: panic("ipsec6_getpolicybysock: illegal direction.\n"); } /* sanity check */ if (currsp == NULL) panic("ipsec6_getpolicybysock: currsp is NULL.\n"); /* when privilieged socket */ if (pcbsp->priv) { switch (currsp->policy) { case IPSEC_POLICY_BYPASS: currsp->refcnt++; *error = 0; return currsp; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ if (ip6_def_policy.policy != IPSEC_POLICY_DISCARD && ip6_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip6_def_policy.policy, IPSEC_POLICY_NONE)); ip6_def_policy.policy = IPSEC_POLICY_NONE; } ip6_def_policy.refcnt++; *error = 0; return &ip6_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec6_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* when non-privilieged socket */ /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "ipsec6_getpolicybysock: " "Illegal policy for non-priviliged defined %d\n", currsp->policy)); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: if (ip6_def_policy.policy != IPSEC_POLICY_DISCARD && ip6_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip6_def_policy.policy, IPSEC_POLICY_NONE)); ip6_def_policy.policy = IPSEC_POLICY_NONE; } ip6_def_policy.refcnt++; *error = 0; return &ip6_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec6_policybysock: Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * `flag' means that packet is to be forwarded whether or not. * flag = 1: forwad * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. 
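 *
 * (Note added for clarity: the (flag & IP_FORWARDING) test in the
 * body below decides whether port numbers are collected into the
 * policy index -- forwarded packets are matched on addresses and
 * protocol only, while locally generated packets are matched on
 * ports as well.)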
*/ #ifndef IP_FORWARDING #define IP_FORWARDING 1 #endif struct secpolicy * ipsec6_getpolicybyaddr(m, dir, flag, error) struct mbuf *m; u_int dir; int flag; int *error; { struct secpolicy *sp = NULL; /* sanity check */ if (m == NULL || error == NULL) panic("ipsec6_getpolicybyaddr: NULL pointer was passed.\n"); { struct secpolicyindex spidx; bzero(&spidx, sizeof(spidx)); /* make a index to look for a policy */ *error = ipsec_setspidx_mbuf(&spidx, dir, AF_INET6, m, (flag & IP_FORWARDING) ? 0 : 1); if (*error != 0) return NULL; sp = key_allocsp(&spidx, dir); } /* SP found */ if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_getpolicybyaddr called " "to allocate SP:%p\n", sp)); *error = 0; return sp; } /* no SP found */ if (ip6_def_policy.policy != IPSEC_POLICY_DISCARD && ip6_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip6_def_policy.policy, IPSEC_POLICY_NONE)); ip6_def_policy.policy = IPSEC_POLICY_NONE; } ip6_def_policy.refcnt++; *error = 0; return &ip6_def_policy; } #endif /* INET6 */ /* * set IP address into spidx from mbuf. * When Forwarding packet and ICMP echo reply, this function is used. * * IN: get the followings from mbuf. * protocol family, src, dst, next protocol * OUT: * 0: success. * other: failure, and set errno. */ int ipsec_setspidx_mbuf(spidx, dir, family, m, needport) struct secpolicyindex *spidx; u_int dir, family; struct mbuf *m; int needport; { int error; /* sanity check */ if (spidx == NULL || m == NULL) panic("ipsec_setspidx_mbuf: NULL pointer was passed.\n"); bzero(spidx, sizeof(*spidx)); error = ipsec_setspidx(m, spidx, needport); if (error) goto bad; spidx->dir = dir; return 0; bad: /* XXX initialize */ bzero(spidx, sizeof(*spidx)); return EINVAL; } static int ipsec4_setspidx_inpcb(m, pcb) struct mbuf *m; struct inpcb *pcb; { struct secpolicyindex *spidx; int error; /* sanity check */ if (pcb == NULL) panic("ipsec4_setspidx_inpcb: no PCB found.\n"); if (pcb->inp_sp == NULL) panic("ipsec4_setspidx_inpcb: no inp_sp found.\n"); if (pcb->inp_sp->sp_out == NULL || pcb->inp_sp->sp_in == NULL) panic("ipsec4_setspidx_inpcb: no sp_in/out found.\n"); bzero(&pcb->inp_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->inp_sp->sp_out->spidx, sizeof(*spidx)); spidx = &pcb->inp_sp->sp_in->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_INBOUND; spidx = &pcb->inp_sp->sp_out->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_OUTBOUND; return 0; bad: bzero(&pcb->inp_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->inp_sp->sp_out->spidx, sizeof(*spidx)); return error; } #ifdef INET6 static int ipsec6_setspidx_in6pcb(m, pcb) struct mbuf *m; struct in6pcb *pcb; { struct secpolicyindex *spidx; int error; /* sanity check */ if (pcb == NULL) panic("ipsec6_setspidx_in6pcb: no PCB found.\n"); if (pcb->in6p_sp == NULL) panic("ipsec6_setspidx_in6pcb: no in6p_sp found.\n"); if (pcb->in6p_sp->sp_out == NULL || pcb->in6p_sp->sp_in == NULL) panic("ipsec6_setspidx_in6pcb: no sp_in/out found.\n"); bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx)); spidx = &pcb->in6p_sp->sp_in->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_INBOUND; spidx = &pcb->in6p_sp->sp_out->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_OUTBOUND; return 0; bad: bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, 
sizeof(*spidx)); return error; } #endif /* * configure security policy index (src/dst/proto/sport/dport) * by looking at the content of mbuf. * the caller is responsible for error recovery (like clearing up spidx). */ static int ipsec_setspidx(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { struct ip *ip = NULL; struct ip ipbuf; u_int v; struct mbuf *n; int len; int error; if (m == NULL) panic("ipsec_setspidx: m == 0 passed.\n"); /* * validate m->m_pkthdr.len. we see incorrect length if we * mistakenly call this function with inconsistent mbuf chain * (like 4.4BSD tcp/udp processing). XXX should we panic here? */ len = 0; for (n = m; n; n = n->m_next) len += n->m_len; if (m->m_pkthdr.len != len) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "total of m_len(%d) != pkthdr.len(%d), " "ignored.\n", len, m->m_pkthdr.len)); return EINVAL; } if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "pkthdr.len(%d) < sizeof(struct ip), ignored.\n", m->m_pkthdr.len)); return EINVAL; } if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } #ifdef _IP_VHL v = _IP_VHL_V(ip->ip_vhl); #else v = ip->ip_v; #endif switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return error; ipsec4_get_ulp(m, spidx, needport); return 0; #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "pkthdr.len(%d) < sizeof(struct ip6_hdr), " "ignored.\n", m->m_pkthdr.len)); return EINVAL; } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return error; ipsec6_get_ulp(m, spidx, needport); return 0; #endif default: KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "unknown IP version %u, ignored.\n", v)); return EINVAL; } } static void ipsec4_get_ulp(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { struct ip ip; struct ip6_ext ip6e; u_int8_t nxt; int off; struct tcphdr th; struct udphdr uh; /* sanity check */ if (m == NULL) panic("ipsec4_get_ulp: NULL pointer was passed.\n"); if (m->m_pkthdr.len < sizeof(ip)) panic("ipsec4_get_ulp: too short\n"); /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in *)&spidx->src)->sin_port = IPSEC_PORT_ANY; ((struct sockaddr_in *)&spidx->dst)->sin_port = IPSEC_PORT_ANY; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); /* ip_input() flips it into host endian XXX need more checking */ if (ip.ip_off & (IP_MF | IP_OFFMASK)) return; nxt = ip.ip_p; #ifdef _IP_VHL off = _IP_VHL_HL(ip->ip_vhl) << 2; #else off = ip.ip_hl << 2; #endif while (off < m->m_pkthdr.len) { switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) return; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) return; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in *)&spidx->src)->sin_port = th.th_sport; ((struct sockaddr_in *)&spidx->dst)->sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) return; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) return; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in *)&spidx->src)->sin_port = uh.uh_sport; ((struct sockaddr_in *)&spidx->dst)->sin_port = uh.uh_dport; return; case IPPROTO_AH: if (m->m_pkthdr.len > off + sizeof(ip6e)) return; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: default: /* XXX intermediate 
headers??? */ spidx->ul_proto = nxt; return; } } } /* assumes that m is sane */ static int ipsec4_setspidx_ipaddr(m, spidx) struct mbuf *m; struct secpolicyindex *spidx; { struct ip *ip = NULL; struct ip ipbuf; struct sockaddr_in *sin; if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } sin = (struct sockaddr_in *)&spidx->src; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_src, &sin->sin_addr, sizeof(ip->ip_src)); spidx->prefs = sizeof(struct in_addr) << 3; sin = (struct sockaddr_in *)&spidx->dst; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &sin->sin_addr, sizeof(ip->ip_dst)); spidx->prefd = sizeof(struct in_addr) << 3; return 0; } #ifdef INET6 static void ipsec6_get_ulp(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { int off, nxt; struct tcphdr th; struct udphdr uh; /* sanity check */ if (m == NULL) panic("ipsec6_get_ulp: NULL pointer was passed.\n"); KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec6_get_ulp:\n"); kdebug_mbuf(m)); /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: default: /* XXX intermediate headers??? 
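 *
 * (Note on the length arithmetic used by these upper-layer walks,
 * added for clarity: AH stores its payload length in 32-bit words
 * minus two, so ipsec4_get_ulp() above advances past it by
 * (ip6e_len + 2) << 2 bytes, while ordinary IPv6 extension headers
 * store their length in 8-byte units minus one, hence the
 * (len + 1) << 3 computations in the option handling earlier in
 * this change.)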
*/ spidx->ul_proto = nxt; break; } } /* assumes that m is sane */ static int ipsec6_setspidx_ipaddr(m, spidx) struct mbuf *m; struct secpolicyindex *spidx; { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) ip6 = mtod(m, struct ip6_hdr *); else { m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return 0; } #endif static struct inpcbpolicy * ipsec_newpcbpolicy() { struct inpcbpolicy *p; p = (struct inpcbpolicy *)malloc(sizeof(*p), M_SECA, M_NOWAIT); return p; } static void ipsec_delpcbpolicy(p) struct inpcbpolicy *p; { free(p, M_SECA); } /* initialize policy in PCB */ int ipsec_init_policy(so, pcb_sp) struct socket *so; struct inpcbpolicy **pcb_sp; { struct inpcbpolicy *new; /* sanity check. */ if (so == NULL || pcb_sp == NULL) panic("ipsec_init_policy: NULL pointer was passed.\n"); new = ipsec_newpcbpolicy(); if (new == NULL) { ipseclog((LOG_DEBUG, "ipsec_init_policy: No more memory.\n")); return ENOBUFS; } bzero(new, sizeof(*new)); if (so->so_cred != 0 && so->so_cred->cr_uid == 0) new->priv = 1; else new->priv = 0; if ((new->sp_in = key_newsp()) == NULL) { ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_in->state = IPSEC_SPSTATE_ALIVE; new->sp_in->policy = IPSEC_POLICY_ENTRUST; if ((new->sp_out = key_newsp()) == NULL) { key_freesp(new->sp_in); ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_out->state = IPSEC_SPSTATE_ALIVE; new->sp_out->policy = IPSEC_POLICY_ENTRUST; *pcb_sp = new; return 0; } /* copy old ipsec policy into new */ int ipsec_copy_policy(old, new) struct inpcbpolicy *old, *new; { struct secpolicy *sp; sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { key_freesp(new->sp_in); new->sp_in = sp; } else return ENOBUFS; sp = ipsec_deepcopy_policy(old->sp_out); if (sp) { key_freesp(new->sp_out); new->sp_out = sp; } else return ENOBUFS; new->priv = old->priv; return 0; } /* deep-copy a policy in PCB */ static struct secpolicy * ipsec_deepcopy_policy(src) struct secpolicy *src; { struct ipsecrequest *newchain = NULL; struct ipsecrequest *p; struct ipsecrequest **q; struct ipsecrequest *r; struct secpolicy *dst; dst = key_newsp(); if (src == NULL || dst == NULL) return NULL; /* * deep-copy IPsec request chain. This is required since struct * ipsecrequest is not reference counted. 
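 *
 * (Consequence, noted for clarity: each PCB policy owns its own
 * request chain, so freeing one socket's policy can never pull
 * requests out from under another's.)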
*/ q = &newchain; for (p = src->req; p; p = p->next) { *q = (struct ipsecrequest *)malloc(sizeof(struct ipsecrequest), M_SECA, M_NOWAIT); if (*q == NULL) goto fail; bzero(*q, sizeof(**q)); (*q)->next = NULL; (*q)->saidx.proto = p->saidx.proto; (*q)->saidx.mode = p->saidx.mode; (*q)->level = p->level; (*q)->saidx.reqid = p->saidx.reqid; bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src)); bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst)); (*q)->sav = NULL; (*q)->sp = dst; q = &((*q)->next); } dst->req = newchain; dst->state = src->state; dst->policy = src->policy; /* do not touch the refcnt fields */ return dst; fail: for (p = newchain; p; p = r) { r = p->next; free(p, M_SECA); p = NULL; } return NULL; } /* set policy and ipsec request if present. */ static int ipsec_set_policy(pcb_sp, optname, request, len, priv) struct secpolicy **pcb_sp; int optname; caddr_t request; size_t len; int priv; { struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; /* sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_set_policy: passed policy\n"); kdebug_sadb_x_policy((struct sadb_ext *)xpl)); /* check policy type */ /* ipsec_set_policy() accepts IPSEC, ENTRUST and BYPASS. */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return EINVAL; /* check privileged socket */ if (priv == 0 && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) return EACCES; /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return error; newsp->state = IPSEC_SPSTATE_ALIVE; /* clear old SP and set new SP */ key_freesp(*pcb_sp); *pcb_sp = newsp; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_set_policy: new policy\n"); kdebug_secpolicy(newsp)); return 0; } static int ipsec_get_policy(pcb_sp, mp) struct secpolicy *pcb_sp; struct mbuf **mp; { /* sanity check. */ if (pcb_sp == NULL || mp == NULL) return EINVAL; *mp = key_sp2msg(pcb_sp); if (!*mp) { ipseclog((LOG_DEBUG, "ipsec_get_policy: No more memory.\n")); return ENOBUFS; } (*mp)->m_type = MT_DATA; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_get_policy:\n"); kdebug_mbuf(*mp)); return 0; } int ipsec4_set_policy(inp, optname, request, len, priv) struct inpcb *inp; int optname; caddr_t request; size_t len; int priv; { struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (inp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec4_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, priv); } int ipsec4_get_policy(inp, request, len, mp) struct inpcb *inp; caddr_t request; size_t len; struct mbuf **mp; { struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. 
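 *
 * (Aside, added for illustration: these per-PCB get/set entry points
 * are what setsockopt(2)/getsockopt(2) with IP_IPSEC_POLICY reach.
 * A hypothetical userland sketch using libipsec, not part of this
 * change:
 *
 *	char *req = "out ipsec esp/transport//require";
 *	char *buf = ipsec_set_policy(req, strlen(req));
 *
 *	if (buf == NULL)
 *		errx(1, "%s", ipsec_strerror());
 *	if (setsockopt(s, IPPROTO_IP, IP_IPSEC_POLICY,
 *	    buf, ipsec_get_policylen(buf)) == -1)
 *		err(1, "IP_IPSEC_POLICY");
 *	free(buf);
 * )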
*/ if (inp == NULL || request == NULL || mp == NULL) return EINVAL; if (inp->inp_sp == NULL) panic("policy in PCB is NULL\n"); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec4_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } /* delete policy in PCB */ int ipsec4_delete_pcbpolicy(inp) struct inpcb *inp; { /* sanity check. */ if (inp == NULL) panic("ipsec4_delete_pcbpolicy: NULL pointer was passed.\n"); if (inp->inp_sp == NULL) return 0; if (inp->inp_sp->sp_in != NULL) { key_freesp(inp->inp_sp->sp_in); inp->inp_sp->sp_in = NULL; } if (inp->inp_sp->sp_out != NULL) { key_freesp(inp->inp_sp->sp_out); inp->inp_sp->sp_out = NULL; } ipsec_delpcbpolicy(inp->inp_sp); inp->inp_sp = NULL; return 0; } #ifdef INET6 int ipsec6_set_policy(in6p, optname, request, len, priv) struct in6pcb *in6p; int optname; caddr_t request; size_t len; int priv; { struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec6_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, priv); } int ipsec6_get_policy(in6p, request, len, mp) struct in6pcb *in6p; caddr_t request; size_t len; struct mbuf **mp; { struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL || mp == NULL) return EINVAL; if (in6p->in6p_sp == NULL) panic("policy in PCB is NULL\n"); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec6_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } int ipsec6_delete_pcbpolicy(in6p) struct in6pcb *in6p; { /* sanity check. */ if (in6p == NULL) panic("ipsec6_delete_pcbpolicy: NULL pointer was passed.\n"); if (in6p->in6p_sp == NULL) return 0; if (in6p->in6p_sp->sp_in != NULL) { key_freesp(in6p->in6p_sp->sp_in); in6p->in6p_sp->sp_in = NULL; } if (in6p->in6p_sp->sp_out != NULL) { key_freesp(in6p->in6p_sp->sp_out); in6p->in6p_sp->sp_out = NULL; } ipsec_delpcbpolicy(in6p->in6p_sp); in6p->in6p_sp = NULL; return 0; } #endif /* * return current level. * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned. 
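 *
 * (Resolution, summarized for clarity: DEFAULT maps to the per-AF,
 * per-protocol sysctl default (sanitized by IPSEC_CHECK_DEFAULT),
 * except that IPCOMP always yields USE; USE and REQUIRE pass through
 * unchanged; UNIQUE is reported as REQUIRE.)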
*/ u_int ipsec_get_reqlevel(isr) struct ipsecrequest *isr; { u_int level = 0; u_int esp_trans_deflev, esp_net_deflev, ah_trans_deflev, ah_net_deflev; /* sanity check */ if (isr == NULL || isr->sp == NULL) panic("ipsec_get_reqlevel: NULL pointer is passed.\n"); if (((struct sockaddr *)&isr->sp->spidx.src)->sa_family != ((struct sockaddr *)&isr->sp->spidx.dst)->sa_family) panic("ipsec_get_reqlevel: family mismatched.\n"); /* XXX note that we have ipseclog() expanded here - code sync issue */ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) \ ? (ipsec_debug \ ? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ : 0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) /* set default level */ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_net_deflev); break; #endif /* INET6 */ default: panic("key_get_reqlevel: Unknown family. %d\n", ((struct sockaddr *)&isr->sp->spidx.src)->sa_family); } #undef IPSEC_CHECK_DEFAULT /* set level */ switch (isr->level) { case IPSEC_LEVEL_DEFAULT: switch (isr->saidx.proto) { case IPPROTO_ESP: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = esp_net_deflev; else level = esp_trans_deflev; break; case IPPROTO_AH: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = ah_net_deflev; else level = ah_trans_deflev; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets */ level = IPSEC_LEVEL_USE; break; default: panic("ipsec_get_reqlevel: " "Illegal protocol defined %u\n", isr->saidx.proto); } break; case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: level = isr->level; break; case IPSEC_LEVEL_UNIQUE: level = IPSEC_LEVEL_REQUIRE; break; default: panic("ipsec_get_reqlevel: Illegal IPsec level %u\n", isr->level); } return level; } /* * Check AH/ESP integrity. * OUT: * 0: valid * 1: invalid */ static int ipsec_in_reject(sp, m) struct secpolicy *sp; struct mbuf *m; { struct ipsecrequest *isr; u_int level; int need_auth, need_conf, need_icv; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec_in_reject: using SP\n"); kdebug_secpolicy(sp)); /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: return 1; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; case IPSEC_POLICY_IPSEC: break; case IPSEC_POLICY_ENTRUST: default: panic("ipsec_hdrsiz: Invalid policy found. 
%d\n", sp->policy); } need_auth = 0; need_conf = 0; need_icv = 0; /* XXX should compare policy against ipsec header history */ for (isr = sp->req; isr != NULL; isr = isr->next) { /* get current level */ level = ipsec_get_reqlevel(isr); switch (isr->saidx.proto) { case IPPROTO_ESP: if (level == IPSEC_LEVEL_REQUIRE) { need_conf++; if (isr->sav != NULL && isr->sav->flags == SADB_X_EXT_NONE && isr->sav->alg_auth != SADB_AALG_NONE) need_icv++; } break; case IPPROTO_AH: if (level == IPSEC_LEVEL_REQUIRE) { need_auth++; need_icv++; } break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets, IPComp policy * should always be treated as being in "use" level. */ break; } } KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_in_reject: auth:%d conf:%d icv:%d m_flags:%x\n", need_auth, need_conf, need_icv, m->m_flags)); if ((need_conf && !(m->m_flags & M_DECRYPTED)) || (!need_auth && need_icv && !(m->m_flags & M_AUTHIPDGM)) || (need_auth && !(m->m_flags & M_AUTHIPHDR))) return 1; return 0; } /* * Check AH/ESP integrity. * This function is called from tcp_input(), udp_input(), * and {ah,esp}4_input for tunnel mode */ int ipsec4_in_reject_so(m, so) struct mbuf *m; struct socket *so; { struct secpolicy *sp = NULL; int error; int result; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec4_getpolicybyaddr() with IP_FORWARDING flag. */ if (so == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec4_getpolicybysock(m, IPSEC_DIR_INBOUND, so, &error); if (sp == NULL) return 0; /* XXX should be panic ? * -> No, there may be error. */ result = ipsec_in_reject(sp, m); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_in_reject_so call free SP:%p\n", sp)); key_freesp(sp); return result; } int ipsec4_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { if (inp == NULL) return ipsec4_in_reject_so(m, NULL); if (inp->inp_socket) return ipsec4_in_reject_so(m, inp->inp_socket); else panic("ipsec4_in_reject: invalid inpcb/socket"); } #ifdef INET6 /* * Check AH/ESP integrity. * This function is called from tcp6_input(), udp6_input(), * and {ah,esp}6_input for tunnel mode */ int ipsec6_in_reject_so(m, so) struct mbuf *m; struct socket *so; { struct secpolicy *sp = NULL; int error; int result; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec6_getpolicybyaddr() with IP_FORWARDING flag. */ if (so == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec6_getpolicybysock(m, IPSEC_DIR_INBOUND, so, &error); if (sp == NULL) return 0; /* XXX should be panic ? */ result = ipsec_in_reject(sp, m); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_in_reject_so call free SP:%p\n", sp)); key_freesp(sp); return result; } int ipsec6_in_reject(m, in6p) struct mbuf *m; struct in6pcb *in6p; { if (in6p == NULL) return ipsec6_in_reject_so(m, NULL); if (in6p->in6p_socket) return ipsec6_in_reject_so(m, in6p->in6p_socket); else panic("ipsec6_in_reject: invalid in6p/socket"); } #endif /* * compute the byte size to be occupied by IPsec header. * in case it is tunneled, it includes the size of outer IP header. * NOTE: SP passed is free in this function. 
*/ static size_t ipsec_hdrsiz(sp) struct secpolicy *sp; { struct ipsecrequest *isr; size_t siz, clen; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec_hdrsiz: using SP\n"); kdebug_secpolicy(sp)); /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; case IPSEC_POLICY_IPSEC: break; case IPSEC_POLICY_ENTRUST: default: panic("ipsec_hdrsiz: Invalid policy found. %d\n", sp->policy); } siz = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { clen = 0; switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP clen = esp_hdrsiz(isr); #else clen = 0; /* XXX */ #endif break; case IPPROTO_AH: clen = ah_hdrsiz(isr); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (((struct sockaddr *)&isr->saidx.dst)->sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: ipseclog((LOG_ERR, "ipsec_hdrsiz: " "unknown AF %d in IPsec tunnel SA\n", ((struct sockaddr *)&isr->saidx.dst)->sa_family)); break; } } siz += clen; } return siz; } /* This function is called from ip_forward() and ipsec4_hdrsize_tcp(). */ size_t ipsec4_hdrsiz(m, dir, inp) struct mbuf *m; u_int dir; struct inpcb *inp; { struct secpolicy *sp = NULL; int error; size_t size; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ if (inp != NULL && inp->inp_socket == NULL) panic("ipsec4_hdrsize: why is socket NULL but there is PCB."); /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec4_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec4_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec4_getpolicybysock(m, dir, inp->inp_socket, &error); if (sp == NULL) return 0; /* XXX should be panic ? */ size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_hdrsiz call free SP:%p\n", sp)); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec4_hdrsiz: size:%lu.\n", (unsigned long)size)); key_freesp(sp); return size; } #ifdef INET6 /* This function is called from ipsec6_hdrsize_tcp(), * and maybe from ip6_forward.() */ size_t ipsec6_hdrsiz(m, dir, in6p) struct mbuf *m; u_int dir; struct in6pcb *in6p; { struct secpolicy *sp = NULL; int error; size_t size; /* sanity check */ if (m == NULL) return 0; /* XXX shoud be panic ? */ if (in6p != NULL && in6p->in6p_socket == NULL) panic("ipsec6_hdrsize: why is socket NULL but there is PCB."); /* get SP for this packet */ /* XXX Is it right to call with IP_FORWARDING. */ if (in6p == NULL) sp = ipsec6_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec6_getpolicybysock(m, dir, in6p->in6p_socket, &error); if (sp == NULL) return 0; size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_hdrsiz call free SP:%p\n", sp)); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_hdrsiz: size:%lu.\n", (unsigned long)size)); key_freesp(sp); return size; } #endif /* INET6 */ #ifdef INET /* * encapsulate for ipsec tunnel. * ip->ip_src must be fixed later on. */ static int ipsec4_encapsulate(m, sav) struct mbuf *m; struct secasvar *sav; { struct ip *oip; struct ip *ip; size_t hlen; size_t plen; /* can't tunnel between different AFs */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET) { m_freem(m); return EINVAL; } #if 0 /* XXX if the dst is myself, perform nothing. 
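A worked instance of the accumulation in ipsec_hdrsiz(). The per-SA numbers are assumptions for a DES-CBC ESP SA with HMAC-SHA1-96 and an HMAC-SHA1-96 AH SA, since the real esp_hdrsiz() and ah_hdrsiz() depend on the negotiated algorithms; only the outer-header sizes (20 bytes for IPv4, 40 for IPv6) are fixed by the address family switch above.

#include <stdio.h>

int
main(void)
{
	size_t siz = 0;

	/* request 1: "esp/tunnel/A-B/require" over IPv4 (assumed SA:
	 * SPI+seq 8, IV 8, pad trailer 2, truncated ICV 12) */
	siz += 8 + 8 + 2 + 12;	/* what esp_hdrsiz() might return */
	siz += 20;		/* sizeof(struct ip): tunnel adds outer header */

	/* request 2: "ah/transport//require" (fixed AH header + 96-bit ICV) */
	siz += 12 + 12;

	printf("reserve %zu bytes of IPsec header growth\n", siz);
	return 0;
}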
*/ if (key_ismyaddr((struct sockaddr *)&sav->sah->saidx.dst)) { m_freem(m); return EINVAL; } #endif if (m->m_len < sizeof(*ip)) panic("ipsec4_encapsulate: assumption failed (first mbuf length)"); ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif if (m->m_len != hlen) panic("ipsec4_encapsulate: assumption failed (first mbuf length)"); /* generate header checksum */ ip->ip_sum = 0; #ifdef _IP_VHL if (ip->ip_vhl == IP_VHL_BORING) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); #else ip->ip_sum = in_cksum(m, hlen); #endif plen = m->m_pkthdr.len; /* * grow the mbuf to accomodate the new IPv4 header. * NOTE: IPv4 options will never be copied. */ if (M_LEADINGSPACE(m->m_next) < hlen) { struct mbuf *n; MGET(n, M_DONTWAIT, MT_DATA); if (!n) { m_freem(m); return ENOBUFS; } n->m_len = hlen; n->m_next = m->m_next; m->m_next = n; m->m_pkthdr.len += hlen; oip = mtod(n, struct ip *); } else { m->m_next->m_len += hlen; m->m_next->m_data -= hlen; m->m_pkthdr.len += hlen; oip = mtod(m->m_next, struct ip *); } ip = mtod(m, struct ip *); ovbcopy((caddr_t)ip, (caddr_t)oip, hlen); m->m_len = sizeof(struct ip); m->m_pkthdr.len -= (hlen - sizeof(struct ip)); /* construct new IPv4 header. see RFC 2401 5.1.2.1 */ /* ECN consideration. */ ip_ecn_ingress(ip4_ipsec_ecn, &ip->ip_tos, &oip->ip_tos); #ifdef _IP_VHL ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); #else ip->ip_hl = sizeof(struct ip) >> 2; #endif ip->ip_off &= htons(~IP_OFFMASK); ip->ip_off &= htons(~IP_MF); switch (ip4_ipsec_dfbit) { case 0: /* clear DF bit */ ip->ip_off &= htons(~IP_DF); break; case 1: /* set DF bit */ ip->ip_off |= htons(IP_DF); break; default: /* copy DF bit */ break; } ip->ip_p = IPPROTO_IPIP; if (plen + sizeof(struct ip) < IP_MAXPACKET) ip->ip_len = htons(plen + sizeof(struct ip)); else { ipseclog((LOG_ERR, "IPv4 ipsec: size exceeds limit: " "leave ip_len as is (invalid packet)\n")); } #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif bcopy(&((struct sockaddr_in *)&sav->sah->saidx.src)->sin_addr, &ip->ip_src, sizeof(ip->ip_src)); bcopy(&((struct sockaddr_in *)&sav->sah->saidx.dst)->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_ttl = IPDEFTTL; /* XXX Should ip_src be updated later ? */ return 0; } #endif /* INET */ #ifdef INET6 static int ipsec6_encapsulate(m, sav) struct mbuf *m; struct secasvar *sav; { struct ip6_hdr *oip6; struct ip6_hdr *ip6; size_t plen; /* can't tunnel between different AFs */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET6) { m_freem(m); return EINVAL; } #if 0 /* XXX if the dst is myself, perform nothing. */ if (key_ismyaddr((struct sockaddr *)&sav->sah->saidx.dst)) { m_freem(m); return EINVAL; } #endif plen = m->m_pkthdr.len; /* * grow the mbuf to accomodate the new IPv6 header. 
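Both encapsulate routines share the make-room step that precedes the header copy: if the payload mbuf has leading space, back its data pointer up; otherwise splice in a fresh mbuf. Isolated below as a sketch (kernel mbuf(9) context with this file's includes is assumed; the helper name is invented, and the header copy and field rewriting that follow above are deliberately left out).

/* sketch only: make hlen bytes available in front of the payload,
 * assuming the IP header has already been split into its own mbuf */
static struct mbuf *
make_room_for_old_header(struct mbuf *m, int hlen)
{
	struct mbuf *n;

	if (M_LEADINGSPACE(m->m_next) >= hlen) {
		/* cheap case: room before the payload's data */
		m->m_next->m_data -= hlen;
		m->m_next->m_len += hlen;
	} else {
		/* no room: link a fresh mbuf after the header mbuf */
		MGET(n, M_DONTWAIT, MT_DATA);
		if (n == NULL)
			return NULL;	/* caller frees the chain */
		n->m_len = hlen;
		n->m_next = m->m_next;
		m->m_next = n;
	}
	m->m_pkthdr.len += hlen;
	return m;
}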
*/ if (m->m_len != sizeof(struct ip6_hdr)) panic("ipsec6_encapsulate: assumption failed (first mbuf length)"); if (M_LEADINGSPACE(m->m_next) < sizeof(struct ip6_hdr)) { struct mbuf *n; MGET(n, M_DONTWAIT, MT_DATA); if (!n) { m_freem(m); return ENOBUFS; } n->m_len = sizeof(struct ip6_hdr); n->m_next = m->m_next; m->m_next = n; m->m_pkthdr.len += sizeof(struct ip6_hdr); oip6 = mtod(n, struct ip6_hdr *); } else { m->m_next->m_len += sizeof(struct ip6_hdr); m->m_next->m_data -= sizeof(struct ip6_hdr); m->m_pkthdr.len += sizeof(struct ip6_hdr); oip6 = mtod(m->m_next, struct ip6_hdr *); } ip6 = mtod(m, struct ip6_hdr *); ovbcopy((caddr_t)ip6, (caddr_t)oip6, sizeof(struct ip6_hdr)); /* Fake link-local scope-class addresses */ if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src)) oip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst)) oip6->ip6_dst.s6_addr16[1] = 0; /* construct new IPv6 header. see RFC 2401 5.1.2.2 */ /* ECN consideration. */ ip6_ecn_ingress(ip6_ipsec_ecn, &ip6->ip6_flow, &oip6->ip6_flow); if (plen < IPV6_MAXPACKET - sizeof(struct ip6_hdr)) ip6->ip6_plen = htons(plen); else { /* ip6->ip6_plen will be updated in ip6_output() */ } ip6->ip6_nxt = IPPROTO_IPV6; bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.src)->sin6_addr, &ip6->ip6_src, sizeof(ip6->ip6_src)); bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.dst)->sin6_addr, &ip6->ip6_dst, sizeof(ip6->ip6_dst)); ip6->ip6_hlim = IPV6_DEFHLIM; /* XXX Should ip6_src be updated later ? */ return 0; } #endif /* INET6 */ /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * based on RFC 2401. */ int ipsec_chkreplay(seq, sav) u_int32_t seq; struct secasvar *sav; { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ /* sanity check */ if (sav == NULL) panic("ipsec_chkreplay: NULL pointer was passed.\n"); replay = sav->replay; if (replay->wsize == 0) return 1; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 0; /* first time is always okay */ if (replay->count == 0) return 1; if (seq > replay->lastseq) { /* larger sequences are okay */ return 1; } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 0; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 0; /* out of order but good */ return 1; } } /* * check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(seq, sav) u_int32_t seq; struct secasvar *sav; { struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ /* sanity check */ if (sav == NULL) panic("ipsec_chkreplay: NULL pointer was passed.\n"); replay = sav->replay; if (replay->wsize == 0) goto ok; /* no need to check replay. 
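A standalone worked example of the window check in ipsec_chkreplay() above. A 4-byte bitmap covers the 32 sequence numbers ending at lastseq; anything older is rejected outright, anything newer is accepted (and would slide the window in ipsec_updatereplay()). The structure and setup below are a simplified model, not the kernel's secreplay.

#include <stdio.h>
#include <string.h>

#define WSIZE 4		/* bytes of bitmap; 32-bit window, as IPSEC_REPLAYWSIZE */

struct replay {
	unsigned int lastseq;
	unsigned char bitmap[WSIZE];
	unsigned int count;
};

static int
chkreplay(unsigned int seq, struct replay *rp)
{
	unsigned int wsizeb = WSIZE << 3, diff;
	int fr, frlast = WSIZE - 1;

	if (seq == 0)
		return 0;			/* sequence 0 is never valid */
	if (rp->count == 0 || seq > rp->lastseq)
		return 1;			/* first packet, or newer */
	diff = rp->lastseq - seq;
	if (diff >= wsizeb)
		return 0;			/* left of the window: too old */
	fr = frlast - diff / 8;
	return !(rp->bitmap[fr] & (1 << (diff % 8)));	/* 0 if already seen */
}

int
main(void)
{
	struct replay rp;

	memset(&rp, 0, sizeof(rp));
	rp.lastseq = 100;	/* pretend updatereplay() accepted seq 100 */
	rp.bitmap[WSIZE - 1] = 1;
	rp.count = 1;

	printf("seq 101: %d\n", chkreplay(101, &rp));	/* 1: newer than window */
	printf("seq 100: %d\n", chkreplay(100, &rp));	/* 0: replayed */
	printf("seq  99: %d\n", chkreplay(99, &rp));	/* 1: in window, unseen */
	printf("seq  68: %d\n", chkreplay(68, &rp));	/* 0: too old, diff >= 32 */
	return 0;
}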
*/ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 1; /* first time */ if (replay->count == 0) { replay->lastseq = seq; bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* new larger sequence number */ if (diff < wsizeb) { /* In window */ /* set bit for this packet */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* this packet has a "way larger" */ bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* larger is good */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 1; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 1; /* mark as seen */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* out of order but good */ } ok: if (replay->count == ~0) { /* set overflow flag */ replay->overflow++; /* don't increment, no more packets accepted */ if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) return 1; ipseclog((LOG_WARNING, "replay counter made %d cycle. %s\n", replay->overflow, ipsec_logsastr(sav))); } replay->count++; return 0; } /* * shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). */ static void vshiftl(bitmap, nbit, wsize) unsigned char *bitmap; int nbit, wsize; { int s, j, i; unsigned char over; for (j = 0; j < nbit; j += 8) { s = (nbit - j < 8) ? (nbit - j): 8; bitmap[0] <<= s; for (i = 1; i < wsize; i++) { over = (bitmap[i] >> (8 - s)); bitmap[i] <<= s; bitmap[i-1] |= over; } } return; } const char * ipsec4_logpacketstr(ip, spi) struct ip *ip; u_int32_t spi; { static char buf[256]; char *p; u_int8_t *s, *d; s = (u_int8_t *)(&ip->ip_src); d = (u_int8_t *)(&ip->ip_dst); p = buf; snprintf(buf, sizeof(buf), "packet(SPI=%u ", (u_int32_t)ntohl(spi)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), "src=%u.%u.%u.%u", s[0], s[1], s[2], s[3]); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), " dst=%u.%u.%u.%u", d[0], d[1], d[2], d[3]); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), ")"); return buf; } #ifdef INET6 const char * ipsec6_logpacketstr(ip6, spi) struct ip6_hdr *ip6; u_int32_t spi; { static char buf[256]; char *p; p = buf; snprintf(buf, sizeof(buf), "packet(SPI=%u ", (u_int32_t)ntohl(spi)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), "src=%s", ip6_sprintf(&ip6->ip6_src)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), " dst=%s", ip6_sprintf(&ip6->ip6_dst)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), ")"); return buf; } #endif /* INET6 */ const char * ipsec_logsastr(sav) struct secasvar *sav; { static char buf[256]; char *p; struct secasindex *saidx = &sav->sah->saidx; /* validity check */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family) panic("ipsec_logsastr: family mismatched.\n"); p = buf; snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi)); while (p && *p) p++; if (((struct sockaddr *)&saidx->src)->sa_family == AF_INET) { u_int8_t *s, *d; s = (u_int8_t *)&((struct sockaddr_in *)&saidx->src)->sin_addr; d = (u_int8_t *)&((struct sockaddr_in *)&saidx->dst)->sin_addr; snprintf(p, sizeof(buf) - (p - buf), 
"src=%d.%d.%d.%d dst=%d.%d.%d.%d", s[0], s[1], s[2], s[3], d[0], d[1], d[2], d[3]); } #ifdef INET6 else if (((struct sockaddr *)&saidx->src)->sa_family == AF_INET6) { snprintf(p, sizeof(buf) - (p - buf), "src=%s", ip6_sprintf(&((struct sockaddr_in6 *)&saidx->src)->sin6_addr)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), " dst=%s", ip6_sprintf(&((struct sockaddr_in6 *)&saidx->dst)->sin6_addr)); } #endif while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), ")"); return buf; } void ipsec_dumpmbuf(m) struct mbuf *m; { int totlen; int i; u_char *p; totlen = 0; printf("---\n"); while (m) { p = mtod(m, u_char *); for (i = 0; i < m->m_len; i++) { printf("%02x ", p[i]); totlen++; if (totlen % 16 == 0) printf("\n"); } m = m->m_next; } if (totlen % 16 != 0) printf("\n"); printf("---\n"); } #ifdef INET /* * IPsec output logic for IPv4. */ int ipsec4_output(state, sp, flags) struct ipsec_output_state *state; struct secpolicy *sp; int flags; { struct ip *ip = NULL; struct ipsecrequest *isr = NULL; struct secasindex saidx; int s; int error; struct sockaddr_in *dst4; struct sockaddr_in *sin; if (!state) panic("state == NULL in ipsec4_output"); if (!state->m) panic("state->m == NULL in ipsec4_output"); if (!state->ro) panic("state->ro == NULL in ipsec4_output"); if (!state->dst) panic("state->dst == NULL in ipsec4_output"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec4_output: applyed SP\n"); kdebug_secpolicy(sp)); for (isr = sp->req; isr != NULL; isr = isr->next) { #if 0 /* give up to check restriction of transport mode */ /* XXX but should be checked somewhere */ /* * some of the IPsec operation must be performed only in * originating case. */ if (isr->saidx.mode == IPSEC_MODE_TRANSPORT && (flags & IP_FORWARDING)) continue; #endif /* make SA index for search proper SA */ ip = mtod(state->m, struct ip *); bcopy(&isr->saidx, &saidx, sizeof(saidx)); saidx.mode = isr->saidx.mode; saidx.reqid = isr->saidx.reqid; sin = (struct sockaddr_in *)&saidx.src; if (sin->sin_len == 0) { sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_port = IPSEC_PORT_ANY; bcopy(&ip->ip_src, &sin->sin_addr, sizeof(sin->sin_addr)); } sin = (struct sockaddr_in *)&saidx.dst; if (sin->sin_len == 0) { sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_port = IPSEC_PORT_ANY; bcopy(&ip->ip_dst, &sin->sin_addr, sizeof(sin->sin_addr)); } if ((error = key_checkrequest(isr, &saidx)) != 0) { /* * IPsec processing is required, but no SA found. * I assume that key_acquire() had been called * to get/establish the SA. Here I discard * this packet because it is responsibility for * upper layer to retransmit the packet. */ ipsecstat.out_nosa++; goto bad; } /* validity check */ if (isr->sav == NULL) { switch (ipsec_get_reqlevel(isr)) { case IPSEC_LEVEL_USE: continue; case IPSEC_LEVEL_REQUIRE: /* must be not reached here. */ panic("ipsec4_output: no SA found, but required."); } } /* * If there is no valid SA, we give up to process any * more. In such a case, the SA's status is changed * from DYING to DEAD after allocating. If a packet * send to the receiver by dead SA, the receiver can * not decode a packet because SA has been dead. */ if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) { ipsecstat.out_nosa++; error = EINVAL; goto bad; } /* * There may be the case that SA status will be changed when * we are refering to one. So calling splsoftnet(). */ s = splnet(); if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* * build IPsec tunnel. 
*/ /* XXX should be processed with other familiy */ if (((struct sockaddr *)&isr->sav->sah->saidx.src)->sa_family != AF_INET) { ipseclog((LOG_ERR, "ipsec4_output: " "family mismatched between inner and outer spi=%u\n", (u_int32_t)ntohl(isr->sav->spi))); splx(s); error = EAFNOSUPPORT; goto bad; } state->m = ipsec4_splithdr(state->m); if (!state->m) { splx(s); error = ENOMEM; goto bad; } error = ipsec4_encapsulate(state->m, isr->sav); splx(s); if (error) { state->m = NULL; goto bad; } ip = mtod(state->m, struct ip *); state->ro = &isr->sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst4 = (struct sockaddr_in *)state->dst; if (state->ro->ro_rt && ((state->ro->ro_rt->rt_flags & RTF_UP) == 0 || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(state->ro->ro_rt); state->ro->ro_rt = NULL; } if (state->ro->ro_rt == 0) { dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; rtalloc(state->ro); } if (state->ro->ro_rt == 0) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } /* adjust state->dst if tunnel endpoint is offlink */ if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) { state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst4 = (struct sockaddr_in *)state->dst; } } else splx(s); state->m = ipsec4_splithdr(state->m); if (!state->m) { error = ENOMEM; goto bad; } switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP if ((error = esp4_output(state->m, isr)) != 0) { state->m = NULL; goto bad; } break; #else m_freem(state->m); state->m = NULL; error = EINVAL; goto bad; #endif case IPPROTO_AH: if ((error = ah4_output(state->m, isr)) != 0) { state->m = NULL; goto bad; } break; case IPPROTO_IPCOMP: if ((error = ipcomp4_output(state->m, isr)) != 0) { state->m = NULL; goto bad; } break; default: ipseclog((LOG_ERR, "ipsec4_output: unknown ipsec protocol %d\n", isr->saidx.proto)); m_freem(state->m); state->m = NULL; error = EINVAL; goto bad; } if (state->m == 0) { error = ENOMEM; goto bad; } ip = mtod(state->m, struct ip *); } return 0; bad: m_freem(state->m); state->m = NULL; return error; } #endif #ifdef INET6 /* * IPsec output logic for IPv6, transport mode. 
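The tunnel branch above also shows the discipline for the per-SA cached route (sav->sah->sa_route): keep the cache only while the route is still up and still points at the current tunnel endpoint, otherwise free it and look up again. The same pattern isolated as a sketch; kernel route(9) context with this file's includes is assumed, and the helper name is invented. The RTF_GATEWAY adjustment above is a separate step: when the endpoint is off-link, state->dst must point at the gateway rather than the tunnel exit.

/* sketch only: validate-or-refresh a cached route toward dst */
static int
tunnel_route(struct route *ro, struct in_addr dst)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)&ro->ro_dst;

	/* drop a cached entry that went down or points elsewhere */
	if (ro->ro_rt != NULL &&
	    ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
	    sin->sin_addr.s_addr != dst.s_addr)) {
		RTFREE(ro->ro_rt);
		ro->ro_rt = NULL;
	}
	if (ro->ro_rt == NULL) {
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = dst;
		rtalloc(ro);		/* may still fail */
	}
	return (ro->ro_rt == NULL) ? EHOSTUNREACH : 0;
}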
*/ int ipsec6_output_trans(state, nexthdrp, mprev, sp, flags, tun) struct ipsec_output_state *state; u_char *nexthdrp; struct mbuf *mprev; struct secpolicy *sp; int flags; int *tun; { struct ip6_hdr *ip6; struct ipsecrequest *isr = NULL; struct secasindex saidx; int error = 0; int plen; struct sockaddr_in6 *sin6; if (!state) panic("state == NULL in ipsec6_output_trans"); if (!state->m) panic("state->m == NULL in ipsec6_output_trans"); if (!nexthdrp) panic("nexthdrp == NULL in ipsec6_output_trans"); if (!mprev) panic("mprev == NULL in ipsec6_output_trans"); if (!sp) panic("sp == NULL in ipsec6_output_trans"); if (!tun) panic("tun == NULL in ipsec6_output_trans"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_trans: applyed SP\n"); kdebug_secpolicy(sp)); *tun = 0; for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* the rest will be handled by ipsec6_output_tunnel() */ break; } /* make SA index for search proper SA */ ip6 = mtod(state->m, struct ip6_hdr *); bcopy(&isr->saidx, &saidx, sizeof(saidx)); saidx.mode = isr->saidx.mode; saidx.reqid = isr->saidx.reqid; sin6 = (struct sockaddr_in6 *)&saidx.src; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } } sin6 = (struct sockaddr_in6 *)&saidx.dst; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } } if (key_checkrequest(isr, &saidx) == ENOENT) { /* * IPsec processing is required, but no SA found. * I assume that key_acquire() had been called * to get/establish the SA. Here I discard * this packet because it is responsibility for * upper layer to retransmit the packet. */ ipsec6stat.out_nosa++; error = ENOENT; /* * Notify the fact that the packet is discarded * to ourselves. I believe this is better than * just silently discarding. (jinmei@kame.net) * XXX: should we restrict the error to TCP packets? * XXX: should we directly notify sockets via * pfctlinputs? */ icmp6_error(state->m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADMIN, 0); state->m = NULL; /* icmp6_error freed the mbuf */ goto bad; } /* validity check */ if (isr->sav == NULL) { switch (ipsec_get_reqlevel(isr)) { case IPSEC_LEVEL_USE: continue; case IPSEC_LEVEL_REQUIRE: /* must be not reached here. */ panic("ipsec6_output_trans: no SA found, but required."); } } /* * If there is no valid SA, we give up to process. * see same place at ipsec4_output(). 
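The s6_addr16[1] manipulation above is KAME's embedded scope-id convention: for link-local addresses the kernel stores the interface index in the second 16-bit word of the address itself, and SPD comparison wants that word cleared with the index moved to sin6_scope_id. A standalone model of the normalization; s6_addr16 is a kernel-only accessor in the BSD headers, so the demo open-codes it.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct sockaddr_in6 sin6;
	unsigned short *addr16;		/* stands in for s6_addr16 */
	char buf[INET6_ADDRSTRLEN];

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
	addr16 = (unsigned short *)&sin6.sin6_addr;
	addr16[1] = htons(3);	/* kernel-internal embedded form: if_index 3 */

	/* normalize before SPD comparison, as ipsec6_output_trans() does */
	sin6.sin6_scope_id = ntohs(addr16[1]);
	addr16[1] = 0;

	inet_ntop(AF_INET6, &sin6.sin6_addr, buf, sizeof(buf));
	printf("%s scope %u\n", buf, (unsigned)sin6.sin6_scope_id);
	return 0;		/* prints "fe80::1 scope 3" */
}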
*/ if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) { ipsec6stat.out_nosa++; error = EINVAL; goto bad; } switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP error = esp6_output(state->m, nexthdrp, mprev->m_next, isr); #else m_freem(state->m); error = EINVAL; #endif break; case IPPROTO_AH: error = ah6_output(state->m, nexthdrp, mprev->m_next, isr); break; case IPPROTO_IPCOMP: error = ipcomp6_output(state->m, nexthdrp, mprev->m_next, isr); break; default: ipseclog((LOG_ERR, "ipsec6_output_trans: " "unknown ipsec protocol %d\n", isr->saidx.proto)); m_freem(state->m); ipsec6stat.out_inval++; error = EINVAL; break; } if (error) { state->m = NULL; goto bad; } plen = state->m->m_pkthdr.len - sizeof(struct ip6_hdr); if (plen > IPV6_MAXPACKET) { ipseclog((LOG_ERR, "ipsec6_output_trans: " "IPsec with IPv6 jumbogram is not supported\n")); ipsec6stat.out_inval++; error = EINVAL; /* XXX */ goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); ip6->ip6_plen = htons(plen); } /* if we have more to go, we need a tunnel mode processing */ if (isr != NULL) *tun = 1; return 0; bad: m_freem(state->m); state->m = NULL; return error; } /* * IPsec output logic for IPv6, tunnel mode. */ int ipsec6_output_tunnel(state, sp, flags) struct ipsec_output_state *state; struct secpolicy *sp; int flags; { struct ip6_hdr *ip6; struct ipsecrequest *isr = NULL; struct secasindex saidx; int error = 0; int plen; struct sockaddr_in6* dst6; int s; if (!state) panic("state == NULL in ipsec6_output_tunnel"); if (!state->m) panic("state->m == NULL in ipsec6_output_tunnel"); if (!sp) panic("sp == NULL in ipsec6_output_tunnel"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_tunnel: applyed SP\n"); kdebug_secpolicy(sp)); /* * transport mode ipsec (before the 1st tunnel mode) is already * processed by ipsec6_output_trans(). */ for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) break; } for (/* already initialized */; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* When tunnel mode, SA peers must be specified. */ bcopy(&isr->saidx, &saidx, sizeof(saidx)); } else { /* make SA index to look for a proper SA */ struct sockaddr_in6 *sin6; bzero(&saidx, sizeof(saidx)); saidx.proto = isr->saidx.proto; saidx.mode = isr->saidx.mode; saidx.reqid = isr->saidx.reqid; ip6 = mtod(state->m, struct ip6_hdr *); sin6 = (struct sockaddr_in6 *)&saidx.src; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } } sin6 = (struct sockaddr_in6 *)&saidx.dst; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } } } if (key_checkrequest(isr, &saidx) == ENOENT) { /* * IPsec processing is required, but no SA found. * I assume that key_acquire() had been called * to get/establish the SA. Here I discard * this packet because it is responsibility for * upper layer to retransmit the packet. 
*/ ipsec6stat.out_nosa++; error = ENOENT; goto bad; } /* validity check */ if (isr->sav == NULL) { switch (ipsec_get_reqlevel(isr)) { case IPSEC_LEVEL_USE: continue; case IPSEC_LEVEL_REQUIRE: /* must be not reached here. */ panic("ipsec6_output_tunnel: no SA found, but required."); } } /* * If there is no valid SA, we give up to process. * see same place at ipsec4_output(). */ if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) { ipsec6stat.out_nosa++; error = EINVAL; goto bad; } /* * There may be the case that SA status will be changed when * we are refering to one. So calling splsoftnet(). */ s = splnet(); if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* * build IPsec tunnel. */ /* XXX should be processed with other familiy */ if (((struct sockaddr *)&isr->sav->sah->saidx.src)->sa_family != AF_INET6) { ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "family mismatched between inner and outer, spi=%u\n", (u_int32_t)ntohl(isr->sav->spi))); splx(s); ipsec6stat.out_inval++; error = EAFNOSUPPORT; goto bad; } state->m = ipsec6_splithdr(state->m); if (!state->m) { splx(s); ipsec6stat.out_nomem++; error = ENOMEM; goto bad; } error = ipsec6_encapsulate(state->m, isr->sav); splx(s); if (error) { state->m = 0; goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); state->ro = &isr->sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst6 = (struct sockaddr_in6 *)state->dst; if (state->ro->ro_rt && ((state->ro->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) { RTFREE(state->ro->ro_rt); state->ro->ro_rt = NULL; } if (state->ro->ro_rt == 0) { bzero(dst6, sizeof(*dst6)); dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; rtalloc(state->ro); } if (state->ro->ro_rt == 0) { ip6stat.ip6s_noroute++; ipsec6stat.out_noroute++; error = EHOSTUNREACH; goto bad; } /* adjust state->dst if tunnel endpoint is offlink */ if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) { state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst6 = (struct sockaddr_in6 *)state->dst; } } else splx(s); state->m = ipsec6_splithdr(state->m); if (!state->m) { ipsec6stat.out_nomem++; error = ENOMEM; goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP error = esp6_output(state->m, &ip6->ip6_nxt, state->m->m_next, isr); #else m_freem(state->m); error = EINVAL; #endif break; case IPPROTO_AH: error = ah6_output(state->m, &ip6->ip6_nxt, state->m->m_next, isr); break; case IPPROTO_IPCOMP: /* XXX code should be here */ /* FALLTHROUGH */ default: ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "unknown ipsec protocol %d\n", isr->saidx.proto)); m_freem(state->m); ipsec6stat.out_inval++; error = EINVAL; break; } if (error) { state->m = NULL; goto bad; } plen = state->m->m_pkthdr.len - sizeof(struct ip6_hdr); if (plen > IPV6_MAXPACKET) { ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "IPsec with IPv6 jumbogram is not supported\n")); ipsec6stat.out_inval++; error = EINVAL; /* XXX */ goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); ip6->ip6_plen = htons(plen); } return 0; bad: m_freem(state->m); state->m = NULL; return error; } #endif /* INET6 */ #ifdef INET /* * Chop IP header and option off from the payload. 
*/ static struct mbuf * ipsec4_splithdr(m) struct mbuf *m; { struct mbuf *mh; struct ip *ip; int hlen; if (m->m_len < sizeof(struct ip)) panic("ipsec4_splithdr: first mbuf too short"); ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif if (m->m_len > hlen) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (!mh) { m_freem(m); return NULL; } M_COPY_PKTHDR(mh, m); MH_ALIGN(mh, hlen); m->m_flags &= ~M_PKTHDR; m->m_len -= hlen; m->m_data += hlen; mh->m_next = m; m = mh; m->m_len = hlen; bcopy((caddr_t)ip, mtod(m, caddr_t), hlen); } else if (m->m_len < hlen) { m = m_pullup(m, hlen); if (!m) return NULL; } return m; } #endif #ifdef INET6 static struct mbuf * ipsec6_splithdr(m) struct mbuf *m; { struct mbuf *mh; struct ip6_hdr *ip6; int hlen; if (m->m_len < sizeof(struct ip6_hdr)) panic("ipsec6_splithdr: first mbuf too short"); ip6 = mtod(m, struct ip6_hdr *); hlen = sizeof(struct ip6_hdr); if (m->m_len > hlen) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (!mh) { m_freem(m); return NULL; } M_COPY_PKTHDR(mh, m); MH_ALIGN(mh, hlen); m->m_flags &= ~M_PKTHDR; m->m_len -= hlen; m->m_data += hlen; mh->m_next = m; m = mh; m->m_len = hlen; bcopy((caddr_t)ip6, mtod(m, caddr_t), hlen); } else if (m->m_len < hlen) { m = m_pullup(m, hlen); if (!m) return NULL; } return m; } #endif /* validate inbound IPsec tunnel packet. */ int ipsec4_tunnel_validate(m, off, nxt0, sav) struct mbuf *m; /* no pullup permitted, m->m_len >= ip */ int off; u_int nxt0; struct secasvar *sav; { u_int8_t nxt = nxt0 & 0xff; struct sockaddr_in *sin; struct sockaddr_in osrc, odst, isrc, idst; int hlen; struct secpolicy *sp; struct ip *oip; #ifdef DIAGNOSTIC if (m->m_len < sizeof(struct ip)) panic("too short mbuf on ipsec4_tunnel_validate"); #endif if (nxt != IPPROTO_IPV4) return 0; if (m->m_pkthdr.len < off + sizeof(struct ip)) return 0; /* do not decapsulate if the SA is for transport mode only */ if (sav->sah->saidx.mode == IPSEC_MODE_TRANSPORT) return 0; oip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(oip->ip_vhl) << 2; #else hlen = oip->ip_hl << 2; #endif if (hlen != sizeof(struct ip)) return 0; /* AF_INET6 should be supported, but at this moment we don't. */ sin = (struct sockaddr_in *)&sav->sah->saidx.dst; if (sin->sin_family != AF_INET) return 0; if (bcmp(&oip->ip_dst, &sin->sin_addr, sizeof(oip->ip_dst)) != 0) return 0; /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); bzero(&isrc, sizeof(isrc)); bzero(&idst, sizeof(idst)); osrc.sin_family = odst.sin_family = isrc.sin_family = idst.sin_family = AF_INET; osrc.sin_len = odst.sin_len = isrc.sin_len = idst.sin_len = sizeof(struct sockaddr_in); osrc.sin_addr = oip->ip_src; odst.sin_addr = oip->ip_dst; m_copydata(m, off + offsetof(struct ip, ip_src), sizeof(isrc.sin_addr), (caddr_t)&isrc.sin_addr); m_copydata(m, off + offsetof(struct ip, ip_dst), sizeof(idst.sin_addr), (caddr_t)&idst.sin_addr); /* * RFC2401 5.2.1 (b): (assume that we are using tunnel mode) * - if the inner destination is multicast address, there can be * multiple permissible inner source address. implementation * may want to skip verification of inner source address against * SPD selector. * - if the inner protocol is ICMP, the packet may be an error report * from routers on the other side of the VPN cloud (R in the * following diagram). in this case, we cannot verify inner source * address against SPD selector. 
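ipsec4_splithdr() and its IPv6 twin exist to establish a single invariant for the output routines: on return, the first mbuf holds exactly the IP header, so esp4_output() and friends can link a protocol header in between header and payload without copying the payload. A caller-side sketch, as it would look inside this file (the wrapper name is invented):

/*
 * before:  m -> [ IP hdr + payload ... ]
 * after:   m -> [ IP hdr ] -> [ payload ... ]
 */
static int
split_for_output(struct mbuf **mp, struct ip **ipp)
{
	*mp = ipsec4_splithdr(*mp);	/* frees the chain on failure */
	if (*mp == NULL)
		return ENOMEM;
	*ipp = mtod(*mp, struct ip *);	/* header alone in the first mbuf */
	return 0;
}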
* me -- gw === gw -- R -- you * * we consider the first bullet to be users responsibility on SPD entry * configuration (if you need to encrypt multicast traffic, set * the source range of SPD selector to 0.0.0.0/0, or have explicit * address ranges for possible senders). * the second bullet is not taken care of (yet). * * therefore, we do not do anything special about inner source. */ sp = key_gettunnel((struct sockaddr *)&osrc, (struct sockaddr *)&odst, (struct sockaddr *)&isrc, (struct sockaddr *)&idst); if (!sp) return 0; key_freesp(sp); return 1; } #ifdef INET6 /* validate inbound IPsec tunnel packet. */ int ipsec6_tunnel_validate(m, off, nxt0, sav) struct mbuf *m; /* no pullup permitted, m->m_len >= ip */ int off; u_int nxt0; struct secasvar *sav; { u_int8_t nxt = nxt0 & 0xff; struct sockaddr_in6 *sin6; struct sockaddr_in6 osrc, odst, isrc, idst; struct secpolicy *sp; struct ip6_hdr *oip6; #ifdef DIAGNOSTIC if (m->m_len < sizeof(struct ip6_hdr)) panic("too short mbuf on ipsec6_tunnel_validate"); #endif if (nxt != IPPROTO_IPV6) return 0; if (m->m_pkthdr.len < off + sizeof(struct ip6_hdr)) return 0; /* do not decapsulate if the SA is for transport mode only */ if (sav->sah->saidx.mode == IPSEC_MODE_TRANSPORT) return 0; oip6 = mtod(m, struct ip6_hdr *); /* AF_INET should be supported, but at this moment we don't. */ sin6 = (struct sockaddr_in6 *)&sav->sah->saidx.dst; if (sin6->sin6_family != AF_INET6) return 0; if (!IN6_ARE_ADDR_EQUAL(&oip6->ip6_dst, &sin6->sin6_addr)) return 0; /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); bzero(&isrc, sizeof(isrc)); bzero(&idst, sizeof(idst)); osrc.sin6_family = odst.sin6_family = isrc.sin6_family = idst.sin6_family = AF_INET6; osrc.sin6_len = odst.sin6_len = isrc.sin6_len = idst.sin6_len = sizeof(struct sockaddr_in6); osrc.sin6_addr = oip6->ip6_src; odst.sin6_addr = oip6->ip6_dst; m_copydata(m, off + offsetof(struct ip6_hdr, ip6_src), sizeof(isrc.sin6_addr), (caddr_t)&isrc.sin6_addr); m_copydata(m, off + offsetof(struct ip6_hdr, ip6_dst), sizeof(idst.sin6_addr), (caddr_t)&idst.sin6_addr); /* * regarding to inner source address validation, see a long comment * in ipsec4_tunnel_validate. */ sp = key_gettunnel((struct sockaddr *)&osrc, (struct sockaddr *)&odst, (struct sockaddr *)&isrc, (struct sockaddr *)&idst); /* * when there is no suitable inbound policy for the packet of the ipsec * tunnel mode, the kernel never decapsulate the tunneled packet * as the ipsec tunnel mode even when the system wide policy is "none". * then the kernel leaves the generic tunnel module to process this * packet. if there is no rule of the generic tunnel, the packet * is rejected and the statistics will be counted up. */ if (!sp) return 0; key_freesp(sp); return 1; } #endif /* * Make a mbuf chain for encryption. * If the original mbuf chain contains a mbuf with a cluster, * allocate a new cluster and copy the data to the new cluster. * XXX: this hack is inefficient, but is necessary to handle cases * of TCP retransmission... */ struct mbuf * ipsec_copypkt(m) struct mbuf *m; { struct mbuf *n, **mpp, *mnew; for (n = m, mpp = &m; n; n = n->m_next) { if (n->m_flags & M_EXT) { /* * Make a copy only if there are more than one * references to the cluster. * XXX: is this approach effective? 
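The test that resumes just below decides when a private copy is unavoidable: storage that is shared (extra references) or external but not an ordinary cluster cannot be encrypted in place, because TCP may retransmit from the very same cluster later. The same decision written as a small predicate; kernel context with this file's includes is assumed, and the function name is invented.

/* sketch only: must this mbuf's data be copied before in-place encryption? */
static int
must_copy_before_encrypt(struct mbuf *n)
{
	if ((n->m_flags & M_EXT) == 0)
		return 0;	/* data lives in the mbuf itself: private */
	if (n->m_ext.ext_type == EXT_CLUSTER && !MEXT_IS_REF(n))
		return 0;	/* ordinary, unshared cluster: safe in place */
	return 1;		/* shared or foreign external storage */
}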
*/ if (n->m_ext.ext_type != EXT_CLUSTER || MEXT_IS_REF(n)) { int remain, copied; struct mbuf *mm; if (n->m_flags & M_PKTHDR) { MGETHDR(mnew, M_DONTWAIT, MT_HEADER); if (mnew == NULL) goto fail; mnew->m_pkthdr = n->m_pkthdr; #if 0 if (n->m_pkthdr.aux) { mnew->m_pkthdr.aux = m_copym(n->m_pkthdr.aux, 0, M_COPYALL, M_DONTWAIT); } #endif M_COPY_PKTHDR(mnew, n); mnew->m_flags = n->m_flags & M_COPYFLAGS; } else { MGET(mnew, M_DONTWAIT, MT_DATA); if (mnew == NULL) goto fail; } mnew->m_len = 0; mm = mnew; /* * Copy data. If we don't have enough space to * store the whole data, allocate a cluster * or additional mbufs. * XXX: we don't use m_copyback(), since the * function does not use clusters and thus is * inefficient. */ remain = n->m_len; copied = 0; while (1) { int len; struct mbuf *mn; if (remain <= (mm->m_flags & M_PKTHDR ? MHLEN : MLEN)) len = remain; else { /* allocate a cluster */ MCLGET(mm, M_DONTWAIT); if (!(mm->m_flags & M_EXT)) { m_free(mm); goto fail; } len = remain < MCLBYTES ? remain : MCLBYTES; } bcopy(n->m_data + copied, mm->m_data, len); copied += len; remain -= len; mm->m_len = len; if (remain <= 0) /* completed? */ break; /* need another mbuf */ MGETHDR(mn, M_DONTWAIT, MT_HEADER); if (mn == NULL) goto fail; mn->m_pkthdr.rcvif = NULL; mm->m_next = mn; mm = mn; } /* adjust chain */ mm->m_next = m_free(n); n = mm; *mpp = mnew; mpp = &n->m_next; continue; } } *mpp = n; mpp = &n->m_next; } return(m); fail: m_freem(m); return(NULL); } -static struct mbuf * -ipsec_addaux(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = m_aux_find(m, AF_INET, IPPROTO_ESP); - if (!n) - n = m_aux_add(m, AF_INET, IPPROTO_ESP); - if (!n) - return n; /* ENOBUFS */ - n->m_len = sizeof(struct socket *); - bzero(mtod(n, void *), n->m_len); - return n; -} - -static struct mbuf * -ipsec_findaux(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = m_aux_find(m, AF_INET, IPPROTO_ESP); -#ifdef DIAGNOSTIC - if (n && n->m_len < sizeof(struct socket *)) - panic("invalid ipsec m_aux"); -#endif - return n; -} - void ipsec_delaux(m) struct mbuf *m; { - struct mbuf *n; + struct m_tag *tag; - n = m_aux_find(m, AF_INET, IPPROTO_ESP); - if (n) - m_aux_delete(m, n); + while ((tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, NULL)) != NULL) + m_tag_delete(m, tag); } -/* if the aux buffer is unnecessary, nuke it. 
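The hunk above is the core of this change: the per-packet IPsec history moves from a hand-managed aux mbuf to packet tags, one PACKET_TAG_IPSEC_HISTORY tag per AH/ESP layer, each prepended by ipsec_addhist(). A sketch of a consumer that wants the whole history rather than the single newest entry the reworked ipsec_gethist() returns; kernel m_tag(9) context is assumed, and the walk relies on m_tag_find() resuming its search after the tag passed as the third argument.

/* sketch only: count the ESP layers recorded on this packet */
static int
count_esp_layers(struct mbuf *m)
{
	struct m_tag *tag;
	struct ipsec_history *p;
	int n = 0;

	for (tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, NULL);
	    tag != NULL;
	    tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, tag)) {
		p = (struct ipsec_history *)(tag + 1);	/* payload follows */
		if (p->ih_proto == IPPROTO_ESP)
			n++;
	}
	return n;
}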
*/ -static void -ipsec_optaux(m, n) - struct mbuf *m; - struct mbuf *n; -{ - - if (!n) - return; - if (n->m_len == sizeof(struct socket *) && !*mtod(n, struct socket **)) - ipsec_delaux(m); -} - int -ipsec_setsocket(m, so) - struct mbuf *m; - struct socket *so; -{ - struct mbuf *n; - - /* if so == NULL, don't insist on getting the aux mbuf */ - if (so) { - n = ipsec_addaux(m); - if (!n) - return ENOBUFS; - } else - n = ipsec_findaux(m); - if (n && n->m_len >= sizeof(struct socket *)) - *mtod(n, struct socket **) = so; - ipsec_optaux(m, n); - return 0; -} - -struct socket * -ipsec_getsocket(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = ipsec_findaux(m); - if (n && n->m_len >= sizeof(struct socket *)) - return *mtod(n, struct socket **); - else - return NULL; -} - -int ipsec_addhist(m, proto, spi) struct mbuf *m; int proto; u_int32_t spi; { - struct mbuf *n; + struct m_tag *tag; struct ipsec_history *p; - n = ipsec_addaux(m); - if (!n) + tag = m_tag_get(PACKET_TAG_IPSEC_HISTORY, + sizeof (struct ipsec_history), M_NOWAIT); + if (tag == NULL) return ENOBUFS; - if (M_TRAILINGSPACE(n) < sizeof(*p)) - return ENOSPC; /* XXX */ - p = (struct ipsec_history *)(mtod(n, caddr_t) + n->m_len); - n->m_len += sizeof(*p); + p = (struct ipsec_history *)(tag+1); bzero(p, sizeof(*p)); p->ih_proto = proto; p->ih_spi = spi; + m_tag_prepend(m, tag); return 0; } struct ipsec_history * ipsec_gethist(m, lenp) struct mbuf *m; int *lenp; { - struct mbuf *n; - int l; + struct m_tag *tag; - n = ipsec_findaux(m); - if (!n) + tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, NULL); + if (tag == NULL) return NULL; - l = n->m_len; - if (sizeof(struct socket *) > l) - return NULL; - if ((l - sizeof(struct socket *)) % sizeof(struct ipsec_history)) - return NULL; - /* XXX does it make more sense to divide by sizeof(ipsec_history)? */ + /* XXX NB: noone uses this so fake it */ if (lenp) - *lenp = l - sizeof(struct socket *); - return (struct ipsec_history *) - (mtod(n, caddr_t) + sizeof(struct socket *)); -} - -void -ipsec_clearhist(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = ipsec_findaux(m); - if ((n) && n->m_len > sizeof(struct socket *)) - n->m_len = sizeof(struct socket *); - ipsec_optaux(m, n); + *lenp = sizeof (struct ipsec_history); + return ((struct ipsec_history *)(tag+1)); } Index: head/sys/netinet6/ipsec.h =================================================================== --- head/sys/netinet6/ipsec.h (revision 105193) +++ head/sys/netinet6/ipsec.h (revision 105194) @@ -1,354 +1,351 @@ /* $FreeBSD$ */ /* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #ifndef _NETINET6_IPSEC_H_ #define _NETINET6_IPSEC_H_ #if defined(_KERNEL) && !defined(_LKM) && !defined(KLD_MODULE) #include "opt_inet.h" #include "opt_ipsec.h" #endif #include #include #ifdef _KERNEL /* * Security Policy Index * Ensure that both address families in the "src" and "dst" are same. * When the value of the ul_proto is ICMPv6, the port field in "src" * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code. */ struct secpolicyindex { u_int8_t dir; /* direction of packet flow, see blow */ struct sockaddr_storage src; /* IP src address for SP */ struct sockaddr_storage dst; /* IP dst address for SP */ u_int8_t prefs; /* prefix length in bits for src */ u_int8_t prefd; /* prefix length in bits for dst */ u_int16_t ul_proto; /* upper layer Protocol */ #ifdef notyet uid_t uids; uid_t uidd; gid_t gids; gid_t gidd; #endif }; /* Security Policy Data Base */ struct secpolicy { LIST_ENTRY(secpolicy) chain; int refcnt; /* reference count */ struct secpolicyindex spidx; /* selector */ u_int32_t id; /* It's unique number on the system. */ u_int state; /* 0: dead, others: alive */ #define IPSEC_SPSTATE_DEAD 0 #define IPSEC_SPSTATE_ALIVE 1 u_int policy; /* DISCARD, NONE or IPSEC, see keyv2.h */ struct ipsecrequest *req; /* pointer to the ipsec request tree, */ /* if policy == IPSEC else this value == NULL.*/ /* * lifetime handler. * the policy can be used without limitiation if both lifetime and * validtime are zero. * "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime. * "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime. */ long created; /* time created the policy */ long lastused; /* updated every when kernel sends a packet */ long lifetime; /* duration of the lifetime of this policy */ long validtime; /* duration this policy is valid without use */ }; /* Request for IPsec */ struct ipsecrequest { struct ipsecrequest *next; /* pointer to next structure */ /* If NULL, it means the end of chain. */ struct secasindex saidx;/* hint for search proper SA */ /* if __ss_len == 0 then no address specified.*/ u_int level; /* IPsec level defined below. */ struct secasvar *sav; /* place holder of SA for use */ struct secpolicy *sp; /* back pointer to SP */ }; /* security policy in PCB */ struct inpcbpolicy { struct secpolicy *sp_in; struct secpolicy *sp_out; int priv; /* privileged socket ? */ }; /* SP acquiring list table. */ struct secspacq { LIST_ENTRY(secspacq) chain; struct secpolicyindex spidx; long created; /* for lifetime */ int count; /* for lifetime */ /* XXX: here is mbuf place holder to be sent ? */ }; #endif /* _KERNEL */ /* according to IANA assignment, port 0x0000 and proto 0xff are reserved. 
*/ #define IPSEC_PORT_ANY 0 #define IPSEC_ULPROTO_ANY 255 #define IPSEC_PROTO_ANY 255 /* mode of security protocol */ /* NOTE: DON'T use IPSEC_MODE_ANY at SPD. It's only use in SAD */ #define IPSEC_MODE_ANY 0 /* i.e. wildcard. */ #define IPSEC_MODE_TRANSPORT 1 #define IPSEC_MODE_TUNNEL 2 /* * Direction of security policy. * NOTE: Since INVALID is used just as flag. * The other are used for loop counter too. */ #define IPSEC_DIR_ANY 0 #define IPSEC_DIR_INBOUND 1 #define IPSEC_DIR_OUTBOUND 2 #define IPSEC_DIR_MAX 3 #define IPSEC_DIR_INVALID 4 /* Policy level */ /* * IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB, * DISCARD, IPSEC and NONE are allowed for setkey() in SPD. * DISCARD and NONE are allowed for system default. */ #define IPSEC_POLICY_DISCARD 0 /* discarding packet */ #define IPSEC_POLICY_NONE 1 /* through IPsec engine */ #define IPSEC_POLICY_IPSEC 2 /* do IPsec */ #define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */ #define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */ /* Security protocol level */ #define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */ #define IPSEC_LEVEL_USE 1 /* use SA if present. */ #define IPSEC_LEVEL_REQUIRE 2 /* require SA. */ #define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */ #define IPSEC_MANUAL_REQID_MAX 0x3fff /* * if security policy level == unique, this id * indicate to a relative SA for use, else is * zero. * 1 - 0x3fff are reserved for manual keying. * 0 are reserved for above reason. Others is * for kernel use. * Note that this id doesn't identify SA * by only itself. */ #define IPSEC_REPLAYWSIZE 32 /* statistics for ipsec processing */ struct ipsecstat { u_quad_t in_success; /* succeeded inbound process */ u_quad_t in_polvio; /* security policy violation for inbound process */ u_quad_t in_nosa; /* inbound SA is unavailable */ u_quad_t in_inval; /* inbound processing failed due to EINVAL */ u_quad_t in_nomem; /* inbound processing failed due to ENOBUFS */ u_quad_t in_badspi; /* failed getting a SPI */ u_quad_t in_ahreplay; /* AH replay check failed */ u_quad_t in_espreplay; /* ESP replay check failed */ u_quad_t in_ahauthsucc; /* AH authentication success */ u_quad_t in_ahauthfail; /* AH authentication failure */ u_quad_t in_espauthsucc; /* ESP authentication success */ u_quad_t in_espauthfail; /* ESP authentication failure */ u_quad_t in_esphist[256]; u_quad_t in_ahhist[256]; u_quad_t in_comphist[256]; u_quad_t out_success; /* succeeded outbound process */ u_quad_t out_polvio; /* security policy violation for outbound process */ u_quad_t out_nosa; /* outbound SA is unavailable */ u_quad_t out_inval; /* outbound process failed due to EINVAL */ u_quad_t out_nomem; /* inbound processing failed due to ENOBUFS */ u_quad_t out_noroute; /* there is no route */ u_quad_t out_esphist[256]; u_quad_t out_ahhist[256]; u_quad_t out_comphist[256]; }; /* * Definitions for IPsec & Key sysctl operations. 
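The ipsecstat structure above is exported read-only through the IPSECCTL_STATS entry defined just below. A userland sketch that reads it; the MIB string assumes FreeBSD's conventional net.inet.ipsec prefix, which is worth verifying against sysctl -a.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet6/ipsec.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct ipsecstat st;
	size_t len = sizeof(st);

	if (sysctlbyname("net.inet.ipsec.stats", &st, &len, NULL, 0) == -1)
		err(1, "sysctlbyname");	/* MIB name is an assumption */
	printf("in ok %llu, no SA %llu, policy violations %llu\n",
	    (unsigned long long)st.in_success,
	    (unsigned long long)st.in_nosa,
	    (unsigned long long)st.in_polvio);
	return 0;
}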
*/ /* * Names for IPsec & Key sysctl objects */ #define IPSECCTL_STATS 1 /* stats */ #define IPSECCTL_DEF_POLICY 2 #define IPSECCTL_DEF_ESP_TRANSLEV 3 /* int; ESP transport mode */ #define IPSECCTL_DEF_ESP_NETLEV 4 /* int; ESP tunnel mode */ #define IPSECCTL_DEF_AH_TRANSLEV 5 /* int; AH transport mode */ #define IPSECCTL_DEF_AH_NETLEV 6 /* int; AH tunnel mode */ #if 0 /* obsolete, do not reuse */ #define IPSECCTL_INBOUND_CALL_IKE 7 #endif #define IPSECCTL_AH_CLEARTOS 8 #define IPSECCTL_AH_OFFSETMASK 9 #define IPSECCTL_DFBIT 10 #define IPSECCTL_ECN 11 #define IPSECCTL_DEBUG 12 #define IPSECCTL_ESP_RANDPAD 13 #define IPSECCTL_MAXID 14 #define IPSECCTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { "def_policy", CTLTYPE_INT }, \ { "esp_trans_deflev", CTLTYPE_INT }, \ { "esp_net_deflev", CTLTYPE_INT }, \ { "ah_trans_deflev", CTLTYPE_INT }, \ { "ah_net_deflev", CTLTYPE_INT }, \ { 0, 0 }, \ { "ah_cleartos", CTLTYPE_INT }, \ { "ah_offsetmask", CTLTYPE_INT }, \ { "dfbit", CTLTYPE_INT }, \ { "ecn", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "esp_randpad", CTLTYPE_INT }, \ } #define IPSEC6CTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { "def_policy", CTLTYPE_INT }, \ { "esp_trans_deflev", CTLTYPE_INT }, \ { "esp_net_deflev", CTLTYPE_INT }, \ { "ah_trans_deflev", CTLTYPE_INT }, \ { "ah_net_deflev", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "ecn", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "esp_randpad", CTLTYPE_INT }, \ } #ifdef _KERNEL struct ipsec_output_state { struct mbuf *m; struct route *ro; struct sockaddr *dst; }; struct ipsec_history { int ih_proto; u_int32_t ih_spi; }; extern int ipsec_debug; extern struct ipsecstat ipsecstat; extern struct secpolicy ip4_def_policy; extern int ip4_esp_trans_deflev; extern int ip4_esp_net_deflev; extern int ip4_ah_trans_deflev; extern int ip4_ah_net_deflev; extern int ip4_ah_cleartos; extern int ip4_ah_offsetmask; extern int ip4_ipsec_dfbit; extern int ip4_ipsec_ecn; extern int ip4_esp_randpad; #define ipseclog(x) do { if (ipsec_debug) log x; } while (0) extern struct secpolicy *ipsec4_getpolicybysock __P((struct mbuf *, u_int, struct socket *, int *)); extern struct secpolicy *ipsec4_getpolicybyaddr __P((struct mbuf *, u_int, int, int *)); struct inpcb; extern int ipsec_init_policy __P((struct socket *so, struct inpcbpolicy **)); extern int ipsec_copy_policy __P((struct inpcbpolicy *, struct inpcbpolicy *)); extern u_int ipsec_get_reqlevel __P((struct ipsecrequest *)); extern int ipsec4_set_policy __P((struct inpcb *inp, int optname, caddr_t request, size_t len, int priv)); extern int ipsec4_get_policy __P((struct inpcb *inpcb, caddr_t request, size_t len, struct mbuf **mp)); extern int ipsec4_delete_pcbpolicy __P((struct inpcb *)); extern int ipsec4_in_reject_so __P((struct mbuf *, struct socket *)); extern int ipsec4_in_reject __P((struct mbuf *, struct inpcb *)); struct secas; struct tcpcb; extern int ipsec_chkreplay __P((u_int32_t, struct secasvar *)); extern int ipsec_updatereplay __P((u_int32_t, struct secasvar *)); extern size_t ipsec4_hdrsiz __P((struct mbuf *, u_int, struct inpcb *)); extern size_t ipsec_hdrsiz_tcp __P((struct tcpcb *)); struct ip; extern const char *ipsec4_logpacketstr __P((struct ip *, u_int32_t)); extern const char *ipsec_logsastr __P((struct secasvar *)); extern void ipsec_dumpmbuf __P((struct mbuf *)); extern int ipsec4_output __P((struct ipsec_output_state *, struct secpolicy *, int)); extern int ipsec4_tunnel_validate __P((struct mbuf *, int, u_int, struct secasvar *)); extern struct mbuf *ipsec_copypkt 
__P((struct mbuf *)); extern void ipsec_delaux __P((struct mbuf *)); -extern int ipsec_setsocket __P((struct mbuf *, struct socket *)); -extern struct socket *ipsec_getsocket __P((struct mbuf *)); extern int ipsec_addhist __P((struct mbuf *, int, u_int32_t)); extern struct ipsec_history *ipsec_gethist __P((struct mbuf *, int *)); -extern void ipsec_clearhist __P((struct mbuf *)); #endif /* _KERNEL */ #ifndef _KERNEL extern caddr_t ipsec_set_policy __P((char *, int)); extern int ipsec_get_policylen __P((caddr_t)); extern char *ipsec_dump_policy __P((caddr_t, char *)); extern const char *ipsec_strerror __P((void)); #endif /* !_KERNEL */ #endif /* _NETINET6_IPSEC_H_ */ Index: head/sys/netinet6/mld6.c =================================================================== --- head/sys/netinet6/mld6.c (revision 105193) +++ head/sys/netinet6/mld6.c (revision 105194) @@ -1,474 +1,474 @@ /* $FreeBSD$ */ /* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Protocol constants */ /* denotes that the MLD max response delay field specifies time in milliseconds */ #define MLD6_TIMER_SCALE 1000 /* * time between repetitions of a node's initial report of interest in a * multicast address(in seconds) */ #define MLD6_UNSOLICITED_REPORT_INTERVAL 10 static struct ip6_pktopts ip6_opts; static int mld6_timers_are_running; /* XXX: These are necessary for KAME's link-local hack */ static struct in6_addr mld6_all_nodes_linklocal = IN6ADDR_LINKLOCAL_ALLNODES_INIT; static struct in6_addr mld6_all_routers_linklocal = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; static void mld6_sendpkt __P((struct in6_multi *, int, const struct in6_addr *)); void mld6_init() { static u_int8_t hbh_buf[8]; struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); mld6_timers_are_running = 0; /* ip6h_nxt will be fill in later */ hbh->ip6h_len = 0; /* (8 >> 3) - 1 */ /* XXX: grotty hard coding... */ hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */ hbh_buf[3] = 0; hbh_buf[4] = IP6OPT_RTALERT; hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t)); init_ip6pktopts(&ip6_opts); ip6_opts.ip6po_hbh = hbh; } void mld6_start_listening(in6m) struct in6_multi *in6m; { int s = splnet(); /* * RFC2710 page 10: * The node never sends a Report or Done for the link-scope all-nodes * address. * MLD messages are never sent for multicast addresses whose scope is 0 * (reserved) or 1 (node-local). 
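* For example, a join of ff02::1 (the link-scope all-nodes group) or of any ff01::/16 (node-local) group is recorded in the membership list below but never generates a Report or Done on the wire.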
*/ mld6_all_nodes_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX */ if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) { in6m->in6m_timer = 0; in6m->in6m_state = MLD6_OTHERLISTENER; } else { mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = MLD6_RANDOM_DELAY( MLD6_UNSOLICITED_REPORT_INTERVAL * PR_FASTHZ); in6m->in6m_state = MLD6_IREPORTEDLAST; mld6_timers_are_running = 1; } splx(s); } void mld6_stop_listening(in6m) struct in6_multi *in6m; { mld6_all_nodes_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX */ mld6_all_routers_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX: necessary when mrouting */ if (in6m->in6m_state == MLD6_IREPORTEDLAST && (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal)) && IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) > IPV6_ADDR_SCOPE_NODELOCAL) mld6_sendpkt(in6m, MLD_LISTENER_DONE, &mld6_all_routers_linklocal); } void mld6_input(m, off) struct mbuf *m; int off; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct mld_hdr *mldh; struct ifnet *ifp = m->m_pkthdr.rcvif; struct in6_multi *in6m; struct in6_ifaddr *ia; struct ifmultiaddr *ifma; int timer; /* timer value in the MLD query header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*mldh),); mldh = (struct mld_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh)); if (mldh == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif /* source address validation */ ip6 = mtod(m, struct ip6_hdr *);/* in case mpullup */ if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) { log(LOG_ERR, "mld6_input: src %s is not link-local (grp=%s)\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&mldh->mld_addr)); /* * spec (RFC2710) does not explicitly * specify to discard the packet from a non link-local * source address. But we believe it's expected to do so. * XXX: do we have to allow :: as source? */ m_freem(m); return; } /* * In the MLD6 specification, there are 3 states and a flag. * * In Non-Listener state, we simply don't have a membership record. * In Delaying Listener state, our timer is running (in6m->in6m_timer) * In Idle Listener state, our timer is not running (in6m->in6m_timer==0) * * The flag is in6m->in6m_state, it is set to MLD6_OTHERLISTENER if * we have heard a report from another member, or MLD6_IREPORTEDLAST * if we sent the last report. */ switch(mldh->mld_type) { case MLD_LISTENER_QUERY: if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) && !IN6_IS_ADDR_MULTICAST(&mldh->mld_addr)) break; /* print error or log stat? */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to the "all-nodes" group (ff02::1). * - Restart any timer that is already running but has * A value longer than the requested timeout. * - Use the value specified in the query message as * the maximum timeout. */ IFP_TO_IA6(ifp, ia); if (ia == NULL) break; /* * XXX: System timer resolution is too low to handle Max * Response Delay, so set 1 to the internal timer even if * the calculated value equals to zero when Max Response * Delay is positive. 
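* A worked example, assuming the usual PR_FASTHZ of 5 fast-timeout ticks per second: a query advertising a 100ms maximum delay gives 100 * 5 / 1000 == 0 in integer arithmetic, which would otherwise mean "respond immediately", so the timer is clamped to 1 tick below.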
*/ timer = ntohs(mldh->mld_maxdelay)*PR_FASTHZ/MLD6_TIMER_SCALE; if (timer == 0 && mldh->mld_maxdelay) timer = 1; mld6_all_nodes_linklocal.s6_addr16[1] = htons(ifp->if_index); /* XXX */ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; in6m = (struct in6_multi *)ifma->ifma_protospec; if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) || IN6_ARE_ADDR_EQUAL(&mldh->mld_addr, &in6m->in6m_addr)) { if (timer == 0) { /* send a report immediately */ mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = 0; /* reset timer */ in6m->in6m_state = MLD6_IREPORTEDLAST; } else if (in6m->in6m_timer == 0 || /*idle state*/ in6m->in6m_timer > timer) { in6m->in6m_timer = MLD6_RANDOM_DELAY(timer); mld6_timers_are_running = 1; } } } if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = 0; /* XXX */ break; case MLD_LISTENER_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * can potentially get looped back if we are a multicast * router, so discard reports sourced by me. * Note that it is impossible to check IFF_LOOPBACK flag of * ifp for this purpose, since ip6_mloopback pass the physical * interface to looutput. */ if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */ break; if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr)) break; if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ /* * If we belong to the group being reported, stop * our timer for that group. */ IN6_LOOKUP_MULTI(mldh->mld_addr, ifp, in6m); if (in6m) { in6m->in6m_timer = 0; /* transit to idle state */ in6m->in6m_state = MLD6_OTHERLISTENER; /* clear flag */ } if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = 0; /* XXX */ break; default: /* this is impossible */ log(LOG_ERR, "mld6_input: illegal type(%d)", mldh->mld_type); break; } m_freem(m); } void mld6_fasttimeo() { struct in6_multi *in6m; struct in6_multistep step; int s; /* * Quick check to see if any work needs to be done, in order * to minimize the overhead of fasttimo processing. */ if (!mld6_timers_are_running) return; s = splnet(); mld6_timers_are_running = 0; IN6_FIRST_MULTI(step, in6m); while (in6m != NULL) { if (in6m->in6m_timer == 0) { /* do nothing */ } else if (--in6m->in6m_timer == 0) { mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_state = MLD6_IREPORTEDLAST; } else { mld6_timers_are_running = 1; } IN6_NEXT_MULTI(step, in6m); } splx(s); } static void mld6_sendpkt(in6m, type, dst) struct in6_multi *in6m; int type; const struct in6_addr *dst; { struct mbuf *mh, *md; struct mld_hdr *mldh; struct ip6_hdr *ip6; struct ip6_moptions im6o; struct in6_ifaddr *ia; struct ifnet *ifp = in6m->in6m_ifp; struct ifnet *outif = NULL; /* * At first, find a link local address on the outgoing interface * to use as the source address of the MLD packet. */ if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST)) == NULL) return; /* * Allocate mbufs to store ip6 header and MLD header. * We allocate 2 mbufs and make chain in advance because * it is more convenient when inserting the hop-by-hop option later. 
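* The resulting chain is mh (IPv6 header) -> md (MLD header); ip6_output() can then splice the hop-by-hop extension header in between the two mbufs without moving the MLD payload.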
*/ MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == NULL) return; MGET(md, M_DONTWAIT, MT_DATA); if (md == NULL) { m_free(mh); return; } mh->m_next = md; mh->m_pkthdr.rcvif = NULL; mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); MH_ALIGN(mh, sizeof(struct ip6_hdr)); /* fill in the ip6 header */ ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; /* ip6_hlim will be set by im6o.im6o_multicast_hlim */ ip6->ip6_src = ia->ia_addr.sin6_addr; ip6->ip6_dst = dst ? *dst : in6m->in6m_addr; /* fill in the MLD header */ md->m_len = sizeof(struct mld_hdr); mldh = mtod(md, struct mld_hdr *); mldh->mld_type = type; mldh->mld_code = 0; mldh->mld_cksum = 0; /* XXX: we assume the function will not be called for query messages */ mldh->mld_maxdelay = 0; mldh->mld_reserved = 0; mldh->mld_addr = in6m->in6m_addr; if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = 0; /* XXX */ mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); /* construct multicast option */ bzero(&im6o, sizeof(im6o)); im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ im6o.im6o_multicast_loop = (ip6_mrouter != NULL); /* increment output statistics */ icmp6stat.icp6s_outhist[type]++; - ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif); + ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); switch (type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(outif, ifs6_out_mldquery); break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(outif, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(outif, ifs6_out_mlddone); break; } } } Index: head/sys/netinet6/nd6_nbr.c =================================================================== --- head/sys/netinet6/nd6_nbr.c (revision 105193) +++ head/sys/netinet6/nd6_nbr.c (revision 105194) @@ -1,1404 +1,1396 @@ /* $FreeBSD$ */ /* $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif #include #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; static struct dadq *nd6_dad_find __P((struct ifaddr *)); static void nd6_dad_starttimer __P((struct dadq *, int)); static void nd6_dad_stoptimer __P((struct dadq *)); static void nd6_dad_timer __P((struct ifaddr *)); static void nd6_dad_ns_output __P((struct dadq *, struct ifaddr *)); static void nd6_dad_ns_input __P((struct ifaddr *)); static void nd6_dad_na_input __P((struct ifaddr *)); static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect */ static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ /* * Input a Neighbor Solicitation Message. * * Based on RFC 2461 * Based on RFC 2462 (duplicated address detection) */ void nd6_ns_input(m, off, icmp6len) struct mbuf *m; int off, icmp6len; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; struct ifaddr *ifa; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; union nd_opts ndopts; struct sockaddr_dl *proxydl = NULL; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); goto bad; } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be the solicited-node multicast address.
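* A solicited-node multicast address has the form ff02:0:0:0:0:1:ffXX:XXXX, where the trailing 24 bits are copied from the target address; e.g. a DAD probe for fe80::0203:47ff:fe12:3456 must arrive on ff02::1:ff12:3456, which is exactly the shape the test below enforces.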
*/ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL /* don't check ifindex portion */ && daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); goto bad; } if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) taddr6.s6_addr16[1] = htons(ifp->if_index); icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ns_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is unicast/anycast MUST NOT add * NS IP dst is solicited-node multicast MUST add * * In this implementation, we add the target link-layer address by * default. We do not add one in the MUST NOT cases. */ #if 0 /* too much! */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &daddr6); if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)) tlladdr = 0; else #endif if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ if (!ifa) { struct rtentry *rt; struct sockaddr_in6 tsin6; bzero(&tsin6, sizeof tsin6); tsin6.sin6_len = sizeof(struct sockaddr_in6); tsin6.sin6_family = AF_INET6; tsin6.sin6_addr = taddr6; rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0); if (rt && (rt->rt_flags & RTF_ANNOUNCE) != 0 && rt->rt_gateway->sa_family == AF_LINK) { /* * proxy NDP for single entry */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ifa) { proxy = 1; proxydl = SDL(rt->rt_gateway); } } if (rt) rtfree(rt); } if (!ifa) { /* * We've got an NS packet, and we don't have that address * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(&saddr6))); goto freeit; } /* * We have a neighbor solicitation packet whose target address equals * one of my tentative addresses. * * src addr how to process? * --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462.
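*/ #if 0 /* condensed sketch of the table above; illustrative only, never compiled */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa); /* unspecified source: a peer's DAD probe */ else ; /* unicast source: ordinary address resolution, silently ignored */ goto freeit; #endif /* * The live code below implements exactly this dispatch for the tentative case: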
*/ if (tentative) { /* * If the source address is the unspecified address, it is for * duplicated address detection. * * If not, the packet is for address resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa); goto freeit; } /* * If the source address is the unspecified address, entries must not * be created or updated. * It looks like the sender is performing DAD. Output an NA toward * the all-nodes multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { saddr6 = in6addr_linklocal_allnodes; saddr6.s6_addr16[1] = htons(ifp->if_index); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), tlladdr, (struct sockaddr *)proxydl); goto freeit; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, tlladdr, (struct sockaddr *)proxydl); freeit: m_freem(m); return; bad: nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(&saddr6))); nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(&daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(&taddr6))); icmp6stat.icp6s_badns++; m_freem(m); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicated address detection) */ void nd6_ns_output(ifp, daddr6, taddr6, ln, dad) struct ifnet *ifp; const struct in6_addr *daddr6, *taddr6; struct llinfo_nd6 *ln; /* for source address determination */ int dad; /* duplicated address detection */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_ifaddr *ia = NULL; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac; struct ifnet *outif = NULL; if (IN6_IS_ADDR_MULTICAST(taddr6)) return; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_ns_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent?
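* Advancing m_data by max_linkhdr reserves leading space so the link-layer header can later be prepended in place; MH_ALIGN() would serve much the same purpose for this fixed-size packet by placing the data at the end of the mbuf.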
*/ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; } if (!dad) { #if 0 /* KAME way, exact address scope match */ /* * Select a source whose scope is the same as that of the dest. * Typically, the dest is link-local solicitation multicast * (i.e. neighbor discovery) or link-local/global unicast * (i.e. neighbor un-reachability detection). */ ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; #else /* spec-wise correct */ /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. * Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (saddr6), if: * - saddr6 is given from the caller (by giving "ln"), and * - saddr6 belongs to the outgoing interface. * Otherwise, we perform a scope-wise match. */ struct ip6_hdr *hip6; /* hold ip6 */ struct in6_addr *saddr6; if (ln && ln->ln_hold) { hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*hip6) < ln->ln_hold->m_len) saddr6 = &hip6->ip6_src; else saddr6 = NULL; } else saddr6 = NULL; if (saddr6 && in6ifa_ifpwithaddr(ifp, saddr6)) bcopy(saddr6, &ip6->ip6_src, sizeof(*saddr6)); else { ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; } #endif } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) */ bzero(&ip6->ip6_src, sizeof(ip6->ip6_src)); } nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; if (IN6_IS_SCOPE_LINKLOCAL(&nd_ns->nd_ns_target)) nd_ns->nd_ns_target.s6_addr16[1] = 0; /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (!dad && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif - ip6_output(m, NULL, NULL, dad ? IPV6_DADOUTPUT : 0, &im6o, &outif); + ip6_output(m, NULL, NULL, dad ? 
IPV6_DADOUTPUT : 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighborsolicit); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; } /* * Neighbor advertisement input handling. * * Based on RFC 2461 * Based on RFC 2462 (duplicated address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(m, off, icmp6len) struct mbuf *m; int off, icmp6len; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; #if 0 struct in6_addr saddr6 = ip6->ip6_src; #endif struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; int is_router; int is_solicited; int is_override; char *lladdr = NULL; int lladdrlen = 0; struct ifaddr *ifa; struct llinfo_nd6 *ln; struct rtentry *rt; struct sockaddr_dl *sdl; union nd_opts ndopts; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif taddr6 = nd_na->nd_na_target; flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) taddr6.s6_addr16[1] = htons(ifp->if_index); if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, "nd6_na_input: invalid target address %s\n", ip6_sprintf(&taddr6))); goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, "nd6_na_input: a solicited adv is multicasted\n")); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_na_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { nd6_dad_na_input(ifa); goto freeit; } /* Just for safety, maybe unnecessary. */ if (ifa) { log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", ip6_sprintf(&taddr6)); goto freeit; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be discarded. 
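* (RFC 2461 section 7.2.5: an advertisement that matches no existing Neighbor Cache entry must not create one; cache entries are created only by the node that initiates address resolution.)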
*/ rt = nd6_lookup(&taddr6, 0, ifp); if ((rt == NULL) || ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) || ((sdl = SDL(rt->rt_gateway)) == NULL)) goto freeit; if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link layer has an address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && !lladdr) goto freeit; /* * Record link-layer address, and update the state. */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) ln->ln_expire = time_second + nd_ifinfo[rt->rt_ifp->if_index].reachable; } else { ln->ln_state = ND6_LLINFO_STALE; ln->ln_expire = time_second + nd6_gctimer; } if ((ln->ln_router = is_router) != 0) { /* * This means a router's state has changed from * non-reachable to probably reachable, and might * affect the status of associated prefixes. */ pfxlist_onlink_check(); } } else { int llchange; /* * Check if the link-layer address has changed or not. */ if (!lladdr) llchange = 0; else { if (sdl->sdl_alen) { if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 1; }
/*
 * This is VERY complex.  Look at it with care.
 *
 *	override solicit lladdr	llchange	action
 *					(L: record lladdr)
 *
 *	0	0	n	--	(2c)
 *	0	0	y	n	(2b) L
 *	0	0	y	y	(1)    REACHABLE->STALE
 *	0	1	n	--	(2c)   *->REACHABLE
 *	0	1	y	n	(2b) L *->REACHABLE
 *	0	1	y	y	(1)    REACHABLE->STALE
 *	1	0	n	--	(2a)
 *	1	0	y	n	(2a) L
 *	1	0	y	y	(2a) L *->STALE
 *	1	1	n	--	(2a)   *->REACHABLE
 *	1	1	y	n	(2a) L *->REACHABLE
 *	1	1	y	y	(2a) L *->REACHABLE
 */
if (!is_override && (lladdr && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; ln->ln_expire = time_second + nd6_gctimer; } goto freeit; } else if (is_override /* (2a) */ || (!is_override && (lladdr && !llchange)) /* (2b) */ || !lladdr) { /* (2c) */ /* * Update the link-layer address, if any. */ if (lladdr) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) { ln->ln_expire = time_second + nd_ifinfo[ifp->if_index].reachable; } } else { if (lladdr && llchange) { ln->ln_state = ND6_LLINFO_STALE; ln->ln_expire = time_second + nd6_gctimer; } } } if (ln->ln_router && !is_router) { /* * The peer dropped the router flag. * Remove the sender from the Default Router List and * update the Destination Cache entries. */ struct nd_defrouter *dr; struct in6_addr *in6; int s; in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; /* * Lock to protect the default router list. * XXX: this might be unnecessary, since this function * is only called under the network software interrupt * context. However, we keep it just for safety. */ s = splnet(); dr = defrouter_lookup(in6, rt->rt_ifp); if (dr) defrtrlist_del(dr); else if (!ip6_forwarding && ip6_accept_rtadv) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used * as a next hop for some destinations * (e.g. the redirect case). So we must * call rt6_flush explicitly.
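* Otherwise cloned destination-cache routes could keep naming a node that is no longer a router as their next hop. */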
*/ rt6_flush(&ip6->ip6_src, rt->rt_ifp); } splx(s); } ln->ln_router = is_router; } rt->rt_flags &= ~RTF_REJECT; ln->ln_asked = 0; if (ln->ln_hold) { /* * we assume ifp is not a loopback here, so just set the 2nd * argument as the 1st one. */ nd6_output(ifp, ifp, ln->ln_hold, (struct sockaddr_in6 *)rt_key(rt), rt); ln->ln_hold = 0; } freeit: m_freem(m); return; bad: icmp6stat.icp6s_badna++; m_freem(m); } /* * Neighbor advertisement output handling. * * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_output(ifp, daddr6, taddr6, flags, tlladdr, sdl0) struct ifnet *ifp; const struct in6_addr *daddr6, *taddr6; u_long flags; int tlladdr; /* 1 if include target link-layer address */ struct sockaddr *sdl0; /* sockaddr_dl (= proxy NA) or NULL */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct in6_ifaddr *ia = NULL; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac = NULL; struct ifnet *outif = NULL; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_na_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(daddr6)) { /* reply to DAD */ ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = 0; ip6->ip6_dst.s6_addr32[3] = IPV6_ADDR_INT32_ONE; flags &= ~ND_NA_FLAG_SOLICITED; } else ip6->ip6_dst = *daddr6; /* * Select a source whose scope is the same as that of the dest. */ ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; if (IN6_IS_SCOPE_LINKLOCAL(&nd_na->nd_na_target)) nd_na->nd_na_target.s6_addr16[1] = 0; /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. 
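* (For the proxy case the bits therefore come from the RTF_ANNOUNCE route's AF_LINK gateway that nd6_ns_input() captured in proxydl, so the advertised link-layer address can differ from this interface's own.)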
*/ if (sdl0 == NULL) mac = nd6_ifptomac(ifp); else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_short)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif - ip6_output(m, NULL, NULL, 0, &im6o, &outif); + ip6_output(m, NULL, NULL, 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighboradvert); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; } caddr_t nd6_ifptomac(ifp) struct ifnet *ifp; { switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif return ((caddr_t)(ifp + 1)); break; default: return NULL; } } TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; }; static struct dadq_head dadq; static int dad_init = 0; static struct dadq * nd6_dad_find(ifa) struct ifaddr *ifa; { struct dadq *dp; for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { if (dp->dad_ifa == ifa) return dp; } return NULL; } static void nd6_dad_starttimer(dp, ticks) struct dadq *dp; int ticks; { callout_reset(&dp->dad_timer_ch, ticks, (void (*) __P((void *)))nd6_dad_timer, (void *)dp->dad_ifa); } static void nd6_dad_stoptimer(dp) struct dadq *dp; { callout_stop(&dp->dad_timer_ch); } /* * Start Duplicated Address Detection (DAD) for specified interface address. */ void nd6_dad_start(ifa, tick) struct ifaddr *ifa; int *tick; /* minimum delay ticks for IFF_UP event */ { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; if (!dad_init) { TAILQ_INIT(&dadq); dad_init++; } /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled (ip6_dad_count == 0) * - the interface address is anycast */ if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (!ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (!ifa->ifa_ifp) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); if (!(ifa->ifa_ifp->if_flags & IFF_UP)) return; if (nd6_dad_find(ifa) != NULL) { /* DAD already in progress */ return; } dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? 
if_name(ifa->ifa_ifp) : "???"); return; } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ dp->dad_ifa = ifa; IFAREF(ifa); /* just for safety */ dp->dad_count = ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (tick == NULL) { nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { int ntick; if (*tick == 0) ntick = random() % (MAX_RTR_SOLICITATION_DELAY * hz); else ntick = *tick + random() % (hz / 2); *tick = ntick; nd6_dad_starttimer(dp, ntick); } } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(ifa) struct ifaddr *ifa; { struct dadq *dp; if (!dad_init) return; dp = nd6_dad_find(ifa); if (!dp) { /* DAD wasn't started yet */ return; } nd6_dad_stoptimer(dp); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_timer(ifa) struct ifaddr *ifa; { int s; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; s = splnet(); /* XXX */ /* Sanity check */ if (ia == NULL) { log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } /* timeouted with IFF_{RUNNING,UP} check */ if (dp->dad_ns_tcount > dad_maxtry) { nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); goto done; } /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { /* * We have transmitted sufficient number of DAD packets. * See what we've got. */ int duplicate; duplicate = 0; if (dp->dad_na_icount) { /* * the check is in nd6_dad_na_input(), * but just in case */ duplicate++; } if (dp->dad_ns_icount) { #if 0 /* heuristics */ /* * if * - we have sent many(?) DAD NS, and * - the number of NS we sent equals to the * number of NS we've got, and * - we've got no NA * we may have a faulty network card/driver which * loops back multicasts to myself. */ if (3 < dp->dad_count && dp->dad_ns_icount == dp->dad_count && dp->dad_na_icount == 0) { log(LOG_INFO, "DAD questionable for %s(%s): " "network card loops back multicast?\n", ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp)); /* XXX consider it a duplicate or not? */ /* duplicate++; */ } else { /* We've seen NS, means DAD has failed. */ duplicate++; } #else /* We've seen NS, means DAD has failed. 
*/ duplicate++; #endif } if (duplicate) { /* (*dp) will be freed in nd6_dad_duplicated() */ dp = NULL; nd6_dad_duplicated(ifa); } else { /* * We are done with DAD. No NA came, no NS came. * No duplicated address was found. */ ia->ia6_flags &= ~IN6_IFF_TENTATIVE; nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } done: splx(s); } void nd6_dad_duplicated(ifa) struct ifaddr *ifa; { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_duplicated: DAD structure not found\n"); return; } log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " "NS in/out=%d/%d, NA in=%d\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; /* We are done with DAD; a duplicated address was found. (failure) */ nd6_dad_stoptimer(dp); log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", if_name(ifa->ifa_ifp)); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_ns_output(dp, ifa) struct dadq *dp; struct ifaddr *ifa; { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { #if 0 printf("%s: interface down?\n", if_name(ifp)); #endif return; } if ((ifp->if_flags & IFF_RUNNING) == 0) { #if 0 printf("%s: interface not running?\n", if_name(ifp)); #endif return; } dp->dad_ns_ocount++; nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1); } static void nd6_dad_ns_input(ifa) struct ifaddr *ifa; { struct in6_ifaddr *ia; struct ifnet *ifp; const struct in6_addr *taddr6; struct dadq *dp; int duplicate; if (!ifa) panic("ifa == NULL in nd6_dad_ns_input"); ia = (struct in6_ifaddr *)ifa; ifp = ifa->ifa_ifp; taddr6 = &ia->ia_addr.sin6_addr; duplicate = 0; dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ if (dad_ignore_ns) { nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " "address %s(%s)\n", ip6_sprintf(taddr6), if_name(ifa->ifa_ifp))); return; } /* * if I'm yet to start DAD, someone else started using this address * first. I have a duplicate and you win. */ if (!dp || dp->dad_ns_ocount == 0) duplicate++; /* XXX more checks for loopback situation - see nd6_dad_timer too */ if (duplicate) { dp = NULL; /* will be freed in nd6_dad_duplicated() */ nd6_dad_duplicated(ifa); } else { /* * Not sure if I got a duplicate; * increment the NS count and see what happens. */ if (dp) dp->dad_ns_icount++; } } static void nd6_dad_na_input(ifa) struct ifaddr *ifa; { struct dadq *dp; if (!ifa) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa); if (dp) dp->dad_na_icount++; /* remove the address. */ nd6_dad_duplicated(ifa); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 105193) +++ head/sys/netinet6/raw_ip6.c (revision 105194) @@ -1,726 +1,719 @@ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ENABLE_DEFAULT_SCOPE #include #endif #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_long rip_sendspace; extern u_long rip_recvspace; struct rip6stat rip6stat; /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. */ int rip6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 rip6src; rip6stat.rip6s_ipackets++; if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX send icmp6 host/port unreach? */ m_freem(m); return IPPROTO_DONE; } init_sin6(&rip6src, m); /* general init */ LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->in6p_vflag & INP_IPV6) == 0) continue; if (in6p->in6p_ip6_nxt && in6p->in6p_ip6_nxt != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (in6p->in6p_cksum != -1) { rip6stat.rip6s_isum++; if (in6_cksum(m, ip6->ip6_nxt, *offp, m->m_pkthdr.len - *offp)) { rip6stat.rip6s_badsum++; continue; } } if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (n && ipsec6_in_reject_so(n, last->inp_socket)) { m_freem(n); ipsec6stat.in_polvio++; /* do not inject data into pcb */ } else #endif /*IPSEC*/ if (n) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, &opts, ip6, n); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; } } last = in6p; } #ifdef IPSEC /* * Check AH/ESP integrity. 
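* ipsec6_in_reject_so() tests the packet against the receiving socket's inbound security policy; rejected packets are counted in ipsec6stat.in_polvio and never reach the pcb. (The outbound side of this revision is the mirror image: rather than tagging the mbuf with ipsec_setsocket(), rip6_output() below now passes its inpcb as the new final argument of ip6_output(), which performs the policy lookup itself.)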
*/ if (last && ipsec6_in_reject_so(m, last->inp_socket)) { m_freem(m); ipsec6stat.in_polvio++; ip6stat.ip6s_delivered--; /* do not inject data into pcb */ } else #endif /*IPSEC*/ if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, &opts, ip6, m); /* strip intermediate headers */ m_adj(m, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); } else { rip6stat.rip6s_nosock++; if (m->m_flags & M_MCAST) rip6stat.rip6s_nosockmcast++; if (proto == IPPROTO_NONE) m_freem(m); else { char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } ip6stat.ip6s_delivered--; } return IPPROTO_DONE; } void rip6_ctlinput(cmd, sa, d) int cmd; struct sockaddr *sa; void *d; { struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct inpcb *(*notify) __P((struct inpcb *, int)) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&ripcb, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, notify); } /* * Generate IPv6 header and pass packet to ip6_output. * Tack on options user may have setup with control call. */ int #if __STDC__ rip6_output(struct mbuf *m, ...) #else rip6_output(m, va_alist) struct mbuf *m; va_dcl #endif { struct socket *so; struct sockaddr_in6 *dstsock; struct mbuf *control; struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp = 0; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int priv = 0; va_list ap; va_start(ap, m); so = va_arg(ap, struct socket *); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoin6pcb(so); priv = 0; if (so->so_cred->cr_uid == 0) priv = 1; dst = &dstsock->sin6_addr; if (control) { if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) goto bad; optp = &opt; } else optp = in6p->in6p_outputopts; /* * For an ICMPv6 packet, we should know its type and code * to update statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_TRYWAIT); ip6 = mtod(m, struct ip6_hdr *); /* * Next header might not be ICMP6 but use its pseudo header anyway. */ ip6->ip6_dst = *dst; /* * If the scope of the destination is link-local, embed the interface * index in the address. * * XXX advanced-api value overrides sin6_scope_id */ if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { struct in6_pktinfo *pi; /* * XXX Boundary check is assumed to be already done in * ip6_setpktoptions(). 
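* (so the ifnet_byindex(pi->ipi6_ifindex) dereference just below is safe without a second range check here)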
*/ if (optp && (pi = optp->ip6po_pktinfo) && pi->ipi6_ifindex) { ip6->ip6_dst.s6_addr16[1] = htons(pi->ipi6_ifindex); oifp = ifnet_byindex(pi->ipi6_ifindex); } else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && in6p->in6p_moptions && in6p->in6p_moptions->im6o_multicast_ifp) { oifp = in6p->in6p_moptions->im6o_multicast_ifp; ip6->ip6_dst.s6_addr16[1] = htons(oifp->if_index); } else if (dstsock->sin6_scope_id) { /* boundary check */ if (dstsock->sin6_scope_id < 0 || if_index < dstsock->sin6_scope_id) { error = ENXIO; /* XXX EINVAL? */ goto bad; } ip6->ip6_dst.s6_addr16[1] = htons(dstsock->sin6_scope_id & 0xffff);/*XXX*/ } } /* * Source address selection. */ { struct in6_addr *in6a; if ((in6a = in6_selectsrc(dstsock, optp, in6p->in6p_moptions, &in6p->in6p_route, &in6p->in6p_laddr, &error)) == 0) { if (error == 0) error = EADDRNOTAVAIL; goto bad; } ip6->ip6_src = *in6a; if (in6p->in6p_route.ro_rt) oifp = ifnet_byindex(in6p->in6p_route.ro_rt->rt_ifp->if_index); } ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = in6p->in6p_ip6_nxt; ip6->ip6_hlim = in6_selecthlim(in6p, oifp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* compute checksum */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } -#ifdef IPSEC - if (ipsec_setsocket(m, so) != 0) { - error = ENOBUFS; - goto bad; - } -#endif /*IPSEC*/ - error = ip6_output(m, optp, &in6p->in6p_route, 0, - in6p->in6p_moptions, &oifp); + in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); icmp6stat.icp6s_outhist[type]++; } else rip6stat.rip6s_opackets++; goto freectl; bad: if (m) m_freem(m); freectl: if (optp == &opt && optp->ip6po_rthdr && optp->ip6po_route.ro_rt) RTFREE(optp->ip6po_route.ro_rt); if (control) { if (optp == &opt) ip6_clearpktopts(optp, 0, -1); m_freem(control); } return(error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? 
*/ return(icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error, s; inp = sotoinpcb(so); if (inp) panic("rip6_attach"); if (td && (error = suser(td)) != 0) return error; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; s = splnet(); error = in_pcballoc(so, &ripcbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; MALLOC(inp->in6p_icmp6filt, struct icmp6_filter *, sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); return 0; } static int rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); if (inp == 0) panic("rip6_detach"); /* xxx: RSVP */ if (so == ip6_mrouter) ip6_mrouter_done(); if (inp->in6p_icmp6filt) { FREE(inp->in6p_icmp6filt, M_PCB); inp->in6p_icmp6filt = NULL; } in6_pcbdetach(inp); return 0; } static int rip6_abort(struct socket *so) { soisdisconnected(so); return rip6_detach(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; inp->in6p_faddr = in6addr_any; return rip6_abort(so); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ia = NULL; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet) || addr->sin6_family != AF_INET6) return EADDRNOTAVAIL; #ifdef ENABLE_DEFAULT_SCOPE if (addr->sin6_scope_id == 0) { /* not change if specified */ addr->sin6_scope_id = scope6_addr2default(&addr->sin6_addr); } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return EADDRNOTAVAIL; if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { return(EADDRNOTAVAIL); } inp->in6p_laddr = addr->sin6_addr; return 0; } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; int error = 0; #ifdef ENABLE_DEFAULT_SCOPE struct sockaddr_in6 tmp; #endif if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if (addr->sin6_family != AF_INET6) return EAFNOSUPPORT; #ifdef ENABLE_DEFAULT_SCOPE if (addr->sin6_scope_id == 0) { /* not change if specified */ /* avoid overwrites */ tmp = *addr; addr = &tmp; addr->sin6_scope_id = scope6_addr2default(&addr->sin6_addr); } #endif /* Source address selection. XXX: need pcblookup? 
*/ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp->in6p_moptions, &inp->in6p_route, &inp->in6p_laddr, &error); if (in6a == NULL) return (error ? error : EADDRNOTAVAIL); inp->in6p_laddr = *in6a; inp->in6p_faddr = addr->sin6_addr; soisconnected(so); return 0; } static int rip6_shutdown(struct socket *so) { socantsendmore(so); return 0; } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return EISCONN; } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return ENOTCONN; } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; } #ifdef ENABLE_DEFAULT_SCOPE if (dst->sin6_scope_id == 0) { /* not change if specified */ dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr); } #endif return rip6_output(m, so, dst, control); } struct pr_usrreqs rip6_usrreqs = { rip6_abort, pru_accept_notsupp, rip6_attach, rip6_bind, rip6_connect, pru_connect2_notsupp, in6_control, rip6_detach, rip6_disconnect, pru_listen_notsupp, in6_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip6_send, pru_sense_null, rip6_shutdown, in6_setsockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet6/route6.c =================================================================== --- head/sys/netinet6/route6.c (revision 105193) +++ head/sys/netinet6/route6.c (revision 105194) @@ -1,222 +1,221 @@ /* $FreeBSD$ */ /* $KAME: route6.c,v 1.24 2001/03/14 03:07:05 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include static int ip6_rthdr0 __P((struct mbuf *, struct ip6_hdr *, struct ip6_rthdr0 *)); int route6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; /* proto is unused */ { struct ip6_hdr *ip6; struct mbuf *m = *mp; struct ip6_rthdr *rh; int off = *offp, rhlen; - struct mbuf *n; + struct ip6aux *ip6a; - n = ip6_findaux(m); - if (n) { - struct ip6aux *ip6a = mtod(n, struct ip6aux *); + ip6a = ip6_findaux(m); + if (ip6a) { /* XXX reject home-address option before rthdr */ if (ip6a->ip6a_flags & IP6A_SWAP) { ip6stat.ip6s_badoptions++; m_freem(m); return IPPROTO_DONE; } } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*rh), IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); rh = (struct ip6_rthdr *)((caddr_t)ip6 + off); #else ip6 = mtod(m, struct ip6_hdr *); IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, sizeof(*rh)); if (rh == NULL) { ip6stat.ip6s_tooshort++; return IPPROTO_DONE; } #endif switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rhlen = (rh->ip6r_len + 1) << 3; #ifndef PULLDOWN_TEST /* * note on option length: * due to IP6_EXTHDR_CHECK assumption, we cannot handle * very big routing header (max rhlen == 2048). */ IP6_EXTHDR_CHECK(m, off, rhlen, IPPROTO_DONE); #else /* * note on option length: * maximum rhlen: 2048 * max mbuf m_pulldown can handle: MCLBYTES == usually 2048 * so, here we are assuming that m_pulldown can handle * rhlen == 2048 case. this may not be a good thing to * assume - we may want to avoid pulling it up altogether. */ IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, rhlen); if (rh == NULL) { ip6stat.ip6s_tooshort++; return IPPROTO_DONE; } #endif if (ip6_rthdr0(m, ip6, (struct ip6_rthdr0 *)rh)) return(IPPROTO_DONE); break; default: /* unknown routing type */ if (rh->ip6r_segleft == 0) { rhlen = (rh->ip6r_len + 1) << 3; break; /* Final dst. Just ignore the header. */ } ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh->ip6r_type - (caddr_t)ip6); return(IPPROTO_DONE); } *offp += rhlen; return(rh->ip6r_nxt); } /* * Type0 routing header processing * * RFC2292 backward compatibility warning: no support for strict/loose bitmap, * as it was dropped between RFC1883 and RFC2460. */ static int ip6_rthdr0(m, ip6, rh0) struct mbuf *m; struct ip6_hdr *ip6; struct ip6_rthdr0 *rh0; { int addrs, index; struct in6_addr *nextaddr, tmpaddr; if (rh0->ip6r0_segleft == 0) return(0); if (rh0->ip6r0_len % 2 #ifdef COMPAT_RFC1883 || rh0->ip6r0_len > 46 #endif ) { /* * Type 0 routing header can't contain more than 23 addresses. * RFC 2462: this limitation was removed since strict/loose * bitmap field was deleted. */ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_len - (caddr_t)ip6); return(-1); } if ((addrs = rh0->ip6r0_len / 2) < rh0->ip6r0_segleft) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_segleft - (caddr_t)ip6); return(-1); } index = addrs - rh0->ip6r0_segleft; rh0->ip6r0_segleft--; /* note that ip6r0_addr does not exist in RFC2292bis */ nextaddr = rh0->ip6r0_addr + index; /* * reject invalid addresses. be proactive about malicious use of * IPv4 mapped/compat address. * XXX need more checks? 
*/ if (IN6_IS_ADDR_MULTICAST(nextaddr) || IN6_IS_ADDR_UNSPECIFIED(nextaddr) || IN6_IS_ADDR_V4MAPPED(nextaddr) || IN6_IS_ADDR_V4COMPAT(nextaddr)) { ip6stat.ip6s_badoptions++; m_freem(m); return(-1); } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badoptions++; m_freem(m); return(-1); } /* * Swap the IPv6 destination address and nextaddr. Forward the packet. */ tmpaddr = *nextaddr; *nextaddr = ip6->ip6_dst; if (IN6_IS_ADDR_LINKLOCAL(nextaddr)) nextaddr->s6_addr16[1] = 0; ip6->ip6_dst = tmpaddr; if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); #ifdef COMPAT_RFC1883 if (rh0->ip6r0_slmap[index / 8] & (1 << (7 - (index % 8)))) ip6_forward(m, IPV6_SRCRT_NEIGHBOR); else ip6_forward(m, IPV6_SRCRT_NOTNEIGHBOR); #else ip6_forward(m, 1); #endif return(-1); /* m would be freed in ip6_forward() */ } Index: head/sys/netinet6/udp6_output.c =================================================================== --- head/sys/netinet6/udp6_output.c (revision 105193) +++ head/sys/netinet6/udp6_output.c (revision 105194) @@ -1,318 +1,312 @@ /* $FreeBSD$ */ /* $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 */ #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif /* IPSEC */ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #define in6pcb inpcb #define udp6stat udpstat #define udp6s_opackets udps_opackets int udp6_output(in6p, m, addr6, control, td) struct in6pcb *in6p; struct mbuf *m; struct mbuf *control; struct sockaddr *addr6; struct thread *td; { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr; u_short fport; int error = 0; struct ip6_pktopts opt, *stickyopt = in6p->in6p_outputopts; int priv; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; priv = 0; if (td && !suser(td)) priv = 1; if (control) { if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) goto release; in6p->in6p_outputopts = &opt; } if (addr6) { /* * IPv4 version of udp_output calls in_pcbconnect in this case, * which needs splnet and affects performance. * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr6; if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; faddr = &sin6->sin6_addr; fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((in6p->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. 
* (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } else af = AF_INET; } /* KAME hack: embed scopeid */ if (in6_embedscope(&sin6->sin6_addr, sin6, in6p, NULL) != 0) { error = EINVAL; goto release; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { laddr = in6_selectsrc(sin6, in6p->in6p_outputopts, in6p->in6p_moptions, &in6p->in6p_route, &in6p->in6p_laddr, &error); } else laddr = &in6p->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (in6p->in6p_lport == 0 && (error = in6_pcbsetport(laddr, in6p, td)) != 0) goto release; } else { if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) { if ((in6p->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &in6p->in6p_laddr; faddr = &in6p->in6p_faddr; fport = in6p->in6p_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = in6p->in6p_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; #if 0 /* ip6_plen will be filled in ip6_output. */ ip6->ip6_plen = htons((u_short)plen); #endif ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = in6_selecthlim(in6p, in6p->in6p_route.ro_rt ? in6p->in6p_route.ro_rt->rt_ifp : NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if ((udp6->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr), plen)) == 0) { udp6->uh_sum = 0xffff; } flags = 0; udp6stat.udp6s_opackets++; -#ifdef IPSEC - if (ipsec_setsocket(m, in6p->in6p_socket) != 0) { - error = ENOBUFS; - goto release; - } -#endif /* IPSEC */ error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route, - flags, in6p->in6p_moptions, NULL); + flags, in6p->in6p_moptions, NULL, in6p); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(in6p->in6p_outputopts, 0, -1); in6p->in6p_outputopts = stickyopt; m_freem(control); } return(error); } Index: head/sys/netipx/ipx_ip.c =================================================================== --- head/sys/netipx/ipx_ip.c (revision 105193) +++ head/sys/netipx/ipx_ip.c (revision 105194) @@ -1,454 +1,454 @@ /* * Copyright (c) 1995, Mike Mitchell * Copyright (c) 1984, 1985, 1986, 1987, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ipx_ip.c * * $FreeBSD$ */ /* * Software interface driver for encapsulating IPX in IP. */ #include "opt_inet.h" #include "opt_ipx.h" #ifdef IPXIP #ifndef INET #error The option IPXIP requires option INET. #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct ifnet ipxipif; /* list of all hosts and gateways or broadcast addrs */ static struct ifnet_en *ipxip_list; static struct ifnet_en *ipxipattach(void); static int ipxip_free(struct ifnet *ifp); static int ipxipioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ipxipoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); static void ipxip_rtchange(struct in_addr *dst); static void ipxipstart(struct ifnet *ifp); static struct ifnet_en * ipxipattach() { register struct ifnet_en *m; register struct ifnet *ifp; if (ipxipif.if_mtu == 0) { ifp = &ipxipif; ifp->if_name = "ipxip"; ifp->if_mtu = LOMTU; ifp->if_ioctl = ipxipioctl; ifp->if_output = ipxipoutput; ifp->if_start = ipxipstart; ifp->if_flags = IFF_POINTOPOINT; } MALLOC((m), struct ifnet_en *, sizeof(*m), M_PCB, M_NOWAIT | M_ZERO); if (m == NULL) return (NULL); m->ifen_next = ipxip_list; ipxip_list = m; ifp = &m->ifen_ifnet; ifp->if_name = "ipxip"; ifp->if_mtu = LOMTU; ifp->if_ioctl = ipxipioctl; ifp->if_output = ipxipoutput; ifp->if_start = ipxipstart; ifp->if_flags = IFF_POINTOPOINT; ifp->if_unit = ipxipif.if_unit++; if_attach(ifp); return (m); } /* * Process an ioctl request. */ static int ipxipioctl(ifp, cmd, data) register struct ifnet *ifp; u_long cmd; caddr_t data; { int error = 0; struct ifreq *ifr; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; /* FALLTHROUGH */ case SIOCSIFDSTADDR: /* * Everything else is done at a higher level. 
*/ break; case SIOCSIFFLAGS: ifr = (struct ifreq *)data; if ((ifr->ifr_flags & IFF_UP) == 0) error = ipxip_free(ifp); break; default: error = EINVAL; } return (error); } static struct mbuf *ipxip_badlen; static struct mbuf *ipxip_lastin; static int ipxip_hold_input; void ipxip_input(m, hlen) register struct mbuf *m; int hlen; { register struct ip *ip; register struct ipx *ipx; register struct ifqueue *ifq = &ipxintrq; int len, s; if (ipxip_hold_input) { if (ipxip_lastin != NULL) { m_freem(ipxip_lastin); } ipxip_lastin = m_copym(m, 0, (int)M_COPYALL, M_DONTWAIT); } /* * Get IP and IPX header together in first mbuf. */ ipxipif.if_ipackets++; s = sizeof(struct ip) + sizeof(struct ipx); if (((m->m_flags & M_EXT) || m->m_len < s) && (m = m_pullup(m, s)) == NULL) { ipxipif.if_ierrors++; return; } ip = mtod(m, struct ip *); if (ip->ip_hl > (sizeof(struct ip) >> 2)) { ip_stripoptions(m, (struct mbuf *)NULL); if (m->m_len < s) { if ((m = m_pullup(m, s)) == NULL) { ipxipif.if_ierrors++; return; } ip = mtod(m, struct ip *); } } /* * Make mbuf data length reflect IPX length. * If not enough data to reflect IPX length, drop. */ m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m->m_pkthdr.len -= sizeof(struct ip); ipx = mtod(m, struct ipx *); len = ntohs(ipx->ipx_len); if (len & 1) len++; /* Preserve Garbage Byte */ if (ip->ip_len != len) { if (len > ip->ip_len) { ipxipif.if_ierrors++; if (ipxip_badlen) m_freem(ipxip_badlen); ipxip_badlen = m; return; } /* Any extra will be trimmed off by the IPX routines */ } /* * Deliver to IPX */ if (IF_HANDOFF(ifq, m, NULL)) schednetisr(NETISR_IPX); return; } static int ipxipoutput(ifp, m, dst, rt) struct ifnet *ifp; struct mbuf *m; struct sockaddr *dst; struct rtentry *rt; { register struct ifnet_en *ifn = (struct ifnet_en *)ifp; register struct ip *ip; register struct route *ro = &(ifn->ifen_route); register int len = 0; register struct ipx *ipx = mtod(m, struct ipx *); int error; ifn->ifen_ifnet.if_opackets++; ipxipif.if_opackets++; /* * Calculate data length and make space * for IP header. */ len = ntohs(ipx->ipx_len); if (len & 1) len++; /* Preserve Garbage Byte */ /* following clause not necessary on vax */ if (3 & (int)m->m_data) { /* force longword alignment of ip hdr */ struct mbuf *m0 = m_gethdr(MT_HEADER, M_DONTWAIT); if (m0 == NULL) { m_freem(m); return (ENOBUFS); } MH_ALIGN(m0, sizeof(struct ip)); m0->m_flags = m->m_flags & M_COPYFLAGS; m0->m_next = m; m0->m_len = sizeof(struct ip); m0->m_pkthdr.len = m0->m_len + m->m_len; m->m_flags &= ~M_PKTHDR; m = m0; } else { M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) return (ENOBUFS); } /* * Fill in IP header. */ ip = mtod(m, struct ip *); *(long *)ip = 0; ip->ip_p = IPPROTO_IDP; ip->ip_src = ifn->ifen_src; ip->ip_dst = ifn->ifen_dst; ip->ip_len = (u_short)len + sizeof(struct ip); ip->ip_ttl = MAXTTL; /* * Output final datagram. 
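 *
 * (For illustration only: the datagram handed to ip_output() below is
 * laid out as
 *
 *	[ struct ip, ip_p == IPPROTO_IDP ][ IPX packet, padded to even length ]
 *
 * and ipxip_input() above recovers the IPX packet by stripping
 * sizeof(struct ip) back off the front.)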
*/ - error = (ip_output(m, (struct mbuf *)NULL, ro, SO_BROADCAST, NULL)); + error = (ip_output(m, (struct mbuf *)NULL, ro, SO_BROADCAST, NULL, NULL)); if (error) { ifn->ifen_ifnet.if_oerrors++; ifn->ifen_ifnet.if_ierrors = error; } return (error); m_freem(m); return (ENETUNREACH); } static void ipxipstart(ifp) struct ifnet *ifp; { panic("ipxip_start called\n"); } static struct ifreq ifr_ipxip = {"ipxip0"}; int ipxip_route(so, sopt) struct socket *so; struct sockopt *sopt; { int error; struct ifnet_en *ifn; struct sockaddr_in *src; struct ipxip_req rq; struct sockaddr_ipx *ipx_dst; struct sockaddr_in *ip_dst; struct route ro; error = sooptcopyin(sopt, &rq, sizeof rq, sizeof rq); if (error) return (error); ipx_dst = (struct sockaddr_ipx *)&rq.rq_ipx; ip_dst = (struct sockaddr_in *)&rq.rq_ip; /* * First, make sure we already have an IPX address: */ if (ipx_ifaddr == NULL) return (EADDRNOTAVAIL); /* * Now, determine if we can get to the destination */ bzero((caddr_t)&ro, sizeof(ro)); ro.ro_dst = *(struct sockaddr *)ip_dst; rtalloc(&ro); if (ro.ro_rt == NULL || ro.ro_rt->rt_ifp == NULL) { return (ENETUNREACH); } /* * And see how he's going to get back to us: * i.e., what return ip address do we use? */ { register struct in_ifaddr *ia; struct ifnet *ifp = ro.ro_rt->rt_ifp; for (ia = TAILQ_FIRST(&in_ifaddrhead); ia != NULL; ia = TAILQ_NEXT(ia, ia_link)) if (ia->ia_ifp == ifp) break; if (ia == NULL) ia = TAILQ_FIRST(&in_ifaddrhead); if (ia == NULL) { RTFREE(ro.ro_rt); return (EADDRNOTAVAIL); } src = (struct sockaddr_in *)&ia->ia_addr; } /* * Is there a free (pseudo-)interface or space? */ for (ifn = ipxip_list; ifn != NULL; ifn = ifn->ifen_next) { if ((ifn->ifen_ifnet.if_flags & IFF_UP) == 0) break; } if (ifn == NULL) ifn = ipxipattach(); if (ifn == NULL) { RTFREE(ro.ro_rt); return (ENOBUFS); } ifn->ifen_route = ro; ifn->ifen_dst = ip_dst->sin_addr; ifn->ifen_src = src->sin_addr; /* * now configure this as a point to point link */ ifr_ipxip.ifr_name[4] = '0' + ipxipif.if_unit - 1; ifr_ipxip.ifr_dstaddr = *(struct sockaddr *)ipx_dst; ipx_control(so, (int)SIOCSIFDSTADDR, (caddr_t)&ifr_ipxip, (struct ifnet *)ifn, sopt->sopt_td); /* use any of our addresses */ satoipx_addr(ifr_ipxip.ifr_addr).x_host = ipx_ifaddr->ia_addr.sipx_addr.x_host; return (ipx_control(so, (int)SIOCSIFADDR, (caddr_t)&ifr_ipxip, (struct ifnet *)ifn, sopt->sopt_td)); } static int ipxip_free(ifp) struct ifnet *ifp; { register struct ifnet_en *ifn = (struct ifnet_en *)ifp; struct route *ro = & ifn->ifen_route; if (ro->ro_rt != NULL) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } ifp->if_flags &= ~IFF_UP; return (0); } void ipxip_ctlinput(cmd, sa, dummy) int cmd; struct sockaddr *sa; void *dummy; { struct sockaddr_in *sin; if ((unsigned)cmd >= PRC_NCMDS) return; if (sa->sa_family != AF_INET && sa->sa_family != AF_IMPLINK) return; sin = (struct sockaddr_in *)sa; if (sin->sin_addr.s_addr == INADDR_ANY) return; switch (cmd) { case PRC_ROUTEDEAD: case PRC_REDIRECT_NET: case PRC_REDIRECT_HOST: case PRC_REDIRECT_TOSNET: case PRC_REDIRECT_TOSHOST: ipxip_rtchange(&sin->sin_addr); break; } } static void ipxip_rtchange(dst) register struct in_addr *dst; { register struct ifnet_en *ifn; for (ifn = ipxip_list; ifn != NULL; ifn = ifn->ifen_next) { if (ifn->ifen_dst.s_addr == dst->s_addr && ifn->ifen_route.ro_rt != NULL) { RTFREE(ifn->ifen_route.ro_rt); ifn->ifen_route.ro_rt = NULL; } } } #endif /* IPXIP */ Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 105193) +++ 
head/sys/sys/mbuf.h (revision 105194) @@ -1,499 +1,554 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ #include +#include /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size * MCLBYTES (also in machine/param.h), which has no additional overhead * and is used instead of the internal data area; this is done when * at least MINCLSIZE of data must be stored. Additionally, it is possible * to allocate a separate buffer externally and attach it to the mbuf in * a way similar to that of mbuf clusters. */ #define MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ #define MHLEN (MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ #define MINCLSIZE (MHLEN + 1) /* smallest amount to put in cluster */ #define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ #ifdef _KERNEL /*- * Macros for type conversion: * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * dtom(x) -- Convert data pointer within mbuf to mbuf pointer (XXX). */ #define mtod(m, t) ((t)((m)->m_data)) #define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1))) #endif /* _KERNEL */ /* * Header present at the beginning of every mbuf. */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ caddr_t mh_data; /* location of data */ int mh_len; /* amount of data in this mbuf */ int mh_flags; /* flags; see below */ short mh_type; /* type of data in this mbuf */ }; /* + * Packet tag structure (see below for details). 
+ */ +struct m_tag { + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ + u_int16_t m_tag_id; /* Tag ID */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_cookie; /* ABI/Module ID */ +}; + +/* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ int len; /* total packet length */ /* variables for ip and tcp reassembly */ void *header; /* pointer to packet header */ /* variables for hardware checksum */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ - struct mbuf *aux; /* extra data buffer; ipsec/others */ + SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ struct label label; /* MAC label of data in packet */ }; /* * Description of external storage mapped into mbuf; valid only if M_EXT is set. */ struct m_ext { caddr_t ext_buf; /* start of buffer */ void (*ext_free) /* free routine if not the usual */ (void *, void *); void *ext_args; /* optional argument pointer */ u_int ext_size; /* size of buffer, for ext_free */ u_int *ref_cnt; /* pointer to ref count info */ int ext_type; /* type of external storage */ }; /* * The core of the mbuf object along with some shortcut defines for * practical purposes. */ struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; #define m_next m_hdr.mh_next #define m_len m_hdr.mh_len #define m_data m_hdr.mh_data #define m_type m_hdr.mh_type #define m_flags m_hdr.mh_flags #define m_nextpkt m_hdr.mh_nextpkt #define m_act m_nextpkt #define m_pkthdr M_dat.MH.MH_pkthdr #define m_ext M_dat.MH.MH_dat.MH_ext #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf /* * mbuf flags. */ #define M_EXT 0x0001 /* has associated external storage */ #define M_PKTHDR 0x0002 /* start of record */ #define M_EOR 0x0004 /* end of record */ #define M_RDONLY 0x0008 /* associated data is marked read-only */ #define M_PROTO1 0x0010 /* protocol-specific */ #define M_PROTO2 0x0020 /* protocol-specific */ #define M_PROTO3 0x0040 /* protocol-specific */ #define M_PROTO4 0x0080 /* protocol-specific */ #define M_PROTO5 0x0100 /* protocol-specific */ /* * mbuf pkthdr flags (also stored in m_flags). */ #define M_BCAST 0x0200 /* send/received as link-level broadcast */ #define M_MCAST 0x0400 /* send/received as link-level multicast */ #define M_FRAG 0x0800 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x1000 /* packet is first fragment */ #define M_LASTFRAG 0x2000 /* packet is last fragment */ /* * External buffer types: identify ext_buf type. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */ #define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 200 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */ /* * Flags copied when copying m_pkthdr. */ #define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG|M_RDONLY) /* * Flags indicating hw checksum support and sw checksum requirements. 
*/ #define CSUM_IP 0x0001 /* will csum IP */ #define CSUM_TCP 0x0002 /* will csum TCP */ #define CSUM_UDP 0x0004 /* will csum UDP */ #define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ #define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ #define CSUM_IP_CHECKED 0x0100 /* did csum IP */ #define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ #define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ #define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ #define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) #define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? */ /* * mbuf types. */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER 2 /* packet header */ #if 0 #define MT_SOCKET 3 /* socket structure */ #define MT_PCB 4 /* protocol control block */ #define MT_RTABLE 5 /* routing tables */ #define MT_HTABLE 6 /* IMP host tables */ #define MT_ATABLE 7 /* address resolution tables */ #endif #define MT_SONAME 8 /* socket name */ #if 0 #define MT_SOOPTS 10 /* socket options */ #endif #define MT_FTABLE 11 /* fragment reassembly header */ #if 0 #define MT_RIGHTS 12 /* access rights */ #define MT_IFADDR 13 /* interface address */ #endif #define MT_TAG 13 /* volatile metadata associated to pkts */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ /* * Mbuf and cluster allocation statistics PCPU structure. */ struct mbpstat { u_long mb_mbfree; u_long mb_mbpgs; u_long mb_clfree; u_long mb_clpgs; long mb_mbtypes[MT_NTYPES]; short mb_active; }; /* * General mbuf allocator statistics structure. * XXX: Modifications of these are not protected by any mutex locks nor by * any atomic() manipulations. As a result, we may occasionally lose * a count or two. Luckily, not all of these fields are modified at all * and remain static, and those that are manipulated are only manipulated * in failure situations, which do not occur (hopefully) very often. */ struct mbstat { u_long m_drops; /* times failed to allocate */ u_long m_wait; /* times successfully returned from wait */ u_long m_drain; /* times drained protocols for space */ u_long m_mcfail; /* XXX: times m_copym failed */ u_long m_mpfail; /* XXX: times m_pullup failed */ u_long m_msize; /* length of an mbuf */ u_long m_mclbytes; /* length of an mbuf cluster */ u_long m_minclsize; /* min length of data to allocate a cluster */ u_long m_mlen; /* length of data in an mbuf */ u_long m_mhlen; /* length of data in a header mbuf */ /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array): */ short m_numtypes; }; /* * Flags specifying how an allocation should be made. * M_DONTWAIT means "don't block if nothing is available" whereas * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is * available." */ #define M_DONTWAIT 1 #define M_TRYWAIT 0 #define M_WAIT M_TRYWAIT /* XXX: Deprecated. */ #ifdef _KERNEL /*- * mbuf external reference count management macros. * * MEXT_IS_REF(m): true if (m) is not the only mbuf referencing * the external buffer ext_buf. * * MEXT_REM_REF(m): remove reference to m_ext object. * * MEXT_ADD_REF(m): add reference to m_ext object already * referred to by (m). 
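 *
 * (For illustration only: these counts are what make cheap copies safe.
 * After, say,
 *
 *	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *
 * both chains reference the same cluster, MEXT_IS_REF() is true for
 * each of them, and M_WRITABLE() below evaluates false until one of
 * the references is dropped.)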
*/ #define MEXT_IS_REF(m) (*((m)->m_ext.ref_cnt) > 1) #define MEXT_REM_REF(m) do { \ KASSERT(*((m)->m_ext.ref_cnt) > 0, ("m_ext refcnt < 0")); \ atomic_subtract_int((m)->m_ext.ref_cnt, 1); \ } while(0) #define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1) /* * mbuf, cluster, and external object allocation macros * (for compatibility purposes). */ #define M_COPY_PKTHDR(to, from) m_copy_pkthdr((to), (from)) #define m_getclr(how, type) m_get_clrd((how), (type)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, args, flags, type) \ m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type)) /* * MEXTFREE(m): disassociate (and possibly free) an external object from (m). * * If the atomic_cmpset_int() returns 0, then we effectively do nothing * in terms of "cleaning up" (freeing the ext buf and ref. counter) as * this means that either there are still references, or another thread * is taking care of the clean-up. */ #define MEXTFREE(m) do { \ struct mbuf *_mb = (m); \ \ MEXT_REM_REF(_mb); \ if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \ _mext_free(_mb); \ _mb->m_flags &= ~M_EXT; \ } while (0) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this * can be both the local data payload, or an external buffer area, * depending on whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && (!((m)->m_flags \ & M_EXT) || !MEXT_IS_REF(m))) /* * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. */ #define M_ALIGN(m, len) do { \ (m)->m_data += (MLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. */ #define MH_ALIGN(m, len) do { \ (m)->m_data += (MHLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * Compute the amount of space available * before the current start of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_LEADINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_data - (m)->m_ext.ext_buf : 0): \ (m)->m_flags & M_PKTHDR ? (m)->m_data - (m)->m_pktdat : \ (m)->m_data - (m)->m_dat) /* * Compute the amount of space available * after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_TRAILINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size \ - ((m)->m_data + (m)->m_len) : 0) : \ &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len)) /* * Arrange to prepend space of size plen to mbuf m. * If a new mbuf must be allocated, how specifies whether to wait. * If the allocation fails, the original mbuf chain is freed and m is * set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. 
* This is a relatively expensive operation and should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 /* Compatibility with 4.3. */ #define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) -/* - * pkthdr.aux type tags. - */ -struct mauxtag { - int af; - int type; - void *p; -}; - -/*- - * Some packet tags to identify different mbuf annotations. - * - * Eventually, these annotations will end up in an appropriate chain - * (struct m_tag or similar, e.g. as in NetBSD) properly managed by - * the mbuf handling routines. - * - * As a temporary and low impact solution to replace the even uglier - * approach used so far in some parts of the network stack (which relies - * on global variables), these annotations are stored in MT_TAG - * mbufs (or lookalikes) prepended to the actual mbuf chain. - * - * m_type = MT_TAG - * m_flags = m_tag_id - * m_next = next buffer in chain. - * - * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. - */ - -#define m_tag_id m_hdr.mh_flags - -/* Packet tag types -- first ones are from NetBSD */ - -#define PACKET_TAG_NONE 0 /* Nadda */ -#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ -#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ -#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ -#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ -#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ -#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ -#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ -#define PACKET_TAG_GIF 8 /* GIF processing done */ -#define PACKET_TAG_GRE 9 /* GRE processing done */ -#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ -#define PACKET_TAG_ENCAP 11 /* Encap. 
processing */ -#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ -#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ -#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ - -/* Packet tags used in the FreeBSD network stack */ -#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ -#define PACKET_TAG_IPFW 16 /* ipfw classification */ -#define PACKET_TAG_DIVERT 17 /* divert info */ -#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ - -#define PACKET_TAG_MAX 19 - extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern struct mbpstat mb_statpcpu[]; /* Per-CPU allocation stats */ extern struct mbstat mbstat; /* General mbuf stats/infos */ extern int nmbclusters; /* Maximum number of clusters */ extern int nmbcnt; /* Scale kmem_map for counter space */ extern int nmbufs; /* Maximum number of mbufs */ extern int nsfbufs; /* Number of sendfile(2) bufs */ void _mext_free(struct mbuf *); void m_adj(struct mbuf *, int); -struct mbuf *m_aux_add(struct mbuf *, int, int); -struct mbuf *m_aux_add2(struct mbuf *, int, int, void *); -void m_aux_delete(struct mbuf *, struct mbuf *); -struct mbuf *m_aux_find(struct mbuf *, int, int); -struct mbuf *m_aux_find2(struct mbuf *, int, int, void *); void m_cat(struct mbuf *, struct mbuf *); void m_chtype(struct mbuf *, short); void m_clget(struct mbuf *, int); void m_extadd(struct mbuf *, caddr_t, u_int, void (*)(void *, void *), void *, int, int); void m_copyback(struct mbuf *, int, int, caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_free(struct mbuf *); void m_freem(struct mbuf *); struct mbuf *m_get(int, short); struct mbuf *m_get_clrd(int, short); struct mbuf *m_getcl(int, short, int); struct mbuf *m_gethdr(int, short); struct mbuf *m_gethdr_clrd(int, short); struct mbuf *m_getm(struct mbuf *, int, int, short); u_int m_length(struct mbuf *, struct mbuf **); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); + +/* + * Packets may have annotations attached by affixing a list + * of "packet tags" to the pkthdr structure. Packet tags are + * dynamically allocated semi-opaque data structures that have + * a fixed header (struct m_tag) that specifies the size of the + * memory block and a (cookie, type) pair that identifies it. + * The cookie is a 32-bit unique unsigned value used to identify + * a module or ABI. By convention this value is chosen as the + * date+time that the module is created, expressed as the number of + * seconds since the epoch (e.g. using date -u +'%s'). The type value + * is an ABI/module-specific value that identifies a particular annotation + * and is private to the module. For compatibility with systems + * like openbsd that define packet tags w/o an ABI/module cookie, + * the value MTAG_ABI_COMPAT is used to implement m_tag_get and + * m_tag_find compatibility shim functions and several tag types are + * defined below. 
Users that do not require compatibility should use + a private cookie value so that packet tag-related definitions + can be maintained privately. + + Note that the packet tag returned by m_tag_alloc has the default + memory alignment implemented by malloc. To reference private data + one can use a construct like: + + struct m_tag *mtag = m_tag_alloc(...); + struct foo *p = (struct foo *)(mtag+1); + + if the alignment of struct m_tag is sufficient for referencing members + of struct foo. Otherwise it is necessary to embed struct m_tag within + the private data structure to ensure proper alignment; e.g. + + struct foo { + struct m_tag tag; + ... + }; + struct foo *p = (struct foo *) m_tag_alloc(...); + struct m_tag *mtag = &p->tag; + */ + +#define PACKET_TAG_NONE 0 /* Nadda */ + +/* Packet tag for use with MTAG_ABI_COMPAT */ +#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ +#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ +#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ +#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ +#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ +#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ +#define PACKET_TAG_GIF 8 /* GIF processing done */ +#define PACKET_TAG_GRE 9 /* GRE processing done */ +#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ +#define PACKET_TAG_ENCAP 11 /* Encap. processing */ +#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ +#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ +#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ + +/* + * As a temporary and low impact solution to replace the even uglier + * approach used so far in some parts of the network stack (which relies + * on global variables), packet tag-like annotations are stored in MT_TAG + * mbufs (or lookalikes) prepended to the actual mbuf chain. + * + * m_type = MT_TAG + * m_flags = m_tag_id + * m_next = next buffer in chain. + * + * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. 
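 *
 * (For illustration only, not part of this change: consumers of this
 * interim scheme peel the annotations off the front of the chain
 * before normal processing, along the lines of
 *
 *	while (m != NULL && m->m_type == MT_TAG) {
 *		examine the id in m_flags and the payload in m_data;
 *		m = m->m_next;
 *	}
 *
 * so that only the real packet ever reaches the mbuf routines.)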
+ */ +#define _m_tag_id m_hdr.mh_flags + +/* Packet tags used in the FreeBSD network stack */ +#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ +#define PACKET_TAG_IPFW 16 /* ipfw classification */ +#define PACKET_TAG_DIVERT 17 /* divert info */ +#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ + +/* Packet tag routines */ +struct m_tag *m_tag_alloc(u_int32_t, int, int, int); +void m_tag_free(struct m_tag *); +void m_tag_prepend(struct mbuf *, struct m_tag *); +void m_tag_unlink(struct mbuf *, struct m_tag *); +void m_tag_delete(struct mbuf *, struct m_tag *); +void m_tag_delete_chain(struct mbuf *, struct m_tag *); +struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); +struct m_tag *m_tag_copy(struct m_tag *); +int m_tag_copy_chain(struct mbuf *, struct mbuf *); +void m_tag_init(struct mbuf *); +struct m_tag *m_tag_first(struct mbuf *); +struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); + +/* these are for openbsd compatibility */ +#define MTAG_ABI_COMPAT 0 /* compatibility ABI */ + +static __inline struct m_tag * +m_tag_get(int type, int length, int wait) +{ + return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); +} + +static __inline struct m_tag * +m_tag_find(struct mbuf *m, int type, struct m_tag *start) +{ + return m_tag_locate(m, MTAG_ABI_COMPAT, type, start); +} #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */
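For illustration only, not part of this revision: a module-private packet tag built on the interface declared above might look like the sketch below, where MTAG_EXAMPLE, EXAMPLE_TYPE, struct example_tag and the two helper functions are invented names, and a malloc(9)-style wait flag is assumed for m_tag_alloc().

#define MTAG_EXAMPLE	1038441600	/* invented cookie: creation date, date -u +'%s' */
#define EXAMPLE_TYPE	1		/* invented module-private type */

struct example_tag {
	u_int32_t	seq;			/* module-private payload */
};

static int
example_tag_packet(struct mbuf *m, u_int32_t seq)
{
	struct m_tag *mtag;
	struct example_tag *et;

	/* header and payload are allocated together */
	mtag = m_tag_alloc(MTAG_EXAMPLE, EXAMPLE_TYPE,
	    sizeof(struct example_tag), M_NOWAIT);
	if (mtag == NULL)
		return (ENOBUFS);
	et = (struct example_tag *)(mtag + 1);	/* data follows the header */
	et->seq = seq;
	m_tag_prepend(m, mtag);			/* hook onto m->m_pkthdr.tags */
	return (0);
}

static struct example_tag *
example_tag_find(struct mbuf *m)
{
	struct m_tag *mtag;

	mtag = m_tag_locate(m, MTAG_EXAMPLE, EXAMPLE_TYPE, NULL);
	return (mtag != NULL ? (struct example_tag *)(mtag + 1) : NULL);
}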